From b04e30022ce13f4c8bb6f9142599f676c4846a6d Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Thu, 5 Dec 2024 22:41:12 +0530 Subject: [PATCH 01/10] Formatting and linting with ruff --- gyandex/cli/podgen.py | 24 +- gyandex/llms/factory.py | 9 +- gyandex/llms/factory_test.py | 18 +- gyandex/loaders/factory.py | 14 +- gyandex/loaders/factory_test.py | 19 +- gyandex/podgen/config/loader.py | 5 +- gyandex/podgen/config/loader_test.py | 19 +- gyandex/podgen/config/schema.py | 4 +- gyandex/podgen/engine/publisher.py | 23 +- gyandex/podgen/engine/publisher_test.py | 10 +- gyandex/podgen/feed/generator.py | 8 +- gyandex/podgen/feed/generator_test.py | 9 +- gyandex/podgen/feed/models.py | 17 +- gyandex/podgen/speech/google_cloud.py | 17 +- gyandex/podgen/speech/google_cloud_test.py | 14 +- gyandex/podgen/storage/factory.py | 1 + gyandex/podgen/storage/factory_test.py | 8 +- gyandex/podgen/storage/s3.py | 8 +- gyandex/podgen/storage/s3_test.py | 12 +- gyandex/podgen/workflows/alexandria.py | 59 +-- gyandex/podgen/workflows/types.py | 8 +- main.ipynb | 412 --------------------- poetry.lock | 29 +- publish.ipynb | 119 ------ pyproject.toml | 13 +- 25 files changed, 178 insertions(+), 701 deletions(-) delete mode 100644 main.ipynb delete mode 100644 publish.ipynb diff --git a/gyandex/cli/podgen.py b/gyandex/cli/podgen.py index aa207f7..a20b13e 100644 --- a/gyandex/cli/podgen.py +++ b/gyandex/cli/podgen.py @@ -6,7 +6,6 @@ from dotenv import load_dotenv from rich.console import Console -from gyandex.llms.factory import get_model from gyandex.loaders.factory import load_content from gyandex.podgen.engine.publisher import PodcastPublisher, PodcastMetadata from gyandex.podgen.feed.models import PodcastDB @@ -31,18 +30,18 @@ def main(): config = load_config(args.config_path) # Load the content - with console.status('[bold green] Loading content...[/bold green]'): + with console.status("[bold green] Loading content...[/bold green]"): document = load_content(config.content) - console.log('Content loaded...') + console.log("Content loaded...") # Analyze the content - with console.status('[bold green] Crafting the script...[/bold green]'): + with console.status("[bold green] Crafting the script...[/bold green]"): workflow = get_workflow(config) script = asyncio.run(workflow.generate_script(document)) console.log(f'Script completed for "{script.title}". 
Script contains {len(script.dialogues)} segments...') # Generate the podcast audio - with console.status('[bold green] Generating audio...[/bold green]'): + with console.status("[bold green] Generating audio...[/bold green]"): tts_engine = get_text_to_speech_engine(config.tts) audio_segments = [tts_engine.process_segment(dialogue) for dialogue in script.dialogues] @@ -52,17 +51,18 @@ def main(): podcast_path = f"{output_dir}/podcast_{hashlib.md5(config.content.source.encode()).hexdigest()}.mp3" tts_engine.generate_audio_file(audio_segments, podcast_path) - console.log(f'Podcast file {podcast_path} generated...') + console.log(f"Podcast file {podcast_path} generated...") - with console.status('[bold green] Publishing podcast...[/bold green]'): + with console.status("[bold green] Publishing podcast...[/bold green]"): storage = get_storage(config.storage) - db = PodcastDB(db_path='assets/podcasts.db') + db = PodcastDB(db_path="assets/podcasts.db") publisher = PodcastPublisher( storage=storage, db=db, - base_url=f"https://{storage.custom_domain}", # @FIXME: we need to fallback when custom domain is not available + # @FIXME: we need to fallback when custom domain is not available + base_url=f"https://{storage.custom_domain}", ) - feed_url = publisher.create_feed( + publisher.create_feed( slug=config.feed.slug, title=config.feed.title, email=config.feed.email, @@ -73,14 +73,14 @@ def main(): language=config.feed.language, categories=",".join(config.feed.categories), ) - console.log('Uploading episode...') + console.log("Uploading episode...") urls = publisher.add_episode( feed_slug=config.feed.slug, audio_file_path=podcast_path, metadata=PodcastMetadata( title=script.title, description=script.description, - ) + ), ) console.print(f"Feed published at {urls['feed_url']}") console.print(f"Episode published at {urls['episode_url']}") diff --git a/gyandex/llms/factory.py b/gyandex/llms/factory.py index 40bf559..f51004f 100644 --- a/gyandex/llms/factory.py +++ b/gyandex/llms/factory.py @@ -10,16 +10,16 @@ class LLMLoggingCallback(BaseCallbackHandler): def __init__(self, log_dir="assets"): - logger = logging.getLogger('llm_logger') + logger = logging.getLogger("llm_logger") logger.setLevel(logging.INFO) # Create file handler with timestamp in filename - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - fh = logging.FileHandler(f'{log_dir}/llm_logs_{timestamp}.log') + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + fh = logging.FileHandler(f"{log_dir}/llm_logs_{timestamp}.log") fh.setLevel(logging.INFO) # Create formatter - formatter = logging.Formatter('%(asctime)s - %(message)s') + formatter = logging.Formatter("%(asctime)s - %(message)s") fh.setFormatter(formatter) logger.addHandler(fh) @@ -38,6 +38,7 @@ def on_llm_end(self, response, **kwargs): def on_llm_error(self, error, **kwargs): self.logger.error(f"\n=== ERROR ===\n{str(error)}\n") + # @TODO: Centralize this argument type in a single place def get_model(config: Union[GoogleGenerativeAILLMConfig], log_dir="assets"): if config.provider == "google-generative-ai": diff --git a/gyandex/llms/factory_test.py b/gyandex/llms/factory_test.py index 2521a38..4e5ba2d 100644 --- a/gyandex/llms/factory_test.py +++ b/gyandex/llms/factory_test.py @@ -1,20 +1,16 @@ import pytest -from unittest.mock import Mock, patch -from datetime import datetime from langchain_google_genai import GoogleGenerativeAI from pydantic import ValidationError -from gyandex.llms.factory import get_model, LLMLoggingCallback +from gyandex.llms.factory import get_model from 
gyandex.podgen.config.schema import GoogleGenerativeAILLMConfig + def test_get_model_returns_google_generative_ai(): """Tests that get_model creates a GoogleGenerativeAI instance with correct config""" # Given config = GoogleGenerativeAILLMConfig( - provider="google-generative-ai", - model="gemini-pro", - temperature=0.7, - google_api_key="test-key" + provider="google-generative-ai", model="gemini-pro", temperature=0.7, google_api_key="test-key" ) # When @@ -25,13 +21,11 @@ def test_get_model_returns_google_generative_ai(): assert model.model == "gemini-pro" assert model.temperature == 0.7 + def test_get_model_raises_for_unsupported_provider(): """Tests that get_model raises NotImplementedError for unsupported providers""" # When/Then with pytest.raises(ValidationError): - config = GoogleGenerativeAILLMConfig( - provider="unsupported", - model="test", - temperature=0.5, - google_api_key="test-key" + _ = GoogleGenerativeAILLMConfig( + provider="unsupported", model="test", temperature=0.5, google_api_key="test-key" ) diff --git a/gyandex/loaders/factory.py b/gyandex/loaders/factory.py index 157292e..ccb32e3 100644 --- a/gyandex/loaders/factory.py +++ b/gyandex/loaders/factory.py @@ -20,11 +20,15 @@ def load_content(content_config: ContentConfig) -> Document: def fetch_url(url) -> Document: - headers = { "Accept": "application/json" } + headers = {"Accept": "application/json"} response = requests.get(f"https://r.jina.ai/{url}", headers=headers) # @TODO: Add error handling content = response.json() - return Document(title=content['data']['title'], content=content['data']['content'], metadata={ - 'url': content['data']['url'], - 'description': content['data']['description'], - }) + return Document( + title=content["data"]["title"], + content=content["data"]["content"], + metadata={ + "url": content["data"]["url"], + "description": content["data"]["description"], + }, + ) diff --git a/gyandex/loaders/factory_test.py b/gyandex/loaders/factory_test.py index 19d7010..0bea739 100644 --- a/gyandex/loaders/factory_test.py +++ b/gyandex/loaders/factory_test.py @@ -1,6 +1,3 @@ -import json - -import pytest import responses from gyandex.loaders.factory import fetch_url @@ -11,12 +8,7 @@ def test_fetch_url_returns_json_response(): # Given test_url = "test123" actual = {"data": {"title": "title", "content": "test content", "url": "url", "description": "description"}} - responses.add( - responses.GET, - f"https://r.jina.ai/{test_url}", - json=actual, - status=200 - ) + responses.add(responses.GET, f"https://r.jina.ai/{test_url}", json=actual, status=200) # When result = fetch_url(test_url) @@ -24,7 +16,7 @@ def test_fetch_url_returns_json_response(): # Then assert result.content == "test content" assert result.title == "title" - assert result.metadata == { "url": "url", "description": "description" } + assert result.metadata == {"url": "url", "description": "description"} @responses.activate @@ -32,12 +24,12 @@ def test_fetch_url_sends_correct_headers(): """Tests that fetch_url sends the correct Accept header""" # Given test_url = "test123" - expected_headers = {"Accept": "application/json"} + _ = {"Accept": "application/json"} responses.add( responses.GET, f"https://r.jina.ai/{test_url}", json={"data": {"title": "title", "content": "test content", "url": "url", "description": "description"}}, - status=200 + status=200, ) # When @@ -46,6 +38,7 @@ def test_fetch_url_sends_correct_headers(): # Then assert responses.calls[0].request.headers["Accept"] == "application/json" + @responses.activate def 
test_fetch_url_constructs_correct_url(): """Tests that fetch_url constructs the correct URL with the base and provided path""" @@ -56,7 +49,7 @@ def test_fetch_url_constructs_correct_url(): responses.GET, expected_url, json={"data": {"title": "title", "content": "test content", "url": "url", "description": "description"}}, - status=200 + status=200, ) # When diff --git a/gyandex/podgen/config/loader.py b/gyandex/podgen/config/loader.py index 8875818..720a947 100644 --- a/gyandex/podgen/config/loader.py +++ b/gyandex/podgen/config/loader.py @@ -3,12 +3,13 @@ import yaml from .schema import PodcastConfig + def resolve_env_vars(value: str) -> str: """Resolve ${ENV_VAR} patterns in string values""" if not isinstance(value, str): return value - pattern = r'\${([^}^{]+)}' + pattern = r"\${([^}^{]+)}" matches = re.finditer(pattern, value) for match in matches: @@ -20,6 +21,7 @@ def resolve_env_vars(value: str) -> str: return value + def resolve_nested_env_vars(data): """Recursively resolve environment variables in nested structures""" if isinstance(data, dict): @@ -29,6 +31,7 @@ def resolve_nested_env_vars(data): else: return resolve_env_vars(data) + def load_config(config_path: str) -> PodcastConfig: """Load and parse YAML config with environment variable support""" with open(config_path) as f: diff --git a/gyandex/podgen/config/loader_test.py b/gyandex/podgen/config/loader_test.py index 0c06b96..0f9fa39 100644 --- a/gyandex/podgen/config/loader_test.py +++ b/gyandex/podgen/config/loader_test.py @@ -3,6 +3,7 @@ from .loader import resolve_env_vars, resolve_nested_env_vars, load_config from .schema import PodcastConfig + def test_resolve_env_vars_replaces_single_variable(): """Test that resolve_env_vars replaces a single environment variable in a string""" # Given @@ -15,6 +16,7 @@ def test_resolve_env_vars_replaces_single_variable(): # Then assert result == "prefix_test_value_suffix" + def test_resolve_env_vars_handles_multiple_variables(): """Test that resolve_env_vars replaces multiple environment variables in a string""" # Given @@ -28,6 +30,7 @@ def test_resolve_env_vars_handles_multiple_variables(): # Then assert result == "first_middle_second" + def test_resolve_env_vars_raises_on_missing_variable(): """Test that resolve_env_vars raises ValueError when environment variable is not found""" # Given @@ -37,27 +40,19 @@ def test_resolve_env_vars_raises_on_missing_variable(): with pytest.raises(ValueError, match="Environment variable NONEXISTENT_VAR not found"): resolve_env_vars(input_string) + def test_resolve_nested_env_vars_handles_dict(): """Test that resolve_nested_env_vars resolves variables in nested dictionary""" # Given os.environ["NESTED_VAR"] = "value" - input_dict = { - "key1": "${NESTED_VAR}", - "key2": { - "nested_key": "${NESTED_VAR}" - } - } + input_dict = {"key1": "${NESTED_VAR}", "key2": {"nested_key": "${NESTED_VAR}"}} # When result = resolve_nested_env_vars(input_dict) # Then - assert result == { - "key1": "value", - "key2": { - "nested_key": "value" - } - } + assert result == {"key1": "value", "key2": {"nested_key": "value"}} + def test_load_config_parses_yaml_with_env_vars(tmp_path): """Test that load_config properly loads YAML and resolves environment variables""" diff --git a/gyandex/podgen/config/schema.py b/gyandex/podgen/config/schema.py index dc99d89..908f58e 100644 --- a/gyandex/podgen/config/schema.py +++ b/gyandex/podgen/config/schema.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Dict, List, Optional, Union, Literal +from typing import List, 
Optional, Union, Literal from pydantic import BaseModel, HttpUrl, Field @@ -46,7 +46,7 @@ class Participant(BaseModel): name: str voice: str gender: Gender - personality: Optional[str] = '' + personality: Optional[str] = "" language_code: Optional[str] = "en-US" diff --git a/gyandex/podgen/engine/publisher.py b/gyandex/podgen/engine/publisher.py index e90e710..e460b92 100644 --- a/gyandex/podgen/engine/publisher.py +++ b/gyandex/podgen/engine/publisher.py @@ -10,6 +10,7 @@ from ..storage.s3 import S3CompatibleStorage from ..feed.models import PodcastDB, Episode + # @TODO: Look at URL manipulation and how URLs are used between storage # and feeds. There is possibly some duplication here. @dataclass @@ -55,12 +56,8 @@ def _get_audio_metadata(self, file_path: str) -> Dict[str, Any]: metadata = {} if audio is not None: - metadata["duration"] = ( - int(audio.info.length) if hasattr(audio.info, "length") else None - ) - metadata["mime_type"] = ( - audio.mime[0] if hasattr(audio, "mime") and audio.mime else None - ) + metadata["duration"] = int(audio.info.length) if hasattr(audio.info, "length") else None + metadata["mime_type"] = audio.mime[0] if hasattr(audio, "mime") and audio.mime else None metadata["file_size"] = os.path.getsize(file_path) return metadata @@ -71,9 +68,7 @@ def _generate_guid(self, feed_slug: str, file_path: str) -> str: file_hash = hashlib.md5(f.read()).hexdigest() return f"{feed_slug}-{file_hash}" - def add_episode( - self, feed_slug: str, audio_file_path: str, metadata: PodcastMetadata - ) -> Dict[str, str]: + def add_episode(self, feed_slug: str, audio_file_path: str, metadata: PodcastMetadata) -> Dict[str, str]: """ Add a new episode to a feed. @@ -110,7 +105,7 @@ def add_episode( ) # Add episode to database - episode = self.db.add_episode( + _ = self.db.add_episode( feed_slug=feed_slug, title=metadata.title, description=metadata.description, @@ -143,9 +138,7 @@ def _save_temp_feed(self, feed_content: str) -> str: f.write(feed_content) return temp_path - def create_feed( - self, slug: str, title: str, description: str, author: str, email: str, **kwargs - ) -> str: + def create_feed(self, slug: str, title: str, description: str, author: str, email: str, **kwargs) -> str: """ Create a new podcast feed. 
@@ -189,8 +182,6 @@ def get_feed_url(self, feed_slug: str) -> str: """Get the URL for a feed.""" return urljoin(self.base_url, f"{self.feed_prefix}/{feed_slug}.xml") - def list_episodes( - self, feed_slug: str, limit: Optional[int] = None - ) -> list[Type[Episode]]: + def list_episodes(self, feed_slug: str, limit: Optional[int] = None) -> list[Type[Episode]]: """List episodes in a feed.""" return self.db.get_episodes(feed_slug, limit) diff --git a/gyandex/podgen/engine/publisher_test.py b/gyandex/podgen/engine/publisher_test.py index 3b54311..a9906c1 100644 --- a/gyandex/podgen/engine/publisher_test.py +++ b/gyandex/podgen/engine/publisher_test.py @@ -1,9 +1,6 @@ import pytest -from datetime import datetime from unittest.mock import Mock, patch -import os from .publisher import PodcastPublisher, PodcastMetadata -from ..feed.models_test import test_db # @todo: move to common fixtures from ..storage.s3 import S3CompatibleStorage @@ -85,7 +82,8 @@ def test_add_episode(orchestrator, mock_storage, sample_audio, mock_mutagen): mock_storage.upload_file.reset_mock() metadata = PodcastMetadata( - title="Test Episode", description="Test Episode Description", + title="Test Episode", + description="Test Episode Description", ) mock_storage.upload_file.side_effect = [ @@ -109,9 +107,7 @@ def test_add_episode_to_nonexistent_feed(orchestrator, sample_audio): Then: ValueError should be raised """ # Given - metadata = PodcastMetadata( - title="Test Episode", description="Test Episode Description" - ) + metadata = PodcastMetadata(title="Test Episode", description="Test Episode Description") # When/Then with pytest.raises(ValueError): diff --git a/gyandex/podgen/feed/generator.py b/gyandex/podgen/feed/generator.py index 75e8837..fc7cc85 100644 --- a/gyandex/podgen/feed/generator.py +++ b/gyandex/podgen/feed/generator.py @@ -38,9 +38,7 @@ def generate_feed(self, slug: str) -> str: fg.image(feed_data.image_url) # iTunes specific tags - fg.podcast.itunes_category( - feed_data.categories.split(",")[0] if feed_data.categories else "Technology" - ) + fg.podcast.itunes_category(feed_data.categories.split(",")[0] if feed_data.categories else "Technology") fg.podcast.itunes_explicit(feed_data.explicit) fg.podcast.itunes_author(feed_data.author) fg.podcast.itunes_owner(name=feed_data.author, email=feed_data.email) @@ -61,9 +59,7 @@ def generate_feed(self, slug: str) -> str: fe.enclosure(episode.audio_url, str(episode.file_size), episode.mime_type) # iTunes specific episode tags - fe.podcast.itunes_duration( - str(episode.duration) if episode.duration else "0" - ) + fe.podcast.itunes_duration(str(episode.duration) if episode.duration else "0") fe.podcast.itunes_explicit(episode.explicit) if episode.image_url: fe.podcast.itunes_image(episode.image_url) diff --git a/gyandex/podgen/feed/generator_test.py b/gyandex/podgen/feed/generator_test.py index a4adb97..7878195 100644 --- a/gyandex/podgen/feed/generator_test.py +++ b/gyandex/podgen/feed/generator_test.py @@ -2,11 +2,6 @@ import xml.etree.ElementTree as ET from .generator import PodcastFeedGenerator -from .models_test import ( - test_db, - sample_feed_data, - sample_episode_data, -) # @todo: move to common fixtures # Feed Generator Tests @@ -19,7 +14,7 @@ def test_generate_feed_xml(test_db, sample_feed_data, sample_episode_data): """ # Given feed = test_db.create_feed(**sample_feed_data) - episode = test_db.add_episode(feed.slug, **sample_episode_data) + _ = test_db.add_episode(feed.slug, **sample_episode_data) # When generator = PodcastFeedGenerator(test_db) @@ -59,7 
+54,7 @@ def test_feed_episode_enclosure(test_db, sample_feed_data, sample_episode_data): """ # Given feed = test_db.create_feed(**sample_feed_data) - episode = test_db.add_episode(feed.slug, **sample_episode_data) + _ = test_db.add_episode(feed.slug, **sample_episode_data) # When generator = PodcastFeedGenerator(test_db) diff --git a/gyandex/podgen/feed/models.py b/gyandex/podgen/feed/models.py index 3c8a13c..72afdf9 100644 --- a/gyandex/podgen/feed/models.py +++ b/gyandex/podgen/feed/models.py @@ -33,9 +33,7 @@ class Feed(Base): updated_at = Column(DateTime, onupdate=func.now()) # Relationship - episodes = relationship( - "Episode", back_populates="feed", cascade="all, delete-orphan" - ) + episodes = relationship("Episode", back_populates="feed", cascade="all, delete-orphan") def __repr__(self): return f"" @@ -46,18 +44,15 @@ def get_latest_episode(self, session) -> Tuple[int, int]: """ # Query the database to get the maximum episode number for the feed max_episode_number = ( - session.query(func.max(Episode.episode_number)) - .filter(Episode.feed_id == self.id) - .scalar() + session.query(func.max(Episode.episode_number)).filter(Episode.feed_id == self.id).scalar() ) or 0 max_season_number = ( - session.query(func.max(Episode.season_number)) - .filter(Episode.feed_id == self.id) - .scalar() + session.query(func.max(Episode.season_number)).filter(Episode.feed_id == self.id).scalar() ) or 1 return max_season_number, max_episode_number + class Episode(Base): __tablename__ = "episodes" @@ -103,9 +98,7 @@ def get_feed(self, slug: str) -> Optional[Feed]: with self.session() as session: return session.query(Feed).filter(Feed.slug == slug).first() - def add_episode( - self, feed_slug: str, title: str, audio_url: str, guid: str, **kwargs - ) -> Episode: + def add_episode(self, feed_slug: str, title: str, audio_url: str, guid: str, **kwargs) -> Episode: with self.session() as session: feed = session.query(Feed).filter(Feed.slug == feed_slug).first() if not feed: diff --git a/gyandex/podgen/speech/google_cloud.py b/gyandex/podgen/speech/google_cloud.py index 03e5ac3..c35930f 100644 --- a/gyandex/podgen/speech/google_cloud.py +++ b/gyandex/podgen/speech/google_cloud.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import List, Optional, Dict, Any, Union +from typing import List, Optional, Dict, Any from google.cloud import texttospeech from pydub import AudioSegment @@ -13,8 +13,7 @@ def __init__(self, participants: List[Participant]): self.client = texttospeech.TextToSpeechClient() self.voices = self.generate_voice_profile(participants) self.audio_config = texttospeech.AudioConfig( - audio_encoding=texttospeech.AudioEncoding.MP3, - effects_profile_id=['headphone-class-device'] + audio_encoding=texttospeech.AudioEncoding.MP3, effects_profile_id=["headphone-class-device"] ) def generate_voice_profile(self, participants: List[Participant]) -> Dict[str, Any]: @@ -40,13 +39,13 @@ def process_segment(self, segment: ScriptSegment) -> bytes: def synthesize_speech(self, text: str, speaker: str) -> bytes: synthesis_input = texttospeech.SynthesisInput(text=text) response = self.client.synthesize_speech( - input=synthesis_input, - voice=self.voices[speaker], - audio_config=self.audio_config + input=synthesis_input, voice=self.voices[speaker], audio_config=self.audio_config ) return response.audio_content - def generate_audio_file(self, audio_segments: List[bytes], podcast_path: str, options: Optional[Dict[str, Any]] = None): + def generate_audio_file( + self, audio_segments: List[bytes], podcast_path: 
str, options: Optional[Dict[str, Any]] = None + ): if options is None: # @TODO: Fix this code-smell options = { @@ -58,10 +57,10 @@ def generate_audio_file(self, audio_segments: List[bytes], podcast_path: str, op for segment in audio_segments: segment_audio = AudioSegment.from_mp3(BytesIO(segment)) if previous_segment: - combined = combined.append(segment_audio, crossfade=options['crossfade']) + combined = combined.append(segment_audio, crossfade=options["crossfade"]) else: combined += segment_audio previous_segment = segment # Save final podcast - combined.export(podcast_path, format="mp3") \ No newline at end of file + combined.export(podcast_path, format="mp3") diff --git a/gyandex/podgen/speech/google_cloud_test.py b/gyandex/podgen/speech/google_cloud_test.py index be422b3..fd229a9 100644 --- a/gyandex/podgen/speech/google_cloud_test.py +++ b/gyandex/podgen/speech/google_cloud_test.py @@ -1,22 +1,21 @@ -import pytest from unittest.mock import Mock, patch -from io import BytesIO -from pydub import AudioSegment from google.cloud import texttospeech from gyandex.podgen.processors.tts import GoogleTTSEngine from gyandex.podgen.engine.workflows import ScriptSegment + def test_tts_engine_initialization(): """Tests that TTSEngine initializes with correct voice configurations""" # Given/When engine = GoogleTTSEngine() # Then - assert 'HOST1' in engine.voices - assert 'HOST2' in engine.voices + assert "HOST1" in engine.voices + assert "HOST2" in engine.voices assert isinstance(engine.client, texttospeech.TextToSpeechClient) -@patch('google.cloud.texttospeech.TextToSpeechClient') + +@patch("google.cloud.texttospeech.TextToSpeechClient") def test_synthesize_speech_for_host1(mock_client): """Tests speech synthesis for HOST1 voice""" # Given @@ -32,7 +31,8 @@ def test_synthesize_speech_for_host1(mock_client): assert result == b"test_audio_content" mock_client.return_value.synthesize_speech.assert_called_once() -@patch('google.cloud.texttospeech.TextToSpeechClient') + +@patch("google.cloud.texttospeech.TextToSpeechClient") def test_process_segment(mock_client): """Tests processing of a complete podcast segment""" # Given diff --git a/gyandex/podgen/storage/factory.py b/gyandex/podgen/storage/factory.py index a7e3af9..1bf9408 100644 --- a/gyandex/podgen/storage/factory.py +++ b/gyandex/podgen/storage/factory.py @@ -3,6 +3,7 @@ from gyandex.podgen.config.schema import S3StorageConfig from gyandex.podgen.storage.s3 import S3CompatibleStorage + # @TODO: Centralize this type and move this to a common place def get_storage(config: Union[S3StorageConfig]) -> S3CompatibleStorage: if config.provider != "s3": # @TODO: Move this to a enum diff --git a/gyandex/podgen/storage/factory_test.py b/gyandex/podgen/storage/factory_test.py index e1985b8..b9074c5 100644 --- a/gyandex/podgen/storage/factory_test.py +++ b/gyandex/podgen/storage/factory_test.py @@ -5,6 +5,7 @@ from gyandex.podgen.storage.factory import get_storage from gyandex.podgen.storage.s3 import S3CompatibleStorage + def test_get_storage_returns_s3_storage(): """Tests that get_storage creates an S3CompatibleStorage instance with correct config""" # Given @@ -15,7 +16,7 @@ def test_get_storage_returns_s3_storage(): secret_key="test-secret-key", region="us-east-1", endpoint="https://test-endpoint", - custom_domain="cdn.example.com" + custom_domain="cdn.example.com", ) # When @@ -26,16 +27,17 @@ def test_get_storage_returns_s3_storage(): assert storage.bucket == "test-bucket" assert storage.custom_domain == "cdn.example.com" + def 
test_get_storage_raises_for_unsupported_provider(): """Tests that get_storage raises NotImplementedError for unsupported providers""" # When/Then with pytest.raises(ValidationError): - config = S3StorageConfig( + _ = S3StorageConfig( provider="unsupported", bucket="test-bucket", access_key="test-access-key", secret_key="test-secret-key", region="us-east-1", endpoint="https://test-endpoint", - custom_domain="cdn.example.com" + custom_domain="cdn.example.com", ) diff --git a/gyandex/podgen/storage/s3.py b/gyandex/podgen/storage/s3.py index eab43b2..df4d2b1 100644 --- a/gyandex/podgen/storage/s3.py +++ b/gyandex/podgen/storage/s3.py @@ -38,9 +38,7 @@ def __init__( self.acl = acl # Configure the S3 client with a generous timeout - config = Config( - connect_timeout=10, read_timeout=30, retries={"max_attempts": 3} - ) + config = Config(connect_timeout=10, read_timeout=30, retries={"max_attempts": 3}) self.client = boto3.client( "s3", @@ -80,9 +78,7 @@ def upload_file( if metadata: extra_args["Metadata"] = metadata - self.client.upload_file( - file_path, self.bucket, destination_path, ExtraArgs=extra_args - ) + self.client.upload_file(file_path, self.bucket, destination_path, ExtraArgs=extra_args) return self.get_public_url(destination_path) diff --git a/gyandex/podgen/storage/s3_test.py b/gyandex/podgen/storage/s3_test.py index ba5cc38..37b07ba 100644 --- a/gyandex/podgen/storage/s3_test.py +++ b/gyandex/podgen/storage/s3_test.py @@ -56,9 +56,7 @@ def r2_storage(mock_s3_client): def test_initialization(mock_s3_storage): """Test storage initialization with different configurations""" # Test AWS S3 initialization - _ = S3CompatibleStorage( - bucket="test-bucket", access_key_id="test-key", secret_access_key="test-secret" - ) + _ = S3CompatibleStorage(bucket="test-bucket", access_key_id="test-key", secret_access_key="test-secret") mock_s3_storage.assert_called_once_with( "s3", @@ -115,9 +113,7 @@ def test_download_file(storage, mock_s3_client, tmp_path): storage.download_file("episodes/test.mp3", str(download_path)) - mock_s3_client.download_file.assert_called_with( - "test-bucket", "episodes/test.mp3", str(download_path) - ) + mock_s3_client.download_file.assert_called_with("test-bucket", "episodes/test.mp3", str(download_path)) def test_get_public_url_aws(storage): @@ -178,9 +174,7 @@ def test_delete_file(storage, mock_s3_client): """Test file deletion functionality""" storage.delete_file("episodes/test.mp3") - mock_s3_client.delete_object.assert_called_with( - Bucket="test-bucket", Key="episodes/test.mp3" - ) + mock_s3_client.delete_object.assert_called_with(Bucket="test-bucket", Key="episodes/test.mp3") def test_upload_file_content_type_guessing(storage, mock_s3_client, tmp_path): diff --git a/gyandex/podgen/workflows/alexandria.py b/gyandex/podgen/workflows/alexandria.py index 36ee5a4..5720089 100644 --- a/gyandex/podgen/workflows/alexandria.py +++ b/gyandex/podgen/workflows/alexandria.py @@ -9,6 +9,7 @@ from ..config.schema import PodcastConfig, GoogleGenerativeAILLMConfig, Participant from ...loaders.factory import Document + class OutlineGenerator: def __init__(self, config: Union[GoogleGenerativeAILLMConfig]): self.model = get_model(config) @@ -20,7 +21,8 @@ def __init__(self, config: Union[GoogleGenerativeAILLMConfig]): Create a focused podcast outline based on the content Rules: - 1. Target podcast duration and number of segments should be proportional to the content length; it should not be more than reading the content directly + 1. 
Target podcast duration and number of segments should be proportional to the content length;
+        it should not be more than reading the content directly
         2. Each segment must focus on a UNIQUE aspect with NO overlap
         3. Keep segments concise and focused on actual content from the source
         4. Don't add speculative content or expand beyond the source material
@@ -37,15 +39,16 @@ def __init__(self, config: Union[GoogleGenerativeAILLMConfig]):
         Make sure each segment has a clear transition to the next topic.
         """,
             input_variables=["content"],
-            partial_variables={"format_instructions": self.parser.get_format_instructions()}
+            partial_variables={"format_instructions": self.parser.get_format_instructions()},
         )
 
     def generate_outline(self, document: Document) -> PodcastOutline:
         """Generate structured podcast outline from content summary"""
         chain = self.outline_prompt | self.model | self.parser
-        response = chain.invoke({ "content": document.content, "title": document.title })
+        response = chain.invoke({"content": document.content, "title": document.title})
         return response
 
+
 class ScriptGenerator:
     def __init__(self, config: Union[GoogleGenerativeAILLMConfig], participants: List[Participant]):
         self.model = get_model(config)
@@ -59,8 +62,10 @@ def __init__(self, config: Union[GoogleGenerativeAILLMConfig], participants: Lis
                 "host_profiles": "\n".join([self.create_host_profile(participant) for participant in participants]),
             },
             template="""
-            You are the a world-class podcast writer, you have worked as a ghost writer for Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferris.
-            We are in an alternate universe where actually you have been writing every line they say and they just stream it into their brains.
+            You are a world-class podcast writer, you have worked as a ghost writer for Joe Rogan,
+            Lex Fridman, Ben Shapiro, Tim Ferris.
+            We are in an alternate universe where actually you have been writing every line they say and
+            they just stream it into their brains.
             You have won multiple podcast awards for your writing.
 
             IMPORTANT: You are generating dialogue for the {position}
@@ -80,24 +85,29 @@ def __init__(self, config: Union[GoogleGenerativeAILLMConfig], participants: Lis
 
             DIALOGUE GENERATION RULES:
             1. Create natural dialogue with occasional fillers (um, uh, you know)
-            2. Keep the dialogue flowing as one continuous conversation. Keep it extremely engaging, the speakers can get derailed now and then but should discuss the topic.
-            3. If this is middle segment: let the conversation flow naturally into the next topic without announcing transitions or welcoming statements
-            4. End segment dialogues by building on the current point and naturally introducing elements of the next topic, except if it is the closing segment
+            2. Keep the dialogue flowing as one continuous conversation.
+            Keep it extremely engaging, the speakers can get derailed now and then but should discuss the topic.
+            3. If this is a middle segment: let the conversation flow naturally into the next topic without
+            announcing transitions or welcoming statements
+            4. End segment dialogues by building on the current point and naturally introducing elements of the
+            next topic, except if it is the closing segment
 
             REQUIREMENTS:
             1. WELCOME/INTRO PHRASES ONLY IN THE OPENING SEGMENT.
            2. NO CLOSING/GOODBYE PHRASES UNLESS THIS IS THE CLOSING SEGMENT.
             3. ONLY transition to the next segment at the end of the opening and middle segments
-            4. Generate text without special formatting, so that a TTS can vocalize it. That means no asterisks or hyphens.
+            4. 
Generate text without special formatting, so that a TTS can vocalize it. + That means no asterisks or hyphens. TRANSITION STYLE GUIDE: - Avoid phrases like "segues into" or "next topic" - Connect topics through shared themes or related ideas - - Use natural conversational bridges like "That reminds me of..." or "You know what's interesting about that..." + - Use natural conversational bridges like "That reminds me of..." or + "You know what's interesting about that..." - Let one host's insight naturally lead to the next area of discussion {format_instructions} - """ + """, ) self.chain = self.segment_prompt | self.model | self.parser @@ -105,22 +115,22 @@ def __init__(self, config: Union[GoogleGenerativeAILLMConfig], participants: Lis def create_host_profile(self, participant: Participant): return f"HOST ({participant.name})[{participant.gender}]: {participant.personality}" - async def generate_segment_script(self, - segment: OutlineSegment, - source_content: str, is_first=False, - is_last=False, - transition="") -> ScriptSegment: + async def generate_segment_script( + self, segment: OutlineSegment, source_content: str, is_first=False, is_last=False, transition="" + ) -> ScriptSegment: """Generate script for a single segment""" position = "opening segment" if is_first else "closing segment" if is_last else "middle segment" transition = transition if not is_last else "" - result = await self.chain.ainvoke({ - "segment_name": segment.name, - "talking_points": segment.talking_points, - "duration": segment.duration, - "source_content": source_content, - "position": position, - "transition": transition, - }) + result = await self.chain.ainvoke( + { + "segment_name": segment.name, + "talking_points": segment.talking_points, + "duration": segment.duration, + "source_content": source_content, + "position": position, + "transition": transition, + } + ) return result async def generate_full_script(self, outline: PodcastOutline, document_content: str) -> List[ScriptSegment]: @@ -145,6 +155,7 @@ async def generate_full_script(self, outline: PodcastOutline, document_content: return await asyncio.gather(*tasks) + class AlexandriaWorkflow: config: PodcastConfig diff --git a/gyandex/podgen/workflows/types.py b/gyandex/podgen/workflows/types.py index 83a52f8..4430198 100644 --- a/gyandex/podgen/workflows/types.py +++ b/gyandex/podgen/workflows/types.py @@ -7,7 +7,10 @@ class OutlineSegment(BaseModel): name: str = Field(description="Name of the podcast segment") duration: int = Field(description="Duration of segment in minutes") talking_points: List[str] = Field(description="Key points to cover in this segment") - transition: str = Field(description="Transition text to the next segment. Use 'Closing remarks' if there is no transition", default="") + transition: str = Field( + description="Transition text to the next segment. 
Use 'Closing remarks' if there is no transition", default="" + ) + class PodcastOutline(BaseModel): title: str = Field(description="Title of the podcast episode") @@ -15,15 +18,18 @@ class PodcastOutline(BaseModel): total_duration: int = Field(description="Total podcast duration in minutes") segments: List[OutlineSegment] = Field(description="List of podcast segments") + class DialogueLine(BaseModel): speaker: str text: str + class ScriptSegment(BaseModel): name: str duration: int = Field(description="Duration of the script in minutes") dialogue: List[DialogueLine] + class PodcastEpisode(BaseModel): title: str description: str diff --git a/main.ipynb b/main.ipynb deleted file mode 100644 index 6681623..0000000 --- a/main.ipynb +++ /dev/null @@ -1,412 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from io import BytesIO\n", - "\n", - "from langchain.prompts import ChatPromptTemplate\n", - "from langchain.schema import HumanMessage, SystemMessage\n", - "from langchain.output_parsers import PydanticOutputParser\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_google_genai import GoogleGenerativeAI\n", - "from pydantic import BaseModel, Field\n", - "from typing import List, Dict\n", - "import requests\n", - "import dotenv\n", - "import os\n", - "\n", - "from pydub import AudioSegment\n", - "\n", - "dotenv.load_dotenv()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "api_key = os.getenv('GOOGLE_API_KEY')\n", - "urls = [\n", - " # \"https://kellblog.com/2024/10/12/design-your-organization-for-the-conflicts-you-want-to-hear-about/\",\n", - " # \"https://peterszasz.com/engineering-managers-guide-to-effective-annual-feedback/\",\n", - " \"https://dennisnerush.medium.com/my-top-10-favorite-leadership-and-management-books-87178902826e\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "# Implement a memoizing decorator that can be applied to a function\n", - "# to cache the results of the function calls.\n", - "def memoize(func):\n", - " cache = {}\n", - " def wrapper(*args, **kwargs):\n", - " key = str(args) + str(kwargs)\n", - " if key not in cache:\n", - " cache[key] = func(*args, **kwargs)\n", - " return cache[key]\n", - " return wrapper" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "@memoize\n", - "def fetch_url(url):\n", - " headers = { \"Accept\": \"application/json\" }\n", - " response = requests.get(f\"https://r.jina.ai/{url}\", headers=headers)\n", - " return response.json()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "content = [fetch_url(url) for url in urls]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": {}, - "outputs": [], - "source": [ - "content_analysis_prompt = ChatPromptTemplate.from_messages([\n", - " (\n", - " \"system\", \n", - " \"\"\"You are an expert content strategist specializing in creating engaging educational content.\n", - " Your strength lies in breaking down complex topics into clear, relatable concepts while maintaining intellectual depth.\n", - " \n", - " Approach the analysis with:\n", - " 1. 
Systems thinking - identify interconnections and patterns\n", - " 2. Multi-level abstraction - from high-level principles to practical implementation\n", - " 3. Engaging storytelling - find hooks and analogies that make concepts stick\n", - " 4. Dialectical thinking - explore tensions and competing viewpoints\n", - " \n", - " Structure your analysis in this exact format:\n", - " \n", - " === CONCEPTS ===\n", - " [Each concept includes 3 depth levels marked with -]\n", - " ### [Concept Name]\n", - " - Strategic: [High level insight]\n", - " - Tactical: [Mid level approach] \n", - " - Practice: [Concrete examples]\n", - " \n", - " === HOOKS ===\n", - " [Each hook includes story + debate]\n", - " ### [Topic]\n", - " Story: [Engaging narrative]\n", - " Debate: [Key discussion points]\n", - " \n", - " === SEGMENTS ===\n", - " [List of main segments, one per line]\"\"\",\n", - " ),\n", - " (\n", - " \"human\", \n", - " \"\"\"Analyze these articles through multiple lenses to create rich podcast material:\n", - " \n", - " {article_contents}\n", - " \n", - " Create a layered analysis that:\n", - " 1. Breaks down complex ideas through progressive levels of detail\n", - " 2. Identifies natural conversation flows and engaging discussion points\n", - " 3. Maps out competing viewpoints and their nuances\n", - " 4. Groups related concepts into potential podcast segments\n", - " \"\"\",\n", - " ),\n", - "])\n", - "\n", - "model = GoogleGenerativeAI(model=\"gemini-1.5-pro\", google_api_key=api_key)\n", - "\n", - "content_analysis_chain = content_analysis_prompt | model | StrOutputParser()\n", - "\n", - "# Usage\n", - "article_contents = \"\\n\\n\".join([x['data']['content'] for x in content])\n", - "result = content_analysis_chain.invoke({\n", - " \"article_contents\": article_contents,\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "print(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "class PodcastSegment(BaseModel):\n", - " speaker: str = Field(description=\"HOST1 or HOST2\")\n", - " tone: str = Field(description=\"EXCITED|CALM|SERIOUS|THOUGHTFUL\")\n", - " text: str = Field(description=\"Raw text content\")\n", - " pace: str = Field(description=\"FAST|MEDIUM|SLOW\")\n", - " emphasis_words: List[str] = Field(description=\"Words to emphasize\")\n", - " pause_after: int = Field(description=\"Pause duration in ms\")\n", - "\n", - "class PodcastScript(BaseModel):\n", - " title: str\n", - " segments: List[PodcastSegment]\n", - "\n", - "podcast_script_parser = PydanticOutputParser(pydantic_object=PodcastScript)\n", - "podcast_script_prompt = ChatPromptTemplate.from_messages([\n", - " (\n", - " \"system\", \n", - " \"\"\"\n", - " You are an expert podcast host duo creating deep-dive episodes. Structure your conversation to:\n", - " \n", - " 1. Start with a hook that captures attention\n", - " 2. Layer concepts from surface to core insights\n", - " 3. Use the Feynman technique to break down complex ideas\n", - " 4. Challenge assumptions and explore counterpoints\n", - " 5. Share concrete examples and case studies\n", - " 6. Connect ideas across different contexts\n", - " 7. End with actionable takeaways\n", - " \n", - " You are an expert podcast host duo creating full-length episodes. Generate a complete 5-30 minute episode with:\n", - "\n", - " 1. 
Opening [2-3 segments]\n", - " - Hook and episode preview\n", - " - Quick host banter\n", - " - Topic introduction\n", - " \n", - " 2. Main Discussion [5-20 segments]\n", - " - Layer 1: Surface overview\n", - " - Layer 2: Core concepts unpacked\n", - " - Layer 3: Deep analysis\n", - " - Layer 4: Implementation details\n", - " - Regular transitions between hosts\n", - " - Examples and case studies\n", - " - Counterpoints and debates\n", - " \n", - " 3. Closing [3-4 segments]\n", - " - Key takeaways\n", - " - Action items\n", - " \n", - " Each segment should be 1-2 minutes of spoken content.\n", - " Create a natural flow between segments:\n", - " - Build on previous points\n", - " - Ask probing questions\n", - " - Share relevant examples\n", - " - Challenge and debate ideas\n", - " - Synthesize insights\n", - " \n", - " Format each segment as:\n", - " {format_instructions}\n", - " \"\"\",\n", - " ),\n", - " (\n", - " \"human\", \n", - " \"\"\"Generate a podcast script using:\n", - " # Analysis result\n", - " {analysis_result}\n", - " \n", - " # Original content\n", - " {article_contents}\n", - " \"\"\",\n", - " ),\n", - "])\n", - "\n", - "script_chain = (\n", - " podcast_script_prompt.partial(format_instructions=podcast_script_parser.get_format_instructions())\n", - " | model\n", - " | podcast_script_parser\n", - ")\n", - "script = script_chain.invoke({ \"analysis_result\": result, \"article_contents\": article_contents })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": {}, - "outputs": [], - "source": [ - "print(len(script.segments))\n", - "print(script)" - ] - }, - { - "cell_type": "markdown", - "id": "9", - "metadata": {}, - "source": [ - "# TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import Audio\n", - "from google.cloud import texttospeech" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "class TTSEngine:\n", - " def __init__(self):\n", - " self.client = texttospeech.TextToSpeechClient()\n", - " self.voices = {\n", - " 'HOST1': texttospeech.VoiceSelectionParams(\n", - " language_code='en-US',\n", - " name='en-US-Neural2-I',\n", - " ssml_gender=texttospeech.SsmlVoiceGender.MALE\n", - " ),\n", - " 'HOST2': texttospeech.VoiceSelectionParams(\n", - " language_code='en-US',\n", - " name='en-US-Neural2-F',\n", - " ssml_gender=texttospeech.SsmlVoiceGender.FEMALE\n", - " )\n", - " }\n", - " self.audio_config = texttospeech.AudioConfig(\n", - " audio_encoding=texttospeech.AudioEncoding.MP3,\n", - " effects_profile_id=['headphone-class-device']\n", - " )\n", - "\n", - " def process_segment(self, segment: PodcastSegment) -> bytes:\n", - " ssml = self.generate_ssml(segment)\n", - " return self.synthesize_speech(ssml, segment.speaker)\n", - "\n", - " def get_pace(self, pace: str) -> str:\n", - " pace_values = {\n", - " \"FAST\": \"120%\",\n", - " \"MEDIUM\": \"100%\",\n", - " \"SLOW\": \"85%\",\n", - " \"VERY_SLOW\": \"75%\",\n", - " \"VERY_FAST\": \"140%\"\n", - " }\n", - " return pace_values.get(pace, \"100%\")\n", - "\n", - " def get_tone(self, tone: str) -> str:\n", - " tone_values = {\n", - " \"EXCITED\": \"+4st\",\n", - " \"CALM\": \"-1st\",\n", - " \"SERIOUS\": \"-2st\",\n", - " \"THOUGHTFUL\": \"+0st\",\n", - " \"WORRIED\": \"-3st\",\n", - " \"INTENSE\": \"+2st\",\n", - " \"ENTHUSIASTIC\": \"+3st\",\n", - " \"SKEPTICAL\": \"-1.5st\",\n", - " \"CURIOUS\": 
\"+1st\",\n", - " \"AMUSED\": \"+2.5st\"\n", - " }\n", - " return tone_values.get(tone, \"+0st\")\n", - "\n", - " def generate_ssml(self, segment: PodcastSegment) -> str:\n", - " text = segment.text\n", - " for word in segment.emphasis_words:\n", - " text = text.replace(word, f'{word}')\n", - "\n", - " ssml = f'{text}'\n", - " ssml += f''\n", - " return ssml\n", - "\n", - " def synthesize_speech(self, ssml: str, speaker: str) -> bytes:\n", - " synthesis_input = texttospeech.SynthesisInput(ssml=ssml)\n", - " response = self.client.synthesize_speech(\n", - " input=synthesis_input,\n", - " voice=self.voices[speaker],\n", - " audio_config=self.audio_config\n", - " )\n", - " return response.audio_content\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12", - "metadata": {}, - "outputs": [], - "source": [ - "tts_engine = TTSEngine()\n", - "audio_segments = [tts_engine.process_segment(segment) for segment in script.segments]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "# Create output directory\n", - "output_dir = \"generated_podcasts\"\n", - "os.makedirs(output_dir, exist_ok=True)\n", - "\n", - "# Generate timestamp for unique filename\n", - "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", - "podcast_path = f\"{output_dir}/podcast_{timestamp}.mp3\"\n", - "\n", - "# Combine segments directly\n", - "combined = AudioSegment.empty()\n", - "for segment in audio_segments:\n", - " segment_audio = AudioSegment.from_mp3(BytesIO(segment))\n", - " combined += segment_audio\n", - "\n", - "# Save final podcast\n", - "combined.export(podcast_path, format=\"mp3\")\n", - "\n", - "# Play in notebook\n", - "display(Audio(podcast_path, autoplay=False))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/poetry.lock b/poetry.lock index 2ff1928..5ac5676 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4142,6 +4142,33 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "ruff" +version = "0.8.2" +description = "An extremely fast Python linter and code formatter, written in Rust." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.8.2-py3-none-linux_armv6l.whl", hash = "sha256:c49ab4da37e7c457105aadfd2725e24305ff9bc908487a9bf8d548c6dad8bb3d"}, + {file = "ruff-0.8.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ec016beb69ac16be416c435828be702ee694c0d722505f9c1f35e1b9c0cc1bf5"}, + {file = "ruff-0.8.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f05cdf8d050b30e2ba55c9b09330b51f9f97d36d4673213679b965d25a785f3c"}, + {file = "ruff-0.8.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60f578c11feb1d3d257b2fb043ddb47501ab4816e7e221fbb0077f0d5d4e7b6f"}, + {file = "ruff-0.8.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cbd5cf9b0ae8f30eebc7b360171bd50f59ab29d39f06a670b3e4501a36ba5897"}, + {file = "ruff-0.8.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b402ddee3d777683de60ff76da801fa7e5e8a71038f57ee53e903afbcefdaa58"}, + {file = "ruff-0.8.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:705832cd7d85605cb7858d8a13d75993c8f3ef1397b0831289109e953d833d29"}, + {file = "ruff-0.8.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:32096b41aaf7a5cc095fa45b4167b890e4c8d3fd217603f3634c92a541de7248"}, + {file = "ruff-0.8.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e769083da9439508833cfc7c23e351e1809e67f47c50248250ce1ac52c21fb93"}, + {file = "ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fe716592ae8a376c2673fdfc1f5c0c193a6d0411f90a496863c99cd9e2ae25d"}, + {file = "ruff-0.8.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:81c148825277e737493242b44c5388a300584d73d5774defa9245aaef55448b0"}, + {file = "ruff-0.8.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d261d7850c8367704874847d95febc698a950bf061c9475d4a8b7689adc4f7fa"}, + {file = "ruff-0.8.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1ca4e3a87496dc07d2427b7dd7ffa88a1e597c28dad65ae6433ecb9f2e4f022f"}, + {file = "ruff-0.8.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:729850feed82ef2440aa27946ab39c18cb4a8889c1128a6d589ffa028ddcfc22"}, + {file = "ruff-0.8.2-py3-none-win32.whl", hash = "sha256:ac42caaa0411d6a7d9594363294416e0e48fc1279e1b0e948391695db2b3d5b1"}, + {file = "ruff-0.8.2-py3-none-win_amd64.whl", hash = "sha256:2aae99ec70abf43372612a838d97bfe77d45146254568d94926e8ed5bbb409ea"}, + {file = "ruff-0.8.2-py3-none-win_arm64.whl", hash = "sha256:fb88e2a506b70cfbc2de6fae6681c4f944f7dd5f2fe87233a7233d888bad73e8"}, + {file = "ruff-0.8.2.tar.gz", hash = "sha256:b84f4f414dda8ac7f75075c1fa0b905ac0ff25361f42e6d5da681a465e0f78e5"}, +] + [[package]] name = "rust-just" version = "1.36.0" @@ -4830,4 +4857,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "acbdd61fb0fdf168df5a26b036929d325a43a54c54f8332b09e0cd6f554e4116" +content-hash = "f1281860f40b39bc21c0b5a6c0aa632f44f87401e3173fe034cf5a3f9ada5aed" diff --git a/publish.ipynb b/publish.ipynb deleted file mode 100644 index 952e48c..0000000 --- a/publish.ipynb +++ /dev/null @@ -1,119 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "\n", - "import os\n", - "\n", - "from gyandex.podgen.engine.publisher import PodcastPublisher, PodcastMetadata\n", - "from gyandex.podgen.storage.s3 import 
S3CompatibleStorage\n", - "from gyandex.podgen.feed.models import PodcastDB\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "storage = S3CompatibleStorage(\n", - " bucket=\"gyandex\",\n", - " access_key_id=os.getenv('ACCESS_KEY_ID'),\n", - " secret_access_key=os.getenv('SECRET_ACCESS_KEY'),\n", - " endpoint_url=\"https://675f4b8193843a14b144c70d7a440064.r2.cloudflarestorage.com\",\n", - " custom_domain=\"pub-347a2b64a84a441c97338968c27696c5.r2.dev\",\n", - ")\n", - "\n", - "db = PodcastDB(\n", - " db_path='assets/podcastdb.sqlite',\n", - ")\n", - "\n", - "publisher = PodcastPublisher(\n", - " storage=storage,\n", - " db=db,\n", - " base_url='https://pub-347a2b64a84a441c97338968c27696c5.r2.dev',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a new feed\n", - "feed_url = publisher.create_feed(\n", - " name=\"tech-talk\",\n", - " title=\"Tech Talk Podcast\",\n", - " description=\"A podcast about technology\",\n", - " author=\"Dhruv Baldawa\",\n", - " email=\"me@example.com\",\n", - " language=\"en\",\n", - " categories=\"Technology,News\"\n", - ")\n", - "\n", - "# Add an episode\n", - "urls = publisher.add_episode(\n", - " feed_name=\"tech-talk\",\n", - " audio_file_path=\"./generated_podcasts/podcast_20241025_021450.mp3\",\n", - " metadata=PodcastMetadata(\n", - " title=\"Prioritizing and Balancing Energy\",\n", - " description=\"Prioritizing and Balancing Energy\",\n", - " episode_number=1,\n", - " season_number=1\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "print(urls)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/pyproject.toml b/pyproject.toml index 724a5a4..947f303 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ license = "AGPL-3.0-or-later" readme = "README.md" [tool.poetry.scripts] -podgen = "gyandex.cli.podgen:main" +podgen = "gyandex.cli.genpod:main" [tool.poetry.dependencies] python = "^3.11" @@ -33,6 +33,7 @@ pytest-mock = "^3.14.0" pytest-cov = "^5.0.0" rust-just = "^1.36.0" responses = "^0.25.3" +ruff = "^0.8.2" [project.classifiers] license = "OSI Approved :: GNU Affero General Public License v3 (AGPLv3)" @@ -40,3 +41,13 @@ license = "OSI Approved :: GNU Affero General Public License v3 (AGPLv3)" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.ruff] +line-length = 120 +fix = true + +[tool.ruff.lint] +# Add the `line-too-long` rule to the enforced rule set. By default, Ruff omits rules that +# overlap with the use of a formatter, like Black, but we can override this behavior by +# explicitly adding the rule. 
+extend-select = ["E501"] \ No newline at end of file From 34f8c2acbd6b1182b0aed8a48d48204c494fb520 Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Thu, 5 Dec 2024 23:01:16 +0530 Subject: [PATCH 02/10] lint and format using ruff --- gyandex/cli/podgen.py | 4 +- gyandex/llms/factory_test.py | 4 +- gyandex/loaders/factory.py | 2 +- gyandex/loaders/factory_test.py | 3 +- gyandex/podgen/config/loader.py | 2 + gyandex/podgen/config/loader_test.py | 4 +- gyandex/podgen/config/schema.py | 4 +- gyandex/podgen/engine/publisher.py | 11 ++-- gyandex/podgen/engine/publisher_test.py | 6 ++- gyandex/podgen/feed/generator.py | 4 +- gyandex/podgen/feed/generator_test.py | 3 +- gyandex/podgen/feed/models.py | 9 ++-- gyandex/podgen/feed/models_test.py | 3 +- gyandex/podgen/speech/factory.py | 2 +- gyandex/podgen/speech/google_cloud.py | 4 +- gyandex/podgen/speech/google_cloud_test.py | 6 ++- gyandex/podgen/storage/factory.py | 4 +- gyandex/podgen/storage/factory_test.py | 6 +-- gyandex/podgen/storage/s3.py | 7 +-- gyandex/podgen/storage/s3_test.py | 4 +- gyandex/podgen/workflows/alexandria.py | 9 ++-- gyandex/podgen/workflows/factory.py | 2 +- justfile | 5 ++ poetry.lock | 33 +++++++++++- pyproject.toml | 3 +- reading-list.yaml | 61 ++++++++++++++++++++++ 26 files changed, 161 insertions(+), 44 deletions(-) create mode 100644 reading-list.yaml diff --git a/gyandex/cli/podgen.py b/gyandex/cli/podgen.py index a20b13e..83e21d2 100644 --- a/gyandex/cli/podgen.py +++ b/gyandex/cli/podgen.py @@ -7,9 +7,9 @@ from rich.console import Console from gyandex.loaders.factory import load_content -from gyandex.podgen.engine.publisher import PodcastPublisher, PodcastMetadata -from gyandex.podgen.feed.models import PodcastDB from gyandex.podgen.config.loader import load_config +from gyandex.podgen.engine.publisher import PodcastMetadata, PodcastPublisher +from gyandex.podgen.feed.models import PodcastDB from gyandex.podgen.speech.factory import get_text_to_speech_engine from gyandex.podgen.storage.factory import get_storage from gyandex.podgen.workflows.factory import get_workflow diff --git a/gyandex/llms/factory_test.py b/gyandex/llms/factory_test.py index 4e5ba2d..5ba2661 100644 --- a/gyandex/llms/factory_test.py +++ b/gyandex/llms/factory_test.py @@ -2,8 +2,8 @@ from langchain_google_genai import GoogleGenerativeAI from pydantic import ValidationError -from gyandex.llms.factory import get_model -from gyandex.podgen.config.schema import GoogleGenerativeAILLMConfig +from ..podgen.config.schema import GoogleGenerativeAILLMConfig +from .factory import get_model def test_get_model_returns_google_generative_ai(): diff --git a/gyandex/loaders/factory.py b/gyandex/loaders/factory.py index ccb32e3..eaabbd9 100644 --- a/gyandex/loaders/factory.py +++ b/gyandex/loaders/factory.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Any +from typing import Any, Dict, Optional import requests from pydantic import BaseModel diff --git a/gyandex/loaders/factory_test.py b/gyandex/loaders/factory_test.py index 0bea739..6a9a58a 100644 --- a/gyandex/loaders/factory_test.py +++ b/gyandex/loaders/factory_test.py @@ -1,5 +1,6 @@ import responses -from gyandex.loaders.factory import fetch_url + +from .factory import fetch_url @responses.activate diff --git a/gyandex/podgen/config/loader.py b/gyandex/podgen/config/loader.py index 720a947..724cff8 100644 --- a/gyandex/podgen/config/loader.py +++ b/gyandex/podgen/config/loader.py @@ -1,6 +1,8 @@ import os import re + import yaml + from .schema import PodcastConfig diff --git 
a/gyandex/podgen/config/loader_test.py b/gyandex/podgen/config/loader_test.py index 0f9fa39..550c8f3 100644 --- a/gyandex/podgen/config/loader_test.py +++ b/gyandex/podgen/config/loader_test.py @@ -1,6 +1,8 @@ import os + import pytest -from .loader import resolve_env_vars, resolve_nested_env_vars, load_config + +from .loader import load_config, resolve_env_vars, resolve_nested_env_vars from .schema import PodcastConfig diff --git a/gyandex/podgen/config/schema.py b/gyandex/podgen/config/schema.py index 908f58e..ac335ff 100644 --- a/gyandex/podgen/config/schema.py +++ b/gyandex/podgen/config/schema.py @@ -1,7 +1,7 @@ from enum import Enum -from typing import List, Optional, Union, Literal +from typing import List, Literal, Optional, Union -from pydantic import BaseModel, HttpUrl, Field +from pydantic import BaseModel, Field, HttpUrl # @TODO: Redo this, the content format can be better structured diff --git a/gyandex/podgen/engine/publisher.py b/gyandex/podgen/engine/publisher.py index e460b92..a665c1c 100644 --- a/gyandex/podgen/engine/publisher.py +++ b/gyandex/podgen/engine/publisher.py @@ -1,14 +1,15 @@ -from typing import Optional, Dict, Any, Type -import os import hashlib -from datetime import datetime -import mutagen +import os from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, Optional, Type from urllib.parse import urljoin +import mutagen + from ..feed.generator import PodcastFeedGenerator +from ..feed.models import Episode, PodcastDB from ..storage.s3 import S3CompatibleStorage -from ..feed.models import PodcastDB, Episode # @TODO: Look at URL manipulation and how URLs are used between storage diff --git a/gyandex/podgen/engine/publisher_test.py b/gyandex/podgen/engine/publisher_test.py index a9906c1..76cafa8 100644 --- a/gyandex/podgen/engine/publisher_test.py +++ b/gyandex/podgen/engine/publisher_test.py @@ -1,7 +1,9 @@ -import pytest from unittest.mock import Mock, patch -from .publisher import PodcastPublisher, PodcastMetadata + +import pytest + from ..storage.s3 import S3CompatibleStorage +from .publisher import PodcastMetadata, PodcastPublisher @pytest.fixture diff --git a/gyandex/podgen/feed/generator.py b/gyandex/podgen/feed/generator.py index fc7cc85..eae4e85 100644 --- a/gyandex/podgen/feed/generator.py +++ b/gyandex/podgen/feed/generator.py @@ -1,6 +1,8 @@ -from feedgen.feed import FeedGenerator from email.utils import formatdate + import pytz +from feedgen.feed import FeedGenerator + from .models import PodcastDB diff --git a/gyandex/podgen/feed/generator_test.py b/gyandex/podgen/feed/generator_test.py index 7878195..61499e4 100644 --- a/gyandex/podgen/feed/generator_test.py +++ b/gyandex/podgen/feed/generator_test.py @@ -1,6 +1,7 @@ -import pytest import xml.etree.ElementTree as ET +import pytest + from .generator import PodcastFeedGenerator # Feed Generator Tests diff --git a/gyandex/podgen/feed/models.py b/gyandex/podgen/feed/models.py index 72afdf9..5a1ea91 100644 --- a/gyandex/podgen/feed/models.py +++ b/gyandex/podgen/feed/models.py @@ -1,12 +1,13 @@ -from typing import Optional, Type, Tuple +from typing import Optional, Tuple, Type + from sqlalchemy import ( - create_engine, Column, - Integer, - String, DateTime, ForeignKey, + Integer, + String, Text, + create_engine, ) from sqlalchemy.orm import declarative_base, relationship, sessionmaker from sqlalchemy.sql import func diff --git a/gyandex/podgen/feed/models_test.py b/gyandex/podgen/feed/models_test.py index ebfe1fb..2213715 100644 --- 
a/gyandex/podgen/feed/models_test.py +++ b/gyandex/podgen/feed/models_test.py @@ -1,7 +1,8 @@ import os + import pytest -from .models import PodcastDB, Feed +from .models import Feed, PodcastDB @pytest.fixture diff --git a/gyandex/podgen/speech/factory.py b/gyandex/podgen/speech/factory.py index ddf1d2e..9fbf992 100644 --- a/gyandex/podgen/speech/factory.py +++ b/gyandex/podgen/speech/factory.py @@ -1,7 +1,7 @@ from typing import Union -from .google_cloud import GoogleTTSEngine from ..config.schema import GoogleCloudTTSConfig +from .google_cloud import GoogleTTSEngine # @TODO: Centralize this type and move this to a common place diff --git a/gyandex/podgen/speech/google_cloud.py b/gyandex/podgen/speech/google_cloud.py index c35930f..354f865 100644 --- a/gyandex/podgen/speech/google_cloud.py +++ b/gyandex/podgen/speech/google_cloud.py @@ -1,10 +1,10 @@ from io import BytesIO -from typing import List, Optional, Dict, Any +from typing import Any, Dict, List, Optional from google.cloud import texttospeech from pydub import AudioSegment -from ..config.schema import Participant, Gender +from ..config.schema import Gender, Participant from ..workflows.types import ScriptSegment # @TODO: Pull this out of workflows diff --git a/gyandex/podgen/speech/google_cloud_test.py b/gyandex/podgen/speech/google_cloud_test.py index fd229a9..cf248e8 100644 --- a/gyandex/podgen/speech/google_cloud_test.py +++ b/gyandex/podgen/speech/google_cloud_test.py @@ -1,7 +1,9 @@ from unittest.mock import Mock, patch + from google.cloud import texttospeech -from gyandex.podgen.processors.tts import GoogleTTSEngine -from gyandex.podgen.engine.workflows import ScriptSegment + +from ..speech.google_cloud import GoogleTTSEngine +from ..workflows.types import ScriptSegment def test_tts_engine_initialization(): diff --git a/gyandex/podgen/storage/factory.py b/gyandex/podgen/storage/factory.py index 1bf9408..d1d59ff 100644 --- a/gyandex/podgen/storage/factory.py +++ b/gyandex/podgen/storage/factory.py @@ -1,7 +1,7 @@ from typing import Union -from gyandex.podgen.config.schema import S3StorageConfig -from gyandex.podgen.storage.s3 import S3CompatibleStorage +from ..config.schema import S3StorageConfig +from ..storage.s3 import S3CompatibleStorage # @TODO: Centralize this type and move this to a common place diff --git a/gyandex/podgen/storage/factory_test.py b/gyandex/podgen/storage/factory_test.py index b9074c5..05d62ce 100644 --- a/gyandex/podgen/storage/factory_test.py +++ b/gyandex/podgen/storage/factory_test.py @@ -1,9 +1,9 @@ import pytest from pydantic import ValidationError -from gyandex.podgen.config.schema import S3StorageConfig -from gyandex.podgen.storage.factory import get_storage -from gyandex.podgen.storage.s3 import S3CompatibleStorage +from ..config.schema import S3StorageConfig +from .factory import get_storage +from .s3 import S3CompatibleStorage def test_get_storage_returns_s3_storage(): diff --git a/gyandex/podgen/storage/s3.py b/gyandex/podgen/storage/s3.py index df4d2b1..55a32db 100644 --- a/gyandex/podgen/storage/s3.py +++ b/gyandex/podgen/storage/s3.py @@ -1,8 +1,9 @@ -from typing import Optional, Dict, Any -import boto3 -from botocore.client import Config import mimetypes import os +from typing import Any, Dict, Optional + +import boto3 +from botocore.client import Config class S3CompatibleStorage: diff --git a/gyandex/podgen/storage/s3_test.py b/gyandex/podgen/storage/s3_test.py index 37b07ba..e9afca0 100644 --- a/gyandex/podgen/storage/s3_test.py +++ b/gyandex/podgen/storage/s3_test.py @@ -1,6 +1,8 @@ 
+from unittest.mock import ANY, Mock, patch + import pytest -from unittest.mock import Mock, patch, ANY from botocore.exceptions import ClientError + from .s3 import S3CompatibleStorage diff --git a/gyandex/podgen/workflows/alexandria.py b/gyandex/podgen/workflows/alexandria.py index 5720089..3eebb67 100644 --- a/gyandex/podgen/workflows/alexandria.py +++ b/gyandex/podgen/workflows/alexandria.py @@ -1,13 +1,14 @@ +import asyncio +from typing import List, Union + from langchain.output_parsers import PydanticOutputParser from langchain.prompts import PromptTemplate -from typing import List, Union -import asyncio from rich import print as rprint -from .types import PodcastOutline, ScriptSegment, PodcastEpisode, OutlineSegment from ...llms.factory import get_model -from ..config.schema import PodcastConfig, GoogleGenerativeAILLMConfig, Participant from ...loaders.factory import Document +from ..config.schema import GoogleGenerativeAILLMConfig, Participant, PodcastConfig +from .types import OutlineSegment, PodcastEpisode, PodcastOutline, ScriptSegment class OutlineGenerator: diff --git a/gyandex/podgen/workflows/factory.py b/gyandex/podgen/workflows/factory.py index d7d7071..16467e0 100644 --- a/gyandex/podgen/workflows/factory.py +++ b/gyandex/podgen/workflows/factory.py @@ -1,7 +1,7 @@ from typing import Union -from .alexandria import AlexandriaWorkflow from ..config.schema import PodcastConfig +from .alexandria import AlexandriaWorkflow def get_workflow(config: PodcastConfig) -> Union[AlexandriaWorkflow]: diff --git a/justfile b/justfile index 9316b4e..c1a5751 100644 --- a/justfile +++ b/justfile @@ -1,2 +1,7 @@ test: pytest --cov=gyandex --cov-report html --cov-report term:skip-covered gyandex/ + +lint: + ruff check + ruff check --select I --fix + ruff format diff --git a/poetry.lock b/poetry.lock index 5ac5676..5cb7900 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2625,6 +2625,17 @@ example = ["cairocffi (>=1.7)", "contextily (>=1.6)", "igraph (>=0.11)", "momepy extra = ["lxml (>=4.6)", "pydot (>=3.0.1)", "pygraphviz (>=1.14)", "sympy (>=1.10)"] test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] +[[package]] +name = "nodeenv" +version = "1.9.1" +description = "Node.js virtual environment builder" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, + {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, +] + [[package]] name = "notebook" version = "7.2.2" @@ -3556,6 +3567,26 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pyright" +version = "1.1.390" +description = "Command line wrapper for pyright" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pyright-1.1.390-py3-none-any.whl", hash = "sha256:ecebfba5b6b50af7c1a44c2ba144ba2ab542c227eb49bc1f16984ff714e0e110"}, + {file = "pyright-1.1.390.tar.gz", hash = "sha256:aad7f160c49e0fbf8209507a15e17b781f63a86a1facb69ca877c71ef2e9538d"}, +] + +[package.dependencies] +nodeenv = ">=1.6.0" +typing-extensions = ">=4.1" + +[package.extras] +all = ["nodejs-wheel-binaries", "twine (>=3.4.1)"] +dev = ["twine (>=3.4.1)"] +nodejs = ["nodejs-wheel-binaries"] + [[package]] name = "pytest" version = "8.3.3" @@ -4857,4 +4888,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = 
"f1281860f40b39bc21c0b5a6c0aa632f44f87401e3173fe034cf5a3f9ada5aed" +content-hash = "592f5a9a27d922b3d5940c94210b79988cd92b6a3b1a6422bd03959c953fbf24" diff --git a/pyproject.toml b/pyproject.toml index 947f303..78769cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ pytest-cov = "^5.0.0" rust-just = "^1.36.0" responses = "^0.25.3" ruff = "^0.8.2" +pyright = "^1.1.390" [project.classifiers] license = "OSI Approved :: GNU Affero General Public License v3 (AGPLv3)" @@ -50,4 +51,4 @@ fix = true # Add the `line-too-long` rule to the enforced rule set. By default, Ruff omits rules that # overlap with the use of a formatter, like Black, but we can override this behavior by # explicitly adding the rule. -extend-select = ["E501"] \ No newline at end of file +extend-select = ["E501", "I"] \ No newline at end of file diff --git a/reading-list.yaml b/reading-list.yaml new file mode 100644 index 0000000..c725e96 --- /dev/null +++ b/reading-list.yaml @@ -0,0 +1,61 @@ +version: "1.0" +content: +# source: "https://notes.mtb.xyz/p/invisible-asymptotes-vertical-software" + source: "https://learnings.aleixmorgadas.dev/p/dealing-with-teams-with-competing" + format: "html" + +workflow: + name: alexandria + verbose: true + outline: + provider: "google-generative-ai" + model: "gemini-1.5-pro" + temperature: 0.4 + google_api_key: "${GOOGLE_API_KEY}" + script: + provider: "google-generative-ai" + model: "gemini-1.5-flash" + temperature: 0.8 + google_api_key: "${GOOGLE_API_KEY}" + +tts: + provider: "google-cloud" + participants: + - name: Sarah + personality: | + An enthusiastic and knowledgeable tech journalist with 10 years of experience. + Style: Articulate, engaging, asks insightful questions, and guides the conversation smoothly. + voice: en-US-Journey-F + language_code: en-US +# voice: en-GB-Neural2-N +# language_code: en-GB + gender: female + - name: Mike + personality: | + A practical industry expert with hands-on experience. + Style: Down-to-earth, provides real-world examples, occasionally humorous, and good at breaking down complex topics. 
+ voice: en-US-Journey-D + language_code: en-US +# voice: en-GB-Neural2-O +# language_code: en-GB + gender: male + +storage: + provider: "s3" + access_key: "${ACCESS_KEY_ID}" + secret_key: "${SECRET_ACCESS_KEY}" + bucket: "gyandex" + region: "us-east-1" + endpoint: "https://675f4b8193843a14b144c70d7a440064.r2.cloudflarestorage.com" + custom_domain: "pub-347a2b64a84a441c97338968c27696c5.r2.dev" + +feed: + title: "Gyandex: Tech Reading" + slug: "reading-list" + description: "Technical reading list curated by Dhruv Baldawa" + author: "Dhruv Baldawa" + email: "me@dhruvb.com" + language: "en" + categories: ["Technology", "Software Development", "Programming"] + image: "https://images.pexels.com/photos/26730962/pexels-photo-26730962.jpeg?cs=srgb&dl=pexels-helloaesthe-26730962.jpg&fm=jpg&w=640&h=960" + website: "https://github.com/dhruvbaldawa/gyandex" From c92a5e4aeb49fed40afe27818d733f192290ec77 Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Thu, 5 Dec 2024 23:46:45 +0530 Subject: [PATCH 03/10] Add pyright --- gyandex/llms/factory.py | 6 +-- gyandex/podgen/config/loader.py | 2 +- gyandex/podgen/config/schema.py | 10 ++-- gyandex/podgen/engine/publisher.py | 6 +-- gyandex/podgen/feed/generator.py | 28 +++++----- gyandex/podgen/feed/models.py | 4 +- gyandex/podgen/speech/factory.py | 2 +- gyandex/podgen/speech/google_cloud.py | 4 +- gyandex/podgen/speech/google_cloud_test.py | 4 +- gyandex/podgen/storage/factory.py | 2 +- gyandex/podgen/storage/factory_test.py | 2 +- gyandex/podgen/storage/s3.py | 4 +- gyandex/podgen/workflows/alexandria.py | 6 +-- gyandex/podgen/workflows/factory.py | 2 +- justfile | 6 +-- pyproject.toml | 18 ++++++- reading-list.yaml | 61 ---------------------- 17 files changed, 61 insertions(+), 106 deletions(-) delete mode 100644 reading-list.yaml diff --git a/gyandex/llms/factory.py b/gyandex/llms/factory.py index f51004f..a91eda0 100644 --- a/gyandex/llms/factory.py +++ b/gyandex/llms/factory.py @@ -40,13 +40,13 @@ def on_llm_error(self, error, **kwargs): # @TODO: Centralize this argument type in a single place -def get_model(config: Union[GoogleGenerativeAILLMConfig], log_dir="assets"): +def get_model(config: Union[GoogleGenerativeAILLMConfig], log_dir="assets"): # pyright: ignore [reportInvalidTypeArguments] if config.provider == "google-generative-ai": return GoogleGenerativeAI( model=config.model, temperature=config.temperature, - google_api_key=config.google_api_key, - max_output_tokens=8192, # @TODO: Move this to config params + google_api_key=config.google_api_key, # pyright: ignore [reportCallIssue] + max_output_tokens=8192, # @TODO: Move this to config params # pyright: ignore [reportCallIssue] callbacks=[LLMLoggingCallback(log_dir)], ) else: diff --git a/gyandex/podgen/config/loader.py b/gyandex/podgen/config/loader.py index 724cff8..bdb2225 100644 --- a/gyandex/podgen/config/loader.py +++ b/gyandex/podgen/config/loader.py @@ -43,4 +43,4 @@ def load_config(config_path: str) -> PodcastConfig: config_dict = resolve_nested_env_vars(config_dict) # Parse with Pydantic - return PodcastConfig(**config_dict) + return PodcastConfig(**config_dict) # pyright: ignore [reportCallIssue] diff --git a/gyandex/podgen/config/schema.py b/gyandex/podgen/config/schema.py index ac335ff..c12e1ec 100644 --- a/gyandex/podgen/config/schema.py +++ b/gyandex/podgen/config/schema.py @@ -31,8 +31,8 @@ class GoogleGenerativeAILLMConfig(BaseModel): class AlexandriaWorkflowConfig(BaseModel): name: Literal["alexandria"] - outline: Union[GoogleGenerativeAILLMConfig] - script: 
Union[GoogleGenerativeAILLMConfig] + outline: Union[GoogleGenerativeAILLMConfig] # pyright: ignore [reportInvalidTypeArguments] + script: Union[GoogleGenerativeAILLMConfig] # pyright: ignore [reportInvalidTypeArguments] verbose: Optional[bool] = False @@ -90,7 +90,7 @@ class ContentStructure(BaseModel): class PodcastConfig(BaseModel): version: str content: ContentConfig - workflow: Union[AlexandriaWorkflowConfig] = Field(discriminator="name") - tts: Union[GoogleCloudTTSConfig] = Field(discriminator="provider") - storage: Union[S3StorageConfig] = Field(discriminator="provider") + workflow: Union[AlexandriaWorkflowConfig] = Field(discriminator="name") # pyright: ignore [reportInvalidTypeArguments] + tts: Union[GoogleCloudTTSConfig] = Field(discriminator="provider") # pyright: ignore [reportInvalidTypeArguments] + storage: Union[S3StorageConfig] = Field(discriminator="provider") # pyright: ignore [reportInvalidTypeArguments] feed: FeedConfig diff --git a/gyandex/podgen/engine/publisher.py b/gyandex/podgen/engine/publisher.py index a665c1c..6910f3b 100644 --- a/gyandex/podgen/engine/publisher.py +++ b/gyandex/podgen/engine/publisher.py @@ -2,7 +2,7 @@ import os from dataclasses import dataclass from datetime import datetime -from typing import Any, Dict, Optional, Type +from typing import Any, Dict, Optional, Sequence from urllib.parse import urljoin import mutagen @@ -53,7 +53,7 @@ def __init__( def _get_audio_metadata(self, file_path: str) -> Dict[str, Any]: """Extract metadata from audio file.""" - audio = mutagen.File(file_path) + audio = mutagen.File(file_path) # pyright: ignore [reportPrivateImportUsage] metadata = {} if audio is not None: @@ -183,6 +183,6 @@ def get_feed_url(self, feed_slug: str) -> str: """Get the URL for a feed.""" return urljoin(self.base_url, f"{self.feed_prefix}/{feed_slug}.xml") - def list_episodes(self, feed_slug: str, limit: Optional[int] = None) -> list[Type[Episode]]: + def list_episodes(self, feed_slug: str, limit: Optional[int] = None) -> Sequence[Episode]: """List episodes in a feed.""" return self.db.get_episodes(feed_slug, limit) diff --git a/gyandex/podgen/feed/generator.py b/gyandex/podgen/feed/generator.py index eae4e85..60e67a2 100644 --- a/gyandex/podgen/feed/generator.py +++ b/gyandex/podgen/feed/generator.py @@ -35,15 +35,15 @@ def generate_feed(self, slug: str) -> str: fg.language(feed_data.language) fg.copyright(feed_data.copyright) - if feed_data.image_url: + if feed_data.image_url is not None: fg.logo(feed_data.image_url) fg.image(feed_data.image_url) # iTunes specific tags - fg.podcast.itunes_category(feed_data.categories.split(",")[0] if feed_data.categories else "Technology") - fg.podcast.itunes_explicit(feed_data.explicit) - fg.podcast.itunes_author(feed_data.author) - fg.podcast.itunes_owner(name=feed_data.author, email=feed_data.email) + fg.podcast.itunes_category(feed_data.categories.split(",")[0] if feed_data.categories else "Technology") # pyright: ignore [reportAttributeAccessIssue, reportGeneralTypeIssues] + fg.podcast.itunes_explicit(feed_data.explicit) # pyright: ignore [reportAttributeAccessIssue, reportGeneralTypeIssues] + fg.podcast.itunes_author(feed_data.author) # pyright: ignore [reportAttributeAccessIssue, reportGeneralTypeIssues] + fg.podcast.itunes_owner(name=feed_data.author, email=feed_data.email) # pyright: ignore [reportAttributeAccessIssue, reportGeneralTypeIssues] # Add episodes episodes = self.db.get_episodes(slug) @@ -61,14 +61,14 @@ def generate_feed(self, slug: str) -> str: fe.enclosure(episode.audio_url, 
str(episode.file_size), episode.mime_type) # iTunes specific episode tags - fe.podcast.itunes_duration(str(episode.duration) if episode.duration else "0") - fe.podcast.itunes_explicit(episode.explicit) - if episode.image_url: - fe.podcast.itunes_image(episode.image_url) - if episode.episode_number: - fe.podcast.itunes_episode(str(episode.episode_number)) - if episode.season_number: - fe.podcast.itunes_season(str(episode.season_number)) - fe.podcast.itunes_episode_type(episode.episode_type) + fe.podcast.itunes_duration(str(episode.duration) if episode.duration is not None else "0") # pyright: ignore [reportAttributeAccessIssue] + fe.podcast.itunes_explicit(episode.explicit) # pyright: ignore [reportAttributeAccessIssue] + if episode.image_url is not None: + fe.podcast.itunes_image(episode.image_url) # pyright: ignore [reportAttributeAccessIssue] + if episode.episode_number is not None: + fe.podcast.itunes_episode(str(episode.episode_number)) # pyright: ignore [reportAttributeAccessIssue] + if episode.season_number is not None: + fe.podcast.itunes_season(str(episode.season_number)) # pyright: ignore [reportAttributeAccessIssue] + fe.podcast.itunes_episode_type(episode.episode_type) # pyright: ignore [reportAttributeAccessIssue] return fg.rss_str(pretty=True).decode("utf-8") diff --git a/gyandex/podgen/feed/models.py b/gyandex/podgen/feed/models.py index 5a1ea91..71f4111 100644 --- a/gyandex/podgen/feed/models.py +++ b/gyandex/podgen/feed/models.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple, Type +from typing import Optional, Sequence, Tuple from sqlalchemy import ( Column, @@ -120,7 +120,7 @@ def add_episode(self, feed_slug: str, title: str, audio_url: str, guid: str, **k return episode # @TODO: Update using the feed id, instead of name - def get_episodes(self, feed_slug: str, limit: int = None) -> list[Type[Episode]]: + def get_episodes(self, feed_slug: str, limit: Optional[int] = None) -> Sequence[Episode]: with self.session() as session: query = ( session.query(Episode) diff --git a/gyandex/podgen/speech/factory.py b/gyandex/podgen/speech/factory.py index 9fbf992..3a32f78 100644 --- a/gyandex/podgen/speech/factory.py +++ b/gyandex/podgen/speech/factory.py @@ -5,7 +5,7 @@ # @TODO: Centralize this type and move this to a common place -def get_text_to_speech_engine(tts_config: Union[GoogleCloudTTSConfig]): +def get_text_to_speech_engine(tts_config: Union[GoogleCloudTTSConfig]): # pyright: ignore [reportInvalidTypeArguments] if tts_config.provider == "google-cloud": return GoogleTTSEngine(tts_config.participants) else: diff --git a/gyandex/podgen/speech/google_cloud.py b/gyandex/podgen/speech/google_cloud.py index 354f865..a5cf6b1 100644 --- a/gyandex/podgen/speech/google_cloud.py +++ b/gyandex/podgen/speech/google_cloud.py @@ -5,7 +5,7 @@ from pydub import AudioSegment from ..config.schema import Gender, Participant -from ..workflows.types import ScriptSegment # @TODO: Pull this out of workflows +from ..workflows.types import DialogueLine # @TODO: Pull this out of workflows class GoogleTTSEngine: @@ -33,7 +33,7 @@ def resolve_gender(gender: Gender): for participant in participants } - def process_segment(self, segment: ScriptSegment) -> bytes: + def process_segment(self, segment: DialogueLine) -> bytes: return self.synthesize_speech(segment.text, segment.speaker) def synthesize_speech(self, text: str, speaker: str) -> bytes: diff --git a/gyandex/podgen/speech/google_cloud_test.py b/gyandex/podgen/speech/google_cloud_test.py index cf248e8..71b7d89 100644 --- 
a/gyandex/podgen/speech/google_cloud_test.py +++ b/gyandex/podgen/speech/google_cloud_test.py @@ -3,7 +3,7 @@ from google.cloud import texttospeech from ..speech.google_cloud import GoogleTTSEngine -from ..workflows.types import ScriptSegment +from ..workflows.types import DialogueLine, ScriptSegment def test_tts_engine_initialization(): @@ -39,7 +39,7 @@ def test_process_segment(mock_client): """Tests processing of a complete podcast segment""" # Given engine = GoogleTTSEngine() - segment = ScriptSegment(dialogue="Test segment", speaker="HOST1") + segment = ScriptSegment(dialogue=[DialogueLine(text="Test segment", speaker="HOST1")]) mock_response = Mock() mock_response.audio_content = b"test_audio_content" mock_client.return_value.synthesize_speech.return_value = mock_response diff --git a/gyandex/podgen/storage/factory.py b/gyandex/podgen/storage/factory.py index d1d59ff..0e58cc5 100644 --- a/gyandex/podgen/storage/factory.py +++ b/gyandex/podgen/storage/factory.py @@ -5,7 +5,7 @@ # @TODO: Centralize this type and move this to a common place -def get_storage(config: Union[S3StorageConfig]) -> S3CompatibleStorage: +def get_storage(config: Union[S3StorageConfig]) -> S3CompatibleStorage: # pyright: ignore [reportInvalidTypeArguments] if config.provider != "s3": # @TODO: Move this to a enum raise NotImplementedError(f"Unsupported storage provider: {config.provider}") diff --git a/gyandex/podgen/storage/factory_test.py b/gyandex/podgen/storage/factory_test.py index 05d62ce..1d2626b 100644 --- a/gyandex/podgen/storage/factory_test.py +++ b/gyandex/podgen/storage/factory_test.py @@ -33,7 +33,7 @@ def test_get_storage_raises_for_unsupported_provider(): # When/Then with pytest.raises(ValidationError): _ = S3StorageConfig( - provider="unsupported", + provider="s3", bucket="test-bucket", access_key="test-access-key", secret_key="test-secret-key", diff --git a/gyandex/podgen/storage/s3.py b/gyandex/podgen/storage/s3.py index 55a32db..5e96cca 100644 --- a/gyandex/podgen/storage/s3.py +++ b/gyandex/podgen/storage/s3.py @@ -17,7 +17,7 @@ def __init__( access_key_id: str, secret_access_key: str, endpoint_url: Optional[str] = None, - region_name: str = "auto", + region_name: Optional[str] = "auto", custom_domain: Optional[str] = None, acl: str = "public-read", ): @@ -74,7 +74,7 @@ def upload_file( if not content_type: content_type = "application/octet-stream" - extra_args = {"ACL": self.acl, "ContentType": content_type} + extra_args: Dict[str, Any] = {"ACL": self.acl, "ContentType": content_type} if metadata: extra_args["Metadata"] = metadata diff --git a/gyandex/podgen/workflows/alexandria.py b/gyandex/podgen/workflows/alexandria.py index 3eebb67..110ebf4 100644 --- a/gyandex/podgen/workflows/alexandria.py +++ b/gyandex/podgen/workflows/alexandria.py @@ -12,7 +12,7 @@ class OutlineGenerator: - def __init__(self, config: Union[GoogleGenerativeAILLMConfig]): + def __init__(self, config: Union[GoogleGenerativeAILLMConfig]): # pyright: ignore [reportInvalidTypeArguments] self.model = get_model(config) self.parser = PydanticOutputParser(pydantic_object=PodcastOutline) @@ -51,7 +51,7 @@ def generate_outline(self, document: Document) -> PodcastOutline: class ScriptGenerator: - def __init__(self, config: Union[GoogleGenerativeAILLMConfig], participants: List[Participant]): + def __init__(self, config: Union[GoogleGenerativeAILLMConfig], participants: List[Participant]): # pyright: ignore [reportInvalidTypeArguments] self.model = get_model(config) self.parser = PydanticOutputParser(pydantic_object=ScriptSegment) 
@@ -191,7 +191,7 @@ async def generate_script(self, document: Document) -> PodcastEpisode: rprint(f"Transition: {segment.transition}\n") # Generate script segments - script_segments = await script_gen.generate_full_script(outline, document) + script_segments = await script_gen.generate_full_script(outline, document.content) if self.config.workflow.verbose: # Print results in dialogue format diff --git a/gyandex/podgen/workflows/factory.py b/gyandex/podgen/workflows/factory.py index 16467e0..3dc883a 100644 --- a/gyandex/podgen/workflows/factory.py +++ b/gyandex/podgen/workflows/factory.py @@ -4,7 +4,7 @@ from .alexandria import AlexandriaWorkflow -def get_workflow(config: PodcastConfig) -> Union[AlexandriaWorkflow]: +def get_workflow(config: PodcastConfig) -> Union[AlexandriaWorkflow]: # pyright: ignore [reportInvalidTypeArguments] """Get workflow based on config""" if config.workflow.name == "alexandria": return AlexandriaWorkflow(config) diff --git a/justfile b/justfile index c1a5751..660641c 100644 --- a/justfile +++ b/justfile @@ -1,7 +1,7 @@ test: pytest --cov=gyandex --cov-report html --cov-report term:skip-covered gyandex/ -lint: - ruff check - ruff check --select I --fix +check: + pyright && \ + ruff check && \ ruff format diff --git a/pyproject.toml b/pyproject.toml index 78769cd..6b70856 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,4 +51,20 @@ fix = true # Add the `line-too-long` rule to the enforced rule set. By default, Ruff omits rules that # overlap with the use of a formatter, like Black, but we can override this behavior by # explicitly adding the rule. -extend-select = ["E501", "I"] \ No newline at end of file +extend-select = ["E501", "I"] + +[tool.pyright] +include = ["gyandex"] +exclude = [ + "**/__pycache__", + "**/*_test.py", # @FIXME: remove this once code issues are resolved +] + +defineConstant = { DEBUG = true } + +reportMissingImports = "error" +reportMissingTypeStubs = false + +executionEnvironments = [ + { root = "gyandex" } +] \ No newline at end of file diff --git a/reading-list.yaml b/reading-list.yaml deleted file mode 100644 index c725e96..0000000 --- a/reading-list.yaml +++ /dev/null @@ -1,61 +0,0 @@ -version: "1.0" -content: -# source: "https://notes.mtb.xyz/p/invisible-asymptotes-vertical-software" - source: "https://learnings.aleixmorgadas.dev/p/dealing-with-teams-with-competing" - format: "html" - -workflow: - name: alexandria - verbose: true - outline: - provider: "google-generative-ai" - model: "gemini-1.5-pro" - temperature: 0.4 - google_api_key: "${GOOGLE_API_KEY}" - script: - provider: "google-generative-ai" - model: "gemini-1.5-flash" - temperature: 0.8 - google_api_key: "${GOOGLE_API_KEY}" - -tts: - provider: "google-cloud" - participants: - - name: Sarah - personality: | - An enthusiastic and knowledgeable tech journalist with 10 years of experience. - Style: Articulate, engaging, asks insightful questions, and guides the conversation smoothly. - voice: en-US-Journey-F - language_code: en-US -# voice: en-GB-Neural2-N -# language_code: en-GB - gender: female - - name: Mike - personality: | - A practical industry expert with hands-on experience. - Style: Down-to-earth, provides real-world examples, occasionally humorous, and good at breaking down complex topics. 
- voice: en-US-Journey-D - language_code: en-US -# voice: en-GB-Neural2-O -# language_code: en-GB - gender: male - -storage: - provider: "s3" - access_key: "${ACCESS_KEY_ID}" - secret_key: "${SECRET_ACCESS_KEY}" - bucket: "gyandex" - region: "us-east-1" - endpoint: "https://675f4b8193843a14b144c70d7a440064.r2.cloudflarestorage.com" - custom_domain: "pub-347a2b64a84a441c97338968c27696c5.r2.dev" - -feed: - title: "Gyandex: Tech Reading" - slug: "reading-list" - description: "Technical reading list curated by Dhruv Baldawa" - author: "Dhruv Baldawa" - email: "me@dhruvb.com" - language: "en" - categories: ["Technology", "Software Development", "Programming"] - image: "https://images.pexels.com/photos/26730962/pexels-photo-26730962.jpeg?cs=srgb&dl=pexels-helloaesthe-26730962.jpg&fm=jpg&w=640&h=960" - website: "https://github.com/dhruvbaldawa/gyandex" From a40ee452af55391dc347735ce02617c872c763de Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Thu, 5 Dec 2024 23:58:28 +0530 Subject: [PATCH 04/10] fix broken tests --- gyandex/podgen/config/loader_test.py | 44 ++++++++++++++-------- gyandex/podgen/engine/publisher_test.py | 2 +- gyandex/podgen/feed/generator_test.py | 4 +- gyandex/podgen/speech/google_cloud_test.py | 24 +++++++++--- gyandex/podgen/storage/factory_test.py | 2 +- 5 files changed, 51 insertions(+), 25 deletions(-) diff --git a/gyandex/podgen/config/loader_test.py b/gyandex/podgen/config/loader_test.py index 550c8f3..1432389 100644 --- a/gyandex/podgen/config/loader_test.py +++ b/gyandex/podgen/config/loader_test.py @@ -69,22 +69,36 @@ def test_load_config_parses_yaml_with_env_vars(tmp_path): content: format: html source: https://example.com/feed - llm: - provider: google-generative-ai - google_api_key: test_api_key - model: gpt-3.5-turbo - temperature: 1 - max_tokens: 1000 - script_template: test_template - system_prompt: test_prompt + workflow: + name: alexandria + verbose: true + outline: + provider: "google-generative-ai" + model: "gemini-1.5-pro" + temperature: 0.4 + google_api_key: "xxx" + script: + provider: "google-generative-ai" + model: "gemini-1.5-flash" + temperature: 0.8 + google_api_key: "xxx" tts: - provider: test - default_voice: test_voice - voices: - test_voice: - voice_id: test_voice_id - speaking_rate: 1.0 - pitch: 100 + provider: "google-cloud" + participants: + - name: Sarah + personality: | + An enthusiastic and knowledgeable tech journalist with 10 years of experience. + Style: Articulate, engaging, asks insightful questions, and guides the conversation smoothly. + voice: en-US-Journey-F + language_code: en-US + gender: female + - name: Mike + personality: | + A practical industry expert with hands-on experience. + Style: Down-to-earth, provides real-world examples, occasionally humorous, and good at breaking down complex topics. 
+ voice: en-US-Journey-D + language_code: en-US + gender: male storage: provider: s3 bucket: test_bucket diff --git a/gyandex/podgen/engine/publisher_test.py b/gyandex/podgen/engine/publisher_test.py index 76cafa8..167cf4b 100644 --- a/gyandex/podgen/engine/publisher_test.py +++ b/gyandex/podgen/engine/publisher_test.py @@ -4,7 +4,7 @@ from ..storage.s3 import S3CompatibleStorage from .publisher import PodcastMetadata, PodcastPublisher - +from ..feed.models_test import test_db @pytest.fixture def mock_storage(): diff --git a/gyandex/podgen/feed/generator_test.py b/gyandex/podgen/feed/generator_test.py index 61499e4..bf9f063 100644 --- a/gyandex/podgen/feed/generator_test.py +++ b/gyandex/podgen/feed/generator_test.py @@ -3,9 +3,7 @@ import pytest from .generator import PodcastFeedGenerator - -# Feed Generator Tests - +from ..feed.models_test import test_db, sample_feed_data, sample_episode_data def test_generate_feed_xml(test_db, sample_feed_data, sample_episode_data): """ diff --git a/gyandex/podgen/speech/google_cloud_test.py b/gyandex/podgen/speech/google_cloud_test.py index 71b7d89..7e5d610 100644 --- a/gyandex/podgen/speech/google_cloud_test.py +++ b/gyandex/podgen/speech/google_cloud_test.py @@ -2,14 +2,28 @@ from google.cloud import texttospeech +from ..config.schema import Participant, Gender from ..speech.google_cloud import GoogleTTSEngine from ..workflows.types import DialogueLine, ScriptSegment - +dummy_participants = [ + Participant( + name="HOST1", + language_code="en-US", + voice="en-US-Neural2-F", + gender=Gender.FEMALE + ), + Participant( + name="HOST2", + language_code="en-US", + voice="en-US-Neural2-F", + gender=Gender.FEMALE + ) +] def test_tts_engine_initialization(): """Tests that TTSEngine initializes with correct voice configurations""" # Given/When - engine = GoogleTTSEngine() + engine = GoogleTTSEngine(participants=dummy_participants) # Then assert "HOST1" in engine.voices @@ -21,7 +35,7 @@ def test_tts_engine_initialization(): def test_synthesize_speech_for_host1(mock_client): """Tests speech synthesis for HOST1 voice""" # Given - engine = GoogleTTSEngine() + engine = GoogleTTSEngine(participants=dummy_participants) mock_response = Mock() mock_response.audio_content = b"test_audio_content" mock_client.return_value.synthesize_speech.return_value = mock_response @@ -38,8 +52,8 @@ def test_synthesize_speech_for_host1(mock_client): def test_process_segment(mock_client): """Tests processing of a complete podcast segment""" # Given - engine = GoogleTTSEngine() - segment = ScriptSegment(dialogue=[DialogueLine(text="Test segment", speaker="HOST1")]) + engine = GoogleTTSEngine(participants=dummy_participants) + segment = DialogueLine(text="Test segment", speaker="HOST1") mock_response = Mock() mock_response.audio_content = b"test_audio_content" mock_client.return_value.synthesize_speech.return_value = mock_response diff --git a/gyandex/podgen/storage/factory_test.py b/gyandex/podgen/storage/factory_test.py index 1d2626b..05d62ce 100644 --- a/gyandex/podgen/storage/factory_test.py +++ b/gyandex/podgen/storage/factory_test.py @@ -33,7 +33,7 @@ def test_get_storage_raises_for_unsupported_provider(): # When/Then with pytest.raises(ValidationError): _ = S3StorageConfig( - provider="s3", + provider="unsupported", bucket="test-bucket", access_key="test-access-key", secret_key="test-secret-key", From f55b6371cd02c4c74d879d41647bbea49be2978b Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Sat, 14 Dec 2024 12:20:33 +0530 Subject: [PATCH 05/10] Fix broken tests --- 
gyandex/podgen/config/loader_test.py | 3 ++- gyandex/podgen/engine/publisher_test.py | 2 +- gyandex/podgen/feed/generator_test.py | 2 +- gyandex/podgen/speech/factory_test.py | 29 ++++++++++++++++++++++ gyandex/podgen/speech/google_cloud_test.py | 20 +++++---------- gyandex/podgen/workflows/factory_test.py | 29 ++++++++++++++++++++++ 6 files changed, 68 insertions(+), 17 deletions(-) create mode 100644 gyandex/podgen/speech/factory_test.py create mode 100644 gyandex/podgen/workflows/factory_test.py diff --git a/gyandex/podgen/config/loader_test.py b/gyandex/podgen/config/loader_test.py index 1432389..8c49baf 100644 --- a/gyandex/podgen/config/loader_test.py +++ b/gyandex/podgen/config/loader_test.py @@ -95,7 +95,8 @@ def test_load_config_parses_yaml_with_env_vars(tmp_path): - name: Mike personality: | A practical industry expert with hands-on experience. - Style: Down-to-earth, provides real-world examples, occasionally humorous, and good at breaking down complex topics. + Style: Down-to-earth, provides real-world examples, occasionally humorous, + and good at breaking down complex topics. voice: en-US-Journey-D language_code: en-US gender: male diff --git a/gyandex/podgen/engine/publisher_test.py b/gyandex/podgen/engine/publisher_test.py index 167cf4b..76cafa8 100644 --- a/gyandex/podgen/engine/publisher_test.py +++ b/gyandex/podgen/engine/publisher_test.py @@ -4,7 +4,7 @@ from ..storage.s3 import S3CompatibleStorage from .publisher import PodcastMetadata, PodcastPublisher -from ..feed.models_test import test_db + @pytest.fixture def mock_storage(): diff --git a/gyandex/podgen/feed/generator_test.py b/gyandex/podgen/feed/generator_test.py index bf9f063..12e85d8 100644 --- a/gyandex/podgen/feed/generator_test.py +++ b/gyandex/podgen/feed/generator_test.py @@ -3,7 +3,7 @@ import pytest from .generator import PodcastFeedGenerator -from ..feed.models_test import test_db, sample_feed_data, sample_episode_data + def test_generate_feed_xml(test_db, sample_feed_data, sample_episode_data): """ diff --git a/gyandex/podgen/speech/factory_test.py b/gyandex/podgen/speech/factory_test.py new file mode 100644 index 0000000..3682139 --- /dev/null +++ b/gyandex/podgen/speech/factory_test.py @@ -0,0 +1,29 @@ +import pytest + +from ..config.schema import Gender, GoogleCloudTTSConfig, Participant +from .factory import get_text_to_speech_engine +from .google_cloud import GoogleTTSEngine + + +def test_get_text_to_speech_engine_returns_google_cloud(): + """Tests that get_text_to_speech_engine creates a GoogleTTSEngine instance with correct config""" + # Given + participants = [Participant(name="HOST1", language_code="en-US", voice="en-US-Neural2-F", gender=Gender.FEMALE)] + config = GoogleCloudTTSConfig(provider="google-cloud", participants=participants) + + # When + engine = get_text_to_speech_engine(config) + + # Then + assert isinstance(engine, GoogleTTSEngine) + assert engine.voices["HOST1"].name == "en-US-Neural2-F" + + +def test_get_text_to_speech_engine_raises_for_unsupported_provider(): + """Tests that get_text_to_speech_engine raises NotImplementedError for unsupported providers""" + # Given + config = GoogleCloudTTSConfig.model_construct(provider="unsupported", participants=[]) + + # When/Then + with pytest.raises(NotImplementedError, match="Unsupported TTS provider: unsupported"): + get_text_to_speech_engine(config) diff --git a/gyandex/podgen/speech/google_cloud_test.py b/gyandex/podgen/speech/google_cloud_test.py index 7e5d610..e0492fa 100644 --- a/gyandex/podgen/speech/google_cloud_test.py +++ 
b/gyandex/podgen/speech/google_cloud_test.py @@ -2,24 +2,16 @@ from google.cloud import texttospeech -from ..config.schema import Participant, Gender +from ..config.schema import Gender, Participant from ..speech.google_cloud import GoogleTTSEngine -from ..workflows.types import DialogueLine, ScriptSegment +from ..workflows.types import DialogueLine dummy_participants = [ - Participant( - name="HOST1", - language_code="en-US", - voice="en-US-Neural2-F", - gender=Gender.FEMALE - ), - Participant( - name="HOST2", - language_code="en-US", - voice="en-US-Neural2-F", - gender=Gender.FEMALE - ) + Participant(name="HOST1", language_code="en-US", voice="en-US-Neural2-F", gender=Gender.FEMALE), + Participant(name="HOST2", language_code="en-US", voice="en-US-Neural2-F", gender=Gender.FEMALE), ] + + def test_tts_engine_initialization(): """Tests that TTSEngine initializes with correct voice configurations""" # Given/When diff --git a/gyandex/podgen/workflows/factory_test.py b/gyandex/podgen/workflows/factory_test.py new file mode 100644 index 0000000..82dc81c --- /dev/null +++ b/gyandex/podgen/workflows/factory_test.py @@ -0,0 +1,29 @@ +import pytest + +from ..config.schema import AlexandriaWorkflowConfig, PodcastConfig +from .alexandria import AlexandriaWorkflow +from .factory import get_workflow + + +def test_get_workflow_returns_alexandria(): + """Tests that get_workflow creates an AlexandriaWorkflow instance with correct config""" + # Given + workflow_config = AlexandriaWorkflowConfig.model_construct(name="alexandria") + config = PodcastConfig.model_construct(workflow=workflow_config) + + # When + workflow = get_workflow(config) + + # Then + assert isinstance(workflow, AlexandriaWorkflow) + + +def test_get_workflow_raises_for_unsupported_workflow(): + """Tests that get_workflow raises NotImplementedError for unsupported workflows""" + # Given + workflow_config = AlexandriaWorkflowConfig.model_construct(name="unsupported") + config = PodcastConfig.model_construct(workflow=workflow_config) + + # When/Then + with pytest.raises(NotImplementedError, match="Unsupported workflow: unsupported"): + get_workflow(config) From fd3835e3bb391b300a2426e6a53139e78251e1cb Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Sat, 14 Dec 2024 12:38:34 +0530 Subject: [PATCH 06/10] separate pytest fixtures --- conftest.py | 5 +++ gyandex/cli/podgen_test.py | 31 ++++++++++++++ gyandex/podgen/engine/publisher_test.py | 38 +---------------- gyandex/podgen/engine/test_fixtures.py | 39 ++++++++++++++++++ gyandex/podgen/feed/models_test.py | 53 +----------------------- gyandex/podgen/feed/test_fixtures.py | 54 +++++++++++++++++++++++++ gyandex/podgen/storage/s3_test.py | 51 +---------------------- gyandex/podgen/storage/test_fixtures.py | 54 +++++++++++++++++++++++++ 8 files changed, 186 insertions(+), 139 deletions(-) create mode 100644 conftest.py create mode 100644 gyandex/cli/podgen_test.py create mode 100644 gyandex/podgen/engine/test_fixtures.py create mode 100644 gyandex/podgen/feed/test_fixtures.py create mode 100644 gyandex/podgen/storage/test_fixtures.py diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..4278925 --- /dev/null +++ b/conftest.py @@ -0,0 +1,5 @@ +pytest_plugins = [ + "gyandex.podgen.storage.test_fixtures", + "gyandex.podgen.feed.test_fixtures", + "gyandex.podgen.engine.test_fixtures", +] diff --git a/gyandex/cli/podgen_test.py b/gyandex/cli/podgen_test.py new file mode 100644 index 0000000..ceb7cc6 --- /dev/null +++ b/gyandex/cli/podgen_test.py @@ -0,0 +1,31 @@ +from 
unittest.mock import Mock, patch + +import pytest + +from gyandex.cli.podgen import main + + +def test_cli_help_command(): + """Tests that help command prints help message and exits""" + # When + with ( + patch("argparse.ArgumentParser.parse_args", return_value=Mock(config_path="--help")), + patch("argparse.ArgumentParser.print_help") as mock_help, + ): + main() + + # Then + mock_help.assert_called_once() + + +def test_invalid_config_path(): + """Tests handling of invalid configuration file path""" + # Given + invalid_path = "nonexistent.yaml" + + # When/Then + with ( + pytest.raises(FileNotFoundError), + patch("argparse.ArgumentParser.parse_args", return_value=Mock(config_path=invalid_path)), + ): + main() diff --git a/gyandex/podgen/engine/publisher_test.py b/gyandex/podgen/engine/publisher_test.py index 76cafa8..3e53f64 100644 --- a/gyandex/podgen/engine/publisher_test.py +++ b/gyandex/podgen/engine/publisher_test.py @@ -1,42 +1,6 @@ -from unittest.mock import Mock, patch - import pytest -from ..storage.s3 import S3CompatibleStorage -from .publisher import PodcastMetadata, PodcastPublisher - - -@pytest.fixture -def mock_storage(): - return Mock(spec=S3CompatibleStorage) - - -@pytest.fixture -def orchestrator(mock_storage, test_db): - return PodcastPublisher( - storage=mock_storage, - db=test_db, - base_url="https://example.com", - audio_prefix="episodes", - feed_prefix="feeds", - ) - - -@pytest.fixture -def sample_audio(tmp_path): - audio_path = tmp_path / "test.mp3" - audio_path.write_bytes(b"fake mp3 content") - return str(audio_path) - - -@pytest.fixture -def mock_mutagen(): - with patch("mutagen.File") as mock_file: - mock_audio = Mock() - mock_audio.info.length = 300 - mock_audio.mime = ["audio/mpeg"] - mock_file.return_value = mock_audio - yield mock_file +from .publisher import PodcastMetadata def test_create_feed(orchestrator, mock_storage): diff --git a/gyandex/podgen/engine/test_fixtures.py b/gyandex/podgen/engine/test_fixtures.py new file mode 100644 index 0000000..f672fd6 --- /dev/null +++ b/gyandex/podgen/engine/test_fixtures.py @@ -0,0 +1,39 @@ +from unittest.mock import Mock, patch + +import pytest + +from ..storage.s3 import S3CompatibleStorage +from .publisher import PodcastPublisher + + +@pytest.fixture +def mock_storage(): + return Mock(spec=S3CompatibleStorage) + + +@pytest.fixture +def orchestrator(mock_storage, test_db): + return PodcastPublisher( + storage=mock_storage, + db=test_db, + base_url="https://example.com", + audio_prefix="episodes", + feed_prefix="feeds", + ) + + +@pytest.fixture +def sample_audio(tmp_path): + audio_path = tmp_path / "test.mp3" + audio_path.write_bytes(b"fake mp3 content") + return str(audio_path) + + +@pytest.fixture +def mock_mutagen(): + with patch("mutagen.File") as mock_file: + mock_audio = Mock() + mock_audio.info.length = 300 + mock_audio.mime = ["audio/mpeg"] + mock_file.return_value = mock_audio + yield mock_file diff --git a/gyandex/podgen/feed/models_test.py b/gyandex/podgen/feed/models_test.py index 2213715..a080ea7 100644 --- a/gyandex/podgen/feed/models_test.py +++ b/gyandex/podgen/feed/models_test.py @@ -1,57 +1,6 @@ -import os - import pytest -from .models import Feed, PodcastDB - - -@pytest.fixture -def test_db(): - """Create a temporary test database""" - db_path = "test_podcast.db" - db = PodcastDB(db_path) - yield db - os.remove(db_path) - - -@pytest.fixture -def db_session(test_db): - """Create a database session for testing""" - Session = test_db.session - with Session() as session: - yield session - - 
-@pytest.fixture -def sample_feed_data(): - """Sample feed data for testing""" - return { - "slug": "test-podcast", - "title": "Test Podcast", - "description": "A test podcast", - "author": "Test Author", - "email": "test@example.com", - "website": "https://example.com", - "language": "en", - "copyright": "2024 Test Author", - "categories": "Technology,Education", - "explicit": "no", - } - - -@pytest.fixture -def sample_episode_data(): - """Sample episode data for testing""" - return { - "title": "Test Episode", - "description": "A test episode", - "audio_url": "https://example.com/episode1.mp3", - "guid": "episode-1", - "duration": 1800, - "file_size": 15000000, - "mime_type": "audio/mpeg", - "episode_type": "full", - } +from .models import Feed # Database Tests diff --git a/gyandex/podgen/feed/test_fixtures.py b/gyandex/podgen/feed/test_fixtures.py new file mode 100644 index 0000000..e117bbf --- /dev/null +++ b/gyandex/podgen/feed/test_fixtures.py @@ -0,0 +1,54 @@ +import os + +import pytest + +from .models import PodcastDB + + +@pytest.fixture +def test_db(): + """Create a temporary test database""" + db_path = "test_podcast.db" + db = PodcastDB(db_path) + yield db + os.remove(db_path) + + +@pytest.fixture +def db_session(test_db): + """Create a database session for testing""" + Session = test_db.session + with Session() as session: + yield session + + +@pytest.fixture +def sample_feed_data(): + """Sample feed data for testing""" + return { + "slug": "test-podcast", + "title": "Test Podcast", + "description": "A test podcast", + "author": "Test Author", + "email": "test@example.com", + "website": "https://example.com", + "language": "en", + "copyright": "2024 Test Author", + "categories": "Technology,Education", + "explicit": "no", + } + + +@pytest.fixture +def sample_episode_data(): + """Sample episode data for testing""" + return { + "title": "Test Episode", + "description": "A test episode", + "audio_url": "https://example.com/episode1.mp3", + "guid": "episode-1", + "duration": 1800, + "file_size": 15000000, + "mime_type": "audio/mpeg", + "episode_type": "full", + } diff --git a/gyandex/podgen/storage/s3_test.py b/gyandex/podgen/storage/s3_test.py index e9afca0..0ba8105 100644 --- a/gyandex/podgen/storage/s3_test.py +++ b/gyandex/podgen/storage/s3_test.py @@ -1,4 +1,4 @@ -from unittest.mock import ANY, Mock, patch +from unittest.mock import ANY, Mock import pytest from botocore.exceptions import ClientError @@ -6,55 +6,6 @@ from .s3 import S3CompatibleStorage -@pytest.fixture -def mock_s3_factory(): - with patch("boto3.client") as mock_client: - # Create a mock client instance - client = Mock() - mock_client.return_value = client - - # Mock the meta attributes - client.meta.endpoint_url = None - client.meta.region_name = "us-east-1" - - yield mock_client, client - - -@pytest.fixture -def mock_s3_storage(mock_s3_factory): - mock_client, _ = mock_s3_factory - return mock_client - - -@pytest.fixture -def mock_s3_client(mock_s3_factory): - _, client = mock_s3_factory - return client - - -@pytest.fixture -def storage(mock_s3_client): - return S3CompatibleStorage( - bucket="test-bucket", - access_key_id="test-key", - secret_access_key="test-secret", - region_name="us-east-1", - ) - - -@pytest.fixture -def r2_storage(mock_s3_client): - # Mock R2 endpoint - mock_s3_client.meta.endpoint_url = "https://test.r2.cloudflarestorage.com" - return S3CompatibleStorage( - bucket="test-bucket", - access_key_id="test-key", - secret_access_key="test-secret", - 
endpoint_url="https://test.r2.cloudflarestorage.com", - region_name="auto", - ) - - def test_initialization(mock_s3_storage): """Test storage initialization with different configurations""" # Test AWS S3 initialization diff --git a/gyandex/podgen/storage/test_fixtures.py b/gyandex/podgen/storage/test_fixtures.py new file mode 100644 index 0000000..7910be7 --- /dev/null +++ b/gyandex/podgen/storage/test_fixtures.py @@ -0,0 +1,54 @@ +from unittest.mock import Mock, patch + +import pytest + +from .s3 import S3CompatibleStorage + + +@pytest.fixture +def mock_s3_factory(): + with patch("boto3.client") as mock_client: + # Create a mock client instance + client = Mock() + mock_client.return_value = client + + # Mock the meta attributes + client.meta.endpoint_url = None + client.meta.region_name = "us-east-1" + + yield mock_client, client + + +@pytest.fixture +def mock_s3_storage(mock_s3_factory): + mock_client, _ = mock_s3_factory + return mock_client + + +@pytest.fixture +def mock_s3_client(mock_s3_factory): + _, client = mock_s3_factory + return client + + +@pytest.fixture +def storage(mock_s3_client): + return S3CompatibleStorage( + bucket="test-bucket", + access_key_id="test-key", + secret_access_key="test-secret", + region_name="us-east-1", + ) + + +@pytest.fixture +def r2_storage(mock_s3_client): + # Mock R2 endpoint + mock_s3_client.meta.endpoint_url = "https://test.r2.cloudflarestorage.com" + return S3CompatibleStorage( + bucket="test-bucket", + access_key_id="test-key", + secret_access_key="test-secret", + endpoint_url="https://test.r2.cloudflarestorage.com", + region_name="auto", + ) From 04794d9165979d954a4f31b882d68c11070ab01b Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Sat, 14 Dec 2024 12:56:52 +0530 Subject: [PATCH 07/10] Add checks in CI --- .github/workflows/ci.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..574d03b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: CI Quality checks + +on: + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install Poetry + uses: snok/install-poetry@v1 + + - name: Install just + uses: extractions/setup-just@v1 + + - name: Install dependencies + run: poetry install + + - name: Run tests + run: poetry run just test + + - name: Run checks + run: poetry run just check From 33869e66e4bc7dcf45a45597ea27aa94a7974cf7 Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Sat, 14 Dec 2024 13:18:55 +0530 Subject: [PATCH 08/10] add google cloud authentication in CI --- .github/workflows/ci.yml | 31 ++++++++++++++++++++++++++----- README.md | 6 ------ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 574d03b..3a5d4ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: CI Quality checks +name: Quality Checks on: pull_request: @@ -9,22 +9,43 @@ jobs: steps: - uses: actions/checkout@v4 + - uses: 'google-github-actions/auth@v2' + with: + credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.11' + cache: 'pip' - name: Install Poetry uses: snok/install-poetry@v1 + with: 
+ version: latest + + - name: Setup Poetry cache + uses: actions/cache@v3 + with: + path: ./.venv + key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }} - name: Install just uses: extractions/setup-just@v1 - - name: Install dependencies - run: poetry install + - name: Configure Poetry + run: | + poetry config virtualenvs.in-project true + poetry config virtualenvs.create true - - name: Run tests - run: poetry run just test + - name: Install dependencies + run: poetry install --no-interaction - name: Run checks run: poetry run just check + + - name: Run tests + run: poetry run just test diff --git a/README.md b/README.md index 2072f9f..ca77596 100644 --- a/README.md +++ b/README.md @@ -27,12 +27,6 @@ poetry install - Add `GOOGLE_API_KEY` in `.env` by generating the [API key from Google AI Studio](https://aistudio.google.com/app/apikey) - Login to Google Cloud using [the following instructions](https://cloud.google.com/text-to-speech/docs/create-audio-text-client-libraries) -## Running the Application -```shell -jupyter notebook -``` -Run the `main.ipynb` notebook - ## License This project is licensed under the **AGPL v3** for open-source use. For those wishing to use the software in proprietary applications without disclosing source code, a **commercial license** is available. From ceea573aa119dd9dcfbabbbb29bbaacb42c42588 Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Sat, 14 Dec 2024 13:42:50 +0530 Subject: [PATCH 09/10] Setup precommit hooks --- .pre-commit-config.yaml | 16 +++++++ poetry.lock | 94 ++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 + 3 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..68d2a26 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: + - repo: local + hooks: + - id: ruff-check + name: Run ruff check + entry: poetry run just check + language: system + pass_filenames: false + stages: [pre-commit] + + - id: pytest + name: Run pytest + entry: poetry run just test + language: system + pass_filenames: false + stages: [pre-commit] \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 5cb7900..7e61698 100644 --- a/poetry.lock +++ b/poetry.lock @@ -493,6 +493,17 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "cfgv" +version = "3.4.0" +description = "Validate configuration and produce human readable error messages." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, +] + [[package]] name = "charset-normalizer" version = "3.4.0" @@ -781,6 +792,17 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "distlib" +version = "0.3.9" +description = "Distribution utilities" +optional = false +python-versions = "*" +files = [ + {file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"}, + {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, +] + [[package]] name = "executing" version = "2.1.0" @@ -1421,6 +1443,20 @@ files = [ {file = "httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f"}, ] +[[package]] +name = "identify" +version = "2.6.3" +description = "File identification library for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "identify-2.6.3-py2.py3-none-any.whl", hash = "sha256:9edba65473324c2ea9684b1f944fe3191db3345e50b6d04571d10ed164f8d7bd"}, + {file = "identify-2.6.3.tar.gz", hash = "sha256:62f5dae9b5fef52c84cc188514e9ea4f3f636b1d8799ab5ebc475471f9e47a02"}, +] + +[package.extras] +license = ["ukkonen"] + [[package]] name = "idna" version = "3.10" @@ -3121,6 +3157,24 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pre-commit" +version = "4.0.1" +description = "A framework for managing and maintaining multi-language pre-commit hooks." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "pre_commit-4.0.1-py2.py3-none-any.whl", hash = "sha256:efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878"}, + {file = "pre_commit-4.0.1.tar.gz", hash = "sha256:80905ac375958c0444c65e9cebebd948b3cdb518f335a091a670a89d652139d2"}, +] + +[package.dependencies] +cfgv = ">=2.0.0" +identify = ">=1.0.0" +nodeenv = ">=0.11.1" +pyyaml = ">=5.1" +virtualenv = ">=20.10.0" + [[package]] name = "prometheus-client" version = "0.21.0" @@ -3607,6 +3661,24 @@ pluggy = ">=1.5,<2" [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-asyncio" +version = "0.25.0" +description = "Pytest support for asyncio" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pytest_asyncio-0.25.0-py3-none-any.whl", hash = "sha256:db5432d18eac6b7e28b46dcd9b69921b55c3b1086e85febfe04e70b18d9e81b3"}, + {file = "pytest_asyncio-0.25.0.tar.gz", hash = "sha256:8c0610303c9e0442a5db8604505fc0f545456ba1528824842b37b4a626cbf609"}, +] + +[package.dependencies] +pytest = ">=8.2,<9" + +[package.extras] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1)"] +testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] + [[package]] name = "pytest-cov" version = "5.0.0" @@ -4725,6 +4797,26 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "virtualenv" +version = "20.28.0" +description = "Virtual Python Environment builder" +optional = false +python-versions = ">=3.8" +files = [ + {file = "virtualenv-20.28.0-py3-none-any.whl", hash = "sha256:23eae1b4516ecd610481eda647f3a7c09aea295055337331bb4e6892ecce47b0"}, + {file = "virtualenv-20.28.0.tar.gz", hash = "sha256:2c9c3262bb8e7b87ea801d715fae4495e6032450c71d2309be9550e7364049aa"}, +] + +[package.dependencies] +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<5" + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] + [[package]] name = "wcwidth" version = "0.2.13" @@ -4888,4 +4980,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "592f5a9a27d922b3d5940c94210b79988cd92b6a3b1a6422bd03959c953fbf24" +content-hash = "82c90397bf4c785f1ba7239299aca8f24327b4bd62047b736b909e847ffda535" diff --git a/pyproject.toml b/pyproject.toml index 6b70856..65097c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,8 @@ rust-just = "^1.36.0" responses = "^0.25.3" ruff = "^0.8.2" pyright = "^1.1.390" +pytest-asyncio = "^0.25.0" +pre-commit = "^4.0.1" [project.classifiers] license = "OSI Approved :: GNU Affero General Public License v3 (AGPLv3)" From 9af0aa38eb8cb445f9549318da431f546f8d184b Mon Sep 17 00:00:00 2001 From: Dhruv Baldawa Date: Sat, 14 Dec 2024 13:46:19 +0530 Subject: [PATCH 10/10] Fix flaky test --- .github/workflows/ci.yml | 2 +- gyandex/podgen/storage/s3_test.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 
3a5d4ed..1b9edc4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: version: latest - name: Setup Poetry cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ./.venv key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }} diff --git a/gyandex/podgen/storage/s3_test.py b/gyandex/podgen/storage/s3_test.py index 0ba8105..262ce28 100644 --- a/gyandex/podgen/storage/s3_test.py +++ b/gyandex/podgen/storage/s3_test.py @@ -134,7 +134,6 @@ def test_upload_file_content_type_guessing(storage, mock_s3_client, tmp_path): """Test content type guessing for different file types""" test_cases = [ ("test.mp3", "audio/mpeg"), - ("test.m4a", "audio/mp4a-latm"), ("test.wav", "audio/x-wav"), ("test.txt", "text/plain"), ("test.unknown", "application/octet-stream"),