Skip to content
This repository was archived by the owner on Apr 8, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 59 additions & 10 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
limit (int): The maximum number of videos to process.
api_client (YouTubeTranscriptApi, optional): An instance of YouTubeTranscriptApi. If None, a new instance will be created.
Returns:
List of dictionaries containing video_id, title, and transcript for each video with available transcripts.
List of dictionaries, each containing:
- video_id (str): The YouTube video ID
- title (str): The video title
- transcript (list[dict]): List of transcript segments, each with 'text' (str) and 'start' (float) keys
"""

logging.info(f"Using YouTube search URL: {url}")
Expand All @@ -137,7 +140,6 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
logging.info(f"Processing ({videos_processed + 1}/{limit}): {title} [{video_id}]")
videos_processed += 1

transcript_text = ""
try:
transcript_list_obj = transcript_api.list(video_id)

Expand All @@ -157,8 +159,8 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
# fetch() returns an iterable of snippet objects exposing .text, .start, and .duration attributes (not dicts)
fetched_transcript = transcript_obj.fetch()

# Combine the text parts into a single string, discarding timestamps for now
transcript_text = " ".join([item.text for item in fetched_transcript])
# Preserve transcript items with timestamps
transcript_items = [{"text": item.text, "start": item.start} for item in fetched_transcript]

except TranscriptsDisabled:
logging.info(f"Transcripts are disabled for video ID: {video_id}")
Expand All @@ -170,7 +172,7 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
logging.info(f"Error retrieving transcript for video ID: {video_id}: {str(e)}")
continue

results_data.append({"video_id": video_id, "title": title, "transcript": transcript_text})
results_data.append({"video_id": video_id, "title": title, "transcript": transcript_items})

Copilot AI Dec 30, 2025

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function docstring for get_recent_transcripts should be updated to reflect that the transcript field now contains structured data (list of dictionaries with 'text' and 'start' keys) rather than a plain text string. This is a breaking change in the return type that should be documented.

Copilot uses AI. Check for mistakes.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot please resolve

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated in commit 8352bfb. The docstring now documents that transcript is a list of dictionaries with 'text' (str) and 'start' (float) keys.


return results_data

Expand Down Expand Up @@ -222,8 +224,48 @@ def generate_newsletter_digest(json_data: list[dict], model: str = "gpt-5-mini-2
context_block += f"--- VIDEO {i} ---\n"
context_block += f"Title: {item['title']}\n"
context_block += f"Video ID: {item['video_id']}\n"
# Truncate very long transcripts if necessary (e.g., to 25k chars) to fit context
context_block += f"Transcript: {item['transcript'][:25000]}\n\n"

# Format transcript with timestamps
transcript_data = item['transcript']
if isinstance(transcript_data, list):
# New format: list of dicts with text and start time
segments = []
for segment in transcript_data:
# Validate segment structure
if not isinstance(segment, dict):
logging.warning("Skipping transcript segment with unexpected type: %r", type(segment))
continue

start = segment.get("start")
text = segment.get("text")
if start is None or text is None:
logging.warning("Skipping transcript segment missing 'start' or 'text': %r", segment)
continue

try:
timestamp_seconds = round(float(start))
except (TypeError, ValueError):
logging.warning("Skipping transcript segment with non-numeric 'start': %r", segment)
continue

segments.append(f"[{timestamp_seconds}s] {text} ")

transcript_formatted = "".join(segments)
# Truncate to avoid overly long prompts, matching old-format behavior
transcript_formatted = transcript_formatted[:25000]
context_block += f"Transcript (with timestamps in seconds): {transcript_formatted}\n\n"
else:
# Fallback for old format: plain text string (with type safety)
if isinstance(transcript_data, str):
safe_transcript = transcript_data
else:
logging.warning(
"Unexpected transcript_data type %s for video %s; coercing to string.",
type(transcript_data),
item.get("video_id"),
)
safe_transcript = "" if transcript_data is None else str(transcript_data)
context_block += f"Transcript: {safe_transcript[:25000]}\n\n"

# Define the System Prompt
system_prompt = (
Expand All @@ -233,7 +275,7 @@ def generate_newsletter_digest(json_data: list[dict], model: str = "gpt-5-mini-2

# Define the User Prompt
user_prompt = f"""
Here are the transcripts from the most recent videos.
Here are the transcripts from the most recent videos with timestamps.

Please write a Newsletter Digest in Markdown format.

Expand All @@ -249,10 +291,17 @@ def generate_newsletter_digest(json_data: list[dict], model: str = "gpt-5-mini-2
Link: [Watch on YouTube](https://www.youtube.com/watch?v=<Video ID>)
Key Takeaways:

- <Bullet 1: Specific, actionable detail>
- <Bullet 2: Specific, actionable detail>
- **[MM:SS](https://www.youtube.com/watch?v=<Video ID>&t=<seconds>s)** - <Bullet 1: Specific, actionable detail>
- **[MM:SS](https://www.youtube.com/watch?v=<Video ID>&t=<seconds>s)** - <Bullet 2: Specific, actionable detail>
... (Provide between 2 and 5 bullet points. Use fewer for short/simple videos, and more for dense/complex technical content.)

**IMPORTANT TIMESTAMP FORMATTING:**
- Each bullet point MUST start with a timestamp in the format [MM:SS] that links to that moment in the video.
- Convert the timestamp seconds from the transcript to MM:SS format (e.g., 125 seconds becomes 02:05).
- The timestamp should link to: https://www.youtube.com/watch?v=<Video ID>&t=<seconds>s
- Choose the timestamp that best represents when that specific takeaway is discussed in the video.
- Make the timestamp bold and followed by " - " before the bullet text.

**(IMPORTANT: You must leave a blank line between 'Key Takeaways:' and the first bullet point so the list renders correctly.)**
---

Expand Down
63 changes: 63 additions & 0 deletions tests/test_openai_digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,66 @@ def test_custom_model_parameter(self, mock_openai_class, monkeypatch):
# Check that the specific model was passed to the API
call_args = mock_client.chat.completions.create.call_args
assert call_args[1]["model"] == "gpt-4o-custom"

@patch("app.OpenAI")
def test_timestamp_integration(self, mock_openai_class, monkeypatch):
    """Check that timestamped transcript segments are rendered into the user prompt.

    A single video whose transcript is a list of ``{"text", "start"}``
    segments is passed to ``generate_newsletter_digest``; the messages handed
    to the mocked chat-completions ``create`` call are then inspected.
    """
    monkeypatch.setenv("OPENAI_API_KEY", "fake-test-key")

    # Canned completion so the function under test receives a well-formed reply.
    canned_reply = MagicMock()
    canned_reply.choices[0].message.content = "Success"
    create_mock = mock_openai_class.return_value.chat.completions.create
    create_mock.return_value = canned_reply

    # One video in the new structured format (segments with start offsets).
    video = {
        "title": "Test Video",
        "video_id": "abc123",
        "transcript": [
            {"text": "Hello world", "start": 0},
            {"text": "This is a test", "start": 10.5},
            {"text": "End of video", "start": 125.7},
        ],
    }

    generate_newsletter_digest([video])

    # The user prompt is the second message passed to create().
    prompt_text = create_mock.call_args.kwargs["messages"][1]["content"]

    # The prompt must carry the timestamp formatting instructions...
    assert "MM:SS" in prompt_text
    assert "timestamp" in prompt_text.lower()
    assert "&t=" in prompt_text

    # ...and every segment prefixed with its start time in whole seconds.
    # 10.5 is expected as 10 (round() sends ties to the even integer) and
    # 125.7 as 126.
    expected_segments = (
        "[0s] Hello world",
        "[10s] This is a test",
        "[126s] End of video",
    )
    for rendered in expected_segments:
        assert rendered in prompt_text

@patch("app.OpenAI")
def test_backward_compatibility_with_string_transcript(self, mock_openai_class, monkeypatch):
    """Check that a legacy plain-string transcript still reaches the prompt.

    Older data stored ``transcript`` as a single string rather than a list of
    segments; ``generate_newsletter_digest`` must accept it without raising
    and include the text in the user prompt.
    """
    monkeypatch.setenv("OPENAI_API_KEY", "fake-test-key")

    # Canned completion so the function under test receives a well-formed reply.
    canned_reply = MagicMock()
    canned_reply.choices[0].message.content = "Success"
    create_mock = mock_openai_class.return_value.chat.completions.create
    create_mock.return_value = canned_reply

    # Old format: the transcript is one plain string.
    legacy_video = {"title": "Test", "video_id": "123", "transcript": "This is plain text"}

    generate_newsletter_digest([legacy_video])

    # The user prompt is the second message passed to create(); the raw
    # transcript text must appear in it unchanged.
    prompt_text = create_mock.call_args.kwargs["messages"][1]["content"]
    assert "This is plain text" in prompt_text
6 changes: 3 additions & 3 deletions tests/test_transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_search_and_fetch_english_success(self, mock_scrapetube, mock_api_client
# Verify
assert len(results) == 2
assert results[0]["video_id"] == "vid_1"
assert results[0]["transcript"] == "Hello world"
assert results[0]["transcript"] == [{"text": "Hello world", "start": 0}]

# Check that we specifically looked for English
mock_api_client.list.assert_any_call("vid_1")
Expand All @@ -48,15 +48,15 @@ def test_search_fallback_language(self, mock_scrapetube, mock_api_client, mock_s
# Simulate: Fallback (Spanish) found via iterator
mock_spanish_transcript = MagicMock()
mock_spanish_transcript.language_code = "es"
mock_spanish_transcript.fetch.return_value = [SimpleNamespace(text="Hola mundo")]
mock_spanish_transcript.fetch.return_value = [SimpleNamespace(text="Hola mundo", start=0)]
mock_list.__iter__.return_value = iter([mock_spanish_transcript])

# Execute
results = get_recent_transcripts("test", limit=1, api_client=mock_api_client)

# Verify
assert len(results) == 1
assert results[0]["transcript"] == "Hola mundo"
assert results[0]["transcript"] == [{"text": "Hola mundo", "start": 0}]


class TestTranscriptsEdgeCases:
Expand Down