greenbrettmichael · greenbrettmichael · Dec 30, 2025 · Dec 27, 2025 · Dec 27, 2025 · Dec 27, 2025
diff --git a/app.py b/app.py
@@ -110,7 +110,10 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
         limit (int): The maximum number of videos to process.
         api_client (YouTubeTranscriptApi, optional): An instance of YouTubeTranscriptApi. If None, a new instance will be created.
     Returns:
-        List of dictionaries containing video_id, title, and transcript for each video with available transcripts.
+        List of dictionaries, each containing:
+            - video_id (str): The YouTube video ID
+            - title (str): The video title
+            - transcript (list[dict]): List of transcript segments, each with 'text' (str) and 'start' (float, in seconds) keys
     """
 
     logging.info(f"Using YouTube search URL: {url}")
@@ -137,7 +140,6 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
         logging.info(f"Processing ({videos_processed + 1}/{limit}): {title} [{video_id}]")
         videos_processed += 1
 
-        transcript_text = ""
         try:
             transcript_list_obj = transcript_api.list(video_id)
 
@@ -157,8 +159,8 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
             # fetch() returns a list of dictionaries with 'text', 'start', and 'duration'
             fetched_transcript = transcript_obj.fetch()
 
-            # Combine the text parts into a single string, discarding timestamps for now
-            transcript_text = " ".join([item.text for item in fetched_transcript])
+            # Preserve transcript items with timestamps
+            transcript_items = [{"text": item.text, "start": item.start} for item in fetched_transcript]
 
         except TranscriptsDisabled:
             logging.info(f"Transcripts are disabled for video ID: {video_id}")
@@ -170,7 +172,7 @@ def get_recent_transcripts(url: str, limit: int = 10, api_client: YouTubeTranscr
             logging.info(f"Error retrieving transcript for video ID: {video_id}: {str(e)}")
             continue
 
-        results_data.append({"video_id": video_id, "title": title, "transcript": transcript_text})
+        results_data.append({"video_id": video_id, "title": title, "transcript": transcript_items})
 
     return results_data
 
@@ -222,8 +224,48 @@ def generate_newsletter_digest(json_data: list[dict], model: str = "gpt-5-mini-2
         context_block += f"--- VIDEO {i} ---\n"
         context_block += f"Title: {item['title']}\n"
         context_block += f"Video ID: {item['video_id']}\n"
-        # Truncate very long transcripts if necessary (e.g., to 25k chars) to fit context
-        context_block += f"Transcript: {item['transcript'][:25000]}\n\n"
+
+        # Format transcript with timestamps
+        transcript_data = item['transcript']
+        if isinstance(transcript_data, list):
+            # New format: list of dicts with text and start time
+            segments = []
+            for segment in transcript_data:
+                # Validate segment structure
+                if not isinstance(segment, dict):
+                    logging.warning("Skipping transcript segment with unexpected type: %r", type(segment))
+                    continue
+
+                start = segment.get("start")
+                text = segment.get("text")
+                if start is None or text is None:
+                    logging.warning("Skipping transcript segment missing 'start' or 'text': %r", segment)
+                    continue
+
+                try:
+                    timestamp_seconds = round(float(start))
+                except (TypeError, ValueError):
+                    logging.warning("Skipping transcript segment with non-numeric 'start': %r", segment)
+                    continue
+
+                segments.append(f"[{timestamp_seconds}s] {text} ")
+
+            transcript_formatted = "".join(segments)
+            # Truncate to avoid overly long prompts, matching old-format behavior
+            transcript_formatted = transcript_formatted[:25000]
+            context_block += f"Transcript (with timestamps in seconds): {transcript_formatted}\n\n"
+        else:
+            # Fallback for old format: plain text string (with type safety)
+            if isinstance(transcript_data, str):
+                safe_transcript = transcript_data
+            else:
+                logging.warning(
+                    "Unexpected transcript_data type %s for video %s; coercing to string.",
+                    type(transcript_data),
+                    item.get("video_id"),
+                )
+                safe_transcript = "" if transcript_data is None else str(transcript_data)
+            context_block += f"Transcript: {safe_transcript[:25000]}\n\n"
 
     # Define the System Prompt
     system_prompt = (
@@ -233,7 +275,7 @@ def generate_newsletter_digest(json_data: list[dict], model: str = "gpt-5-mini-2
 
     # Define the User Prompt
     user_prompt = f"""
-    Here are the transcripts from the most recent videos.
+    Here are the transcripts from the most recent videos with timestamps.
 
     Please write a Newsletter Digest in Markdown format.
 
@@ -249,10 +291,17 @@ def generate_newsletter_digest(json_data: list[dict], model: str = "gpt-5-mini-2
     Link: [Watch on YouTube](https://www.youtube.com/watch?v=<Video ID>)
     Key Takeaways:
 
-    - <Bullet 1: Specific, actionable detail>
-    - <Bullet 2: Specific, actionable detail>
+    - **[MM:SS](https://www.youtube.com/watch?v=<Video ID>&t=<seconds>s)** - <Bullet 1: Specific, actionable detail>
+    - **[MM:SS](https://www.youtube.com/watch?v=<Video ID>&t=<seconds>s)** - <Bullet 2: Specific, actionable detail>
     ... (Provide between 2 and 5 bullet points. Use fewer for short/simple videos, and more for dense/complex technical content.)
 
+    **IMPORTANT TIMESTAMP FORMATTING:**
+    - Each bullet point MUST start with a timestamp in the format [MM:SS] that links to that moment in the video.
+    - Convert the timestamp seconds from the transcript to MM:SS format (e.g., 125 seconds becomes 02:05).
+    - The timestamp should link to: https://www.youtube.com/watch?v=<Video ID>&t=<seconds>s
+    - Choose the timestamp that best represents when that specific takeaway is discussed in the video.
+    - Make the timestamp bold and followed by " - " before the bullet text.
+
     **(IMPORTANT: You must leave a blank line between 'Key Takeaways:' and the first bullet point so the list renders correctly.)**
     ---
 

diff --git a/tests/test_openai_digest.py b/tests/test_openai_digest.py
@@ -103,3 +103,66 @@ def test_custom_model_parameter(self, mock_openai_class, monkeypatch):
         # Check that the specific model was passed to the API
         call_args = mock_client.chat.completions.create.call_args
         assert call_args[1]["model"] == "gpt-4o-custom"
+
+    @patch("app.OpenAI")
+    def test_timestamp_integration(self, mock_openai_class, monkeypatch):
+        """Test that list-format transcripts are rendered with per-segment timestamps in the user prompt."""
+        monkeypatch.setenv("OPENAI_API_KEY", "fake-test-key")
+
+        mock_client = mock_openai_class.return_value  # what calling the patched OpenAI(...) returns
+        mock_response = MagicMock()
+        mock_response.choices[0].message.content = "Success"
+        mock_client.chat.completions.create.return_value = mock_response
+
+        # New transcript format: a list of segment dicts with 'text' and 'start' (seconds)
+        fake_data = [
+            {
+                "title": "Test Video",
+                "video_id": "abc123",
+                "transcript": [
+                    {"text": "Hello world", "start": 0},
+                    {"text": "This is a test", "start": 10.5},  # exact half: exercises round-half-to-even
+                    {"text": "End of video", "start": 125.7},
+                ],
+            }
+        ]
+
+        generate_newsletter_digest(fake_data)
+
+        # Pull the user prompt out of the arguments the mocked create(...) was called with
+        call_args = mock_client.chat.completions.create.call_args
+        messages = call_args[1]["messages"]  # call_args[1] is the keyword-argument dict
+        user_prompt = messages[1]["content"]  # second message; treated here as the user prompt
+
+        # The prompt must carry the MM:SS / deep-link ("&t=") formatting instructions
+        assert "MM:SS" in user_prompt
+        assert "timestamp" in user_prompt.lower()
+        assert "&t=" in user_prompt
+
+        # Each segment must appear as "[<rounded seconds>s] <text>"
+        assert "[0s] Hello world" in user_prompt
+        assert "[10s] This is a test" in user_prompt  # round(10.5) == 10 (halves round to even)
+        assert "[126s] End of video" in user_prompt  # 125.7 rounds to 126
+
+    @patch("app.OpenAI")
+    def test_backward_compatibility_with_string_transcript(self, mock_openai_class, monkeypatch):
+        """Test that old format (string transcript) still works as fallback."""
+        monkeypatch.setenv("OPENAI_API_KEY", "fake-test-key")
+
+        mock_client = mock_openai_class.return_value  # what calling the patched OpenAI(...) returns
+        mock_response = MagicMock()
+        mock_response.choices[0].message.content = "Success"
+        mock_client.chat.completions.create.return_value = mock_response
+
+        # Legacy format: the transcript is one plain-text string with no timestamps
+        fake_data = [{"title": "Test", "video_id": "123", "transcript": "This is plain text"}]
+
+        generate_newsletter_digest(fake_data)
+
+        # Reaching this point means the call did not raise; now inspect the prompt it sent
+        call_args = mock_client.chat.completions.create.call_args
+        messages = call_args[1]["messages"]  # call_args[1] is the keyword-argument dict
+        user_prompt = messages[1]["content"]  # second message; treated here as the user prompt
+
+        # The string transcript must be passed through into the prompt text
+        assert "This is plain text" in user_prompt
diff --git a/tests/test_transcripts.py b/tests/test_transcripts.py
@@ -30,7 +30,7 @@ def test_search_and_fetch_english_success(self, mock_scrapetube, mock_api_client
         # Verify
         assert len(results) == 2
         assert results[0]["video_id"] == "vid_1"
-        assert results[0]["transcript"] == "Hello world"
+        assert results[0]["transcript"] == [{"text": "Hello world", "start": 0}]
 
         # Check that we specifically looked for English
         mock_api_client.list.assert_any_call("vid_1")
@@ -48,15 +48,15 @@ def test_search_fallback_language(self, mock_scrapetube, mock_api_client, mock_s
         # Simulate: Fallback (Spanish) found via iterator
         mock_spanish_transcript = MagicMock()
         mock_spanish_transcript.language_code = "es"
-        mock_spanish_transcript.fetch.return_value = [SimpleNamespace(text="Hola mundo")]
+        mock_spanish_transcript.fetch.return_value = [SimpleNamespace(text="Hola mundo", start=0)]
         mock_list.__iter__.return_value = iter([mock_spanish_transcript])
 
         # Execute
         results = get_recent_transcripts("test", limit=1, api_client=mock_api_client)
 
         # Verify
         assert len(results) == 1
-        assert results[0]["transcript"] == "Hola mundo"
+        assert results[0]["transcript"] == [{"text": "Hola mundo", "start": 0}]
 
 
 class TestTranscriptsEdgeCases: