diff --git a/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml b/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml index 0773a4f0..803acbfb 100644 --- a/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml +++ b/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml @@ -10,33 +10,23 @@ mc:Ignorable="d"> - - - - - - - - - - + + + + + - + + + + diff --git a/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml.cs b/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml.cs index d3396077..d7b62572 100644 --- a/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml.cs +++ b/AIDevGallery/Samples/Open Source Models/Embeddings/RetrievalAugmentedGeneration.xaml.cs @@ -14,7 +14,6 @@ using Microsoft.UI.Xaml.Navigation; using System; using System.Collections.Generic; -using System.Diagnostics; using System.Linq; using System.Text; using System.Threading; @@ -60,6 +59,7 @@ internal sealed partial class RetrievalAugmentedGeneration : BaseSamplePage private StorageFile? _pdfFile; private InMemoryRandomAccessStream? _inMemoryRandomAccessStream; private CancellationTokenSource? _cts; + private bool _isCancellable; private List? selectedPages; private int selectedPageIndex = -1; @@ -69,13 +69,10 @@ public class PdfPageData { [VectorStoreRecordKey] public required int Key { get; init; } - [VectorStoreRecordData] public required uint Page { get; init; } - [VectorStoreRecordData] public required string Text { get; init; } - [VectorStoreRecordVector(384, DistanceFunction.CosineSimilarity)] public required ReadOnlyMemory Vector { get; init; } } @@ -83,10 +80,7 @@ public class PdfPageData public RetrievalAugmentedGeneration() { this.InitializeComponent(); - this.Unloaded += (s, e) => - { - CleanUp(); - }; + this.Unloaded += (s, e) => CleanUp(); this.Loaded += (s, e) => Page_Loaded(); // } @@ -129,6 +123,201 @@ private void CleanUp() _cts = null; } + private async void IndexPDFButton_Click(object sender, RoutedEventArgs e) + { + if (_isCancellable) + { + _cts?.Cancel(); + _cts = null; + ToSelectState(); + return; + } + + if (_embeddings == null) + { + return; + } + + ToSelectingState(); + + _pdfFile = await SelectPDFFromFileSystem(); + if (_pdfFile == null) + { + ToSelectState(); + return; + } + + await IndexPDF(); + } + + private async Task IndexPDF() + { + if (_pdfFile == null || _embeddings == null) + { + return; + } + + ToIndexingState(); + _cts = new CancellationTokenSource(); + CancellationToken ct = _cts.Token; + +#pragma warning disable SKEXP0020 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + _vectorStore = new InMemoryVectorStore(); +#pragma warning restore SKEXP0020 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + _pdfPages = _vectorStore.GetCollection("pages"); + await _pdfPages.CreateCollectionIfNotExistsAsync(ct).ConfigureAwait(false); + int chunksProcessedCount = 0; + + try + { + await Task.Run( + async () => + { + using PdfDocument document = PdfDocument.Open(_pdfFile.Path); + foreach (var page in document.GetPages()) + { + string pageText = string.Join(" ", page.GetWords()); + + if(pageText == string.Empty) + { + continue; + } + + List<(string Text, uint Page)> pageChunks = SplitInChunks((pageText, (uint)page.Number), 512).ToList(); + int i = 0; + await foreach(var embedding in _embeddings.GenerateStreamingAsync(pageChunks.Select(c => c.Text), null, ct).ConfigureAwait(false)) + { + await _pdfPages.UpsertAsync( + new PdfPageData + { + Key = chunksProcessedCount, + Page = pageChunks[i].Page, + Text = pageChunks[i].Text, + Vector = embedding.Vector + }, + null, + ct).ConfigureAwait(false); + i++; + chunksProcessedCount++; + } + + DispatcherQueue.TryEnqueue(() => + { + UpdateProgress(page.Number, document.NumberOfPages); + }); + } + }, + ct); + + ct.ThrowIfCancellationRequested(); + } + catch (Exception ex) + { + ToSelectState(); + + if (ex is not OperationCanceledException) + { + PdfProblemTextBlock.Text = "We weren't able to read this PDF. Please try another."; + } + + return; + } + + if(chunksProcessedCount == 0) + { + ToSelectState(); + PdfProblemTextBlock.Text = "We weren't able to read this PDF. Please try another."; + return; + } + + DispatcherQueue.TryEnqueue(() => + { + ShowPDFPage.IsEnabled = true; + IndexPDFGrid.Visibility = Visibility.Collapsed; + ChatGrid.Visibility = Visibility.Visible; + SelectNewPDFButton.Visibility = Visibility.Visible; + }); + } + + private async Task DoRAG() + { + if (_embeddings == null || _chatClient == null || _pdfPages == null) + { + return; + } + + if (_cts != null) + { + _cts.Cancel(); + _cts = null; + AskSLMButton.Content = "Answer"; + return; + } + + selectedPageIndex = 0; + AskSLMButton.Content = "Cancel"; + SearchTextBox.IsEnabled = false; + _cts = new CancellationTokenSource(); + + const string systemPrompt = "You are a knowledgeable assistant specialized in answering questions based solely on information from specific PDF pages provided by the user. " + + "When responding, focus on delivering clear, accurate answers drawn only from the content in these pages, avoiding outside information or assumptions."; + + var searchPrompt = this.SearchTextBox.Text; + + // 4) Search the chunks using the user's prompt, with the same model used for indexing + var searchVector = await _embeddings.GenerateAsync([searchPrompt], null, _cts.Token); + var vectorSearchResults = await _pdfPages.VectorizedSearchAsync( + searchVector[0].Vector, + new VectorSearchOptions + { + Top = 5, + VectorPropertyName = nameof(PdfPageData.Vector) + }, + _cts.Token); + + var contents = vectorSearchResults.Results.ToBlockingEnumerable() + .Select(r => r.Record) + .DistinctBy(c => c.Page) + .OrderBy(c => c.Page); + + selectedPages = contents.Select(c => c.Page).ToList(); + + PagesUsedRun.Text = $"Using page(s) : {string.Join(", ", selectedPages)}"; + InformationSV.Visibility = Visibility.Visible; + + var pagesChunks = contents.GroupBy(c => c.Page) + .Select(g => $"Page {g.Key}: {string.Join(' ', g.Select(c => c.Text))}"); + + AnswerRun.Text = string.Empty; + var fullResult = string.Empty; + + await Task.Run( + async () => + { + await foreach (var partialResult in _chatClient.CompleteStreamingAsync( + [ + new ChatMessage(ChatRole.System, systemPrompt), + .. pagesChunks.Select(c => new ChatMessage(ChatRole.User, c)), + new ChatMessage(ChatRole.User, searchPrompt), + ], + _chatOptions, + _cts.Token)) + { + fullResult += partialResult; + DispatcherQueue.TryEnqueue(() => + { + AnswerRun.Text = fullResult; + }); + } + }, + _cts.Token); + + _cts = null; + + AskSLMButton.Content = "Answer"; + SearchTextBox.IsEnabled = true; + } + private void Grid_Loaded(object sender, RoutedEventArgs e) { searchTextBoxInitialText = SearchTextBox.Text; @@ -161,6 +350,7 @@ private async Task UpdatePdfImageAsync() PdfImage.Source = bitmapImage; PdfImageGrid.Visibility = Visibility.Visible; + SelectNewPDFButton.Visibility = Visibility.Collapsed; UpdatePreviousAndNextPageButtonEnabled(); }); } @@ -202,6 +392,7 @@ private async void ShowPDFPage_Click(object sender, RoutedEventArgs e) private void PdfImage_Tapped(object sender, TappedRoutedEventArgs e) { PdfImageGrid.Visibility = Visibility.Collapsed; + SelectNewPDFButton.Visibility = Visibility.Visible; } private async void PreviousPageButton_Click(object sender, RoutedEventArgs e) @@ -229,169 +420,7 @@ private async void NextPageButton_Click(object sender, RoutedEventArgs e) private void ClosePdfButton_Click(object sender, RoutedEventArgs e) { PdfImageGrid.Visibility = Visibility.Collapsed; - } - - private async void IndexPDFButton_Click(object sender, RoutedEventArgs e) - { - if (_embeddings == null) - { - return; - } - - IndexPDFButton.IsEnabled = false; - LoadPDFProgressRing.IsActive = true; - LoadPDFProgressRing.Visibility = Visibility.Visible; - PdfProblemTextBlock.Text = string.Empty; - IndexPDFText.Text = "Selecting PDF..."; - - var window = new Window(); - var hwnd = WinRT.Interop.WindowNative.GetWindowHandle(window); - - var picker = new FileOpenPicker(); - WinRT.Interop.InitializeWithWindow.Initialize(picker, hwnd); - - // Set the file type filter - picker.FileTypeFilter.Add(".pdf"); - - // Pick a file - _pdfFile = await picker.PickSingleFileAsync(); - if (_pdfFile == null) - { - IndexPDFButton.IsEnabled = true; - LoadPDFProgressRing.IsActive = false; - LoadPDFProgressRing.Visibility = Visibility.Collapsed; - IndexPDFProgressStackPanel.Visibility = Visibility.Collapsed; - return; - } - - IndexPDFText.Text = "Indexing PDF..."; - - var contents = new List<(string Text, uint Page)>(); - - // 1) Read the PDF file - using (PdfDocument document = PdfDocument.Open(_pdfFile.Path)) - { - foreach (var page in document.GetPages()) - { - var words = page.GetWords(); - var builder = string.Join(" ", words); - - var range = builder - .Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries) - .Where(x => !string.IsNullOrWhiteSpace(x)) - .Select(x => ((string Text, uint Page))(x, page.Number)); - - contents.AddRange(range); - } - } - - if (contents.Count == 0) - { - IndexPDFButton.IsEnabled = true; - LoadPDFProgressRing.IsActive = false; - LoadPDFProgressRing.Visibility = Visibility.Collapsed; - IndexPDFText.Text = "Select PDF"; - PdfProblemTextBlock.Text = "We weren't able to read this PDF. Please try another."; - return; - } - - // 2) Split the text into chunks to make sure they are - // smaller than what the Embeddings model supports - var maxLength = 1024 / 2; - List<(string Text, uint Page)> chunkedContents = []; - foreach (var content in contents) - { - chunkedContents.AddRange(SplitInChunks(content, maxLength)); - } - - contents = chunkedContents; - - IndexPDFProgressBar.Minimum = 0; - IndexPDFProgressBar.Maximum = contents.Count; - IndexPDFProgressBar.Value = 0; - - Stopwatch sw = Stopwatch.StartNew(); - - void UpdateProgress(float progress) - { - var elapsed = sw.Elapsed; - if (progress == 0) - { - progress = 0.0001f; - } - - var remaining = TimeSpan.FromSeconds((long)(elapsed.TotalSeconds / progress * (1 - progress) / 5) * 5); - - LoadPDFProgressRing.Value = progress * contents.Count; - IndexPDFText.Text = $"Indexing PDF... {progress:P0} ({remaining})"; - } - - if (_cts != null) - { - _cts.Cancel(); - _cts = null; - AskSLMButton.Content = "Answer"; - return; - } - - _cts = new CancellationTokenSource(); - - // 3) Index the chunks -#pragma warning disable SKEXP0020 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. - _vectorStore = new InMemoryVectorStore(); -#pragma warning restore SKEXP0020 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. - _pdfPages = _vectorStore.GetCollection("pages"); - await _pdfPages.CreateCollectionIfNotExistsAsync(_cts.Token).ConfigureAwait(false); - - var total = contents.Count; - try - { - int i = 0; - await foreach (var embedding in _embeddings.GenerateStreamingAsync(contents.Select(c => c.Text), null, _cts.Token).ConfigureAwait(false)) - { - await _pdfPages.UpsertAsync( - new PdfPageData - { - Key = i, - Page = contents[i].Page, - Text = contents[i].Text, - Vector = embedding.Vector - }, - null, - _cts.Token).ConfigureAwait(false); - DispatcherQueue.TryEnqueue(() => - { - UpdateProgress((float)i / total); - }); - i++; - } - } - catch (OperationCanceledException) - { - DispatcherQueue.TryEnqueue(() => - { - IndexPDFButton.IsEnabled = true; - LoadPDFProgressRing.IsActive = false; - LoadPDFProgressRing.Visibility = Visibility.Collapsed; - IndexPDFProgressStackPanel.Visibility = Visibility.Collapsed; - IndexPDFText.Text = "Select PDF"; - }); - - return; - } - finally - { - _cts = null; - } - - DispatcherQueue.TryEnqueue(() => - { - ShowPDFPage.IsEnabled = true; - IndexPDFText.Text = "Indexing PDF... Done!"; - IndexPDFProgressStackPanel.Visibility = Visibility.Collapsed; - IndexPDFGrid.Visibility = Visibility.Collapsed; - ChatGrid.Visibility = Visibility.Visible; - }); + SelectNewPDFButton.Visibility = Visibility.Visible; } private IEnumerable<(string Text, uint Page)> SplitInChunks((string Text, uint Page) input, int maxLength) @@ -442,6 +471,39 @@ await _pdfPages.UpsertAsync( } } + private void ToSelectState() + { + _pdfPages?.DeleteCollectionAsync(); + HideProgress(); + _isCancellable = false; + ShowPDFPage.IsEnabled = false; + SelectNewPDFButton.Visibility = Visibility.Collapsed; + IndexPDFGrid.Visibility = Visibility.Visible; + ChatGrid.Visibility = Visibility.Collapsed; + IndexPDFButton.IsEnabled = true; + LoadPDFProgressRing.IsActive = false; + LoadPDFProgressRing.Visibility = Visibility.Collapsed; + IndexPDFText.Text = "Select PDF"; + } + + private void ToSelectingState() + { + IndexPDFButton.IsEnabled = false; + LoadPDFProgressRing.IsActive = true; + LoadPDFProgressRing.Visibility = Visibility.Visible; + PdfProblemTextBlock.Text = string.Empty; + IndexPDFText.Text = "Selecting PDF..."; + } + + private void ToIndexingState() + { + IndexPDFButton.IsEnabled = true; + IndexPDFText.Text = "Cancel"; + _isCancellable = true; + ProgressPanel.Visibility = Visibility.Visible; + PdfProblemTextBlock.Text = string.Empty; + } + private async void AskSLMButton_Click(object sender, RoutedEventArgs e) { if (SearchTextBox.Text.Length > 0) @@ -458,82 +520,40 @@ private async void TextBox_KeyUp(object sender, KeyRoutedEventArgs e) } } - private async Task DoRAG() + private async Task SelectPDFFromFileSystem() { - if (_embeddings == null || _chatClient == null || _pdfPages == null) - { - return; - } + var window = new Window(); + var hwnd = WinRT.Interop.WindowNative.GetWindowHandle(window); + var picker = new FileOpenPicker(); + WinRT.Interop.InitializeWithWindow.Initialize(picker, hwnd); + picker.FileTypeFilter.Add(".pdf"); + return await picker.PickSingleFileAsync(); + } - if (_cts != null) + private async void SelectNewPDF_Click(object sender, RoutedEventArgs e) + { + StorageFile pdfFile = await SelectPDFFromFileSystem(); + if(pdfFile != null) { - _cts.Cancel(); - _cts = null; - AskSLMButton.Content = "Answer"; - return; + _pdfFile = pdfFile; + ToSelectState(); + await IndexPDF(); } + } - selectedPageIndex = 0; - AskSLMButton.Content = "Cancel"; - SearchTextBox.IsEnabled = false; - _cts = new CancellationTokenSource(); - - const string systemPrompt = "You are a knowledgeable assistant specialized in answering questions based solely on information from specific PDF pages provided by the user. " + - "When responding, focus on delivering clear, accurate answers drawn only from the content in these pages, avoiding outside information or assumptions."; - - var searchPrompt = this.SearchTextBox.Text; - - // 4) Search the chunks using the user's prompt, with the same model used for indexing - var searchVector = await _embeddings.GenerateAsync([searchPrompt], null, _cts.Token); - var vectorSearchResults = await _pdfPages.VectorizedSearchAsync( - searchVector[0].Vector, - new VectorSearchOptions - { - Top = 5, - VectorPropertyName = nameof(PdfPageData.Vector) - }, - _cts.Token); - - var contents = vectorSearchResults.Results.ToBlockingEnumerable() - .Select(r => r.Record) - .DistinctBy(c => c.Page) - .OrderBy(c => c.Page); - - selectedPages = contents.Select(c => c.Page).ToList(); - - PagesUsedRun.Text = $"Using page(s) : {string.Join(", ", selectedPages)}"; - InformationSV.Visibility = Visibility.Visible; - - var pagesChunks = contents.GroupBy(c => c.Page) - .Select(g => $"Page {g.Key}: {string.Join(' ', g.Select(c => c.Text))}"); - - AnswerRun.Text = string.Empty; - var fullResult = string.Empty; - - await Task.Run( - async () => - { - await foreach (var partialResult in _chatClient.CompleteStreamingAsync( - [ - new ChatMessage(ChatRole.System, systemPrompt), - .. pagesChunks.Select(c => new ChatMessage(ChatRole.User, c)), - new ChatMessage(ChatRole.User, searchPrompt), - ], - _chatOptions, - _cts.Token)) - { - fullResult += partialResult; - DispatcherQueue.TryEnqueue(() => - { - AnswerRun.Text = fullResult; - }); - } - }, - _cts.Token); + private void UpdateProgress(int currentPage, int totalPages) + { + int progressValue = (int)Math.Floor((float)currentPage / (float)totalPages * 100); + string progressString = $"Indexed {currentPage} of {totalPages} pages ({progressValue}%)"; - _cts = null; + IndexingProgressBar.Value = progressValue; + ProgressStatusTextBlock.Text = progressString; + } - AskSLMButton.Content = "Answer"; - SearchTextBox.IsEnabled = true; + private void HideProgress() + { + ProgressPanel.Visibility = Visibility.Collapsed; + IndexingProgressBar.Value = 0; + ProgressStatusTextBlock.Text = string.Empty; } } \ No newline at end of file