diff --git a/LLMs/RAG/late-chunking/.github/workflows/ci.yaml b/LLMs/RAG/late-chunking/.github/workflows/ci.yaml
new file mode 100644
index 0000000..b3dcce2
--- /dev/null
+++ b/LLMs/RAG/late-chunking/.github/workflows/ci.yaml
@@ -0,0 +1,31 @@
+name: Run Tests
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+  push:
+    branches:
+      - main
+
+env:
+  JINA_API_TOKEN: ${{ secrets.JINA_API_TOKEN }}
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[dev]
+
+      - name: Run tests
+        run: pytest tests
diff --git a/LLMs/RAG/late-chunking/README.md b/LLMs/RAG/late-chunking/README.md
new file mode 100644
index 0000000..eeabe45
--- /dev/null
+++ b/LLMs/RAG/late-chunking/README.md
@@ -0,0 +1,77 @@
+# Late Chunking of Short Chunks in Long-Context Embedding Models
+
+For many applications, encoding a whole text document into a single embedding representation is not useful. Many applications require retrieving smaller parts of the text, and dense vector-based information retrieval systems often perform better with smaller text segments because of the limited information capacity of embedding vectors.
+
+![img.png](img/rag.png)
+
+
+RAG (Retrieval-Augmented Generation) is one of the best-known applications that requires splitting document collections into smaller text chunks. These chunks are typically stored in a vector database with vector representations created by a text embedding model.
+At runtime, the same embedding model encodes a query text into a vector representation, which is used to identify relevant stored text chunks. These are then passed to a large language model (LLM), which synthesizes a response to the query based on the retrieved texts.
+
+## Context Problem
+
+
+This simple RAG approach is not without challenges. It handles long-distance contextual dependencies particularly poorly, i.e., cases where the relevant information is spread over multiple chunks and text segments taken out of context become useless.
+![img.png](img/context-problem.png)
+In the image above, one can see a Wikipedia article that is split into chunks of sentences.
+Phrases like "its" and "the city" refer to "Berlin", which is mentioned only in the first sentence, so it is harder for the embedding model to link these references to the correct entity and produce a high-quality embedding representation.
+
+
+For example, if we split a Wikipedia article into sentence-length segments, as in the example above, a RAG system might not be able to answer a query like "What is the population of Berlin?" The city name and the population never appear together in a single segment, and the segments lack any larger document context.
+An LLM that is presented with one of these segments therefore cannot resolve anaphoric references like "it" or "the city".
+
+## Context-Sensitive Chunking
+
+To overcome this problem, we take advantage of the long input sequences that recent embedding models like [`jina-embeddings-v2-base-en`](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) can process.
+These models support much longer input texts, for example, 8192 tokens for `jina-embeddings-v2-base-en`, or roughly ten standard pages of text. Text segments of this size are much less likely to have contextual dependencies that can only be resolved with a larger context.
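+As a rough illustration of this capacity, one can check how many tokens a document occupies before deciding whether it fits into a single forward pass. The snippet below is a minimal sketch; the input file name is a placeholder and not part of this repository:
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True
+)
+
+document = open('article.txt').read()  # placeholder input file
+n_tokens = len(tokenizer(document)['input_ids'])
+print(f'{n_tokens} tokens; fits into a single 8192-token pass: {n_tokens <= 8192}')
+```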
+However, we still need vector representations of much smaller chunks of text, in part because of the limited input sizes of LLMs but primarily because of the limited information capacity of short embedding vectors.
+
+![img.png](img/method.png)
+
+
+The simple encoding approach (as seen on the left side of the image above) chunks texts before processing them, using sentences, paragraphs, and maximum length limits to split text _a priori_, and then applies an embedding model to the resulting chunks.
+Late Chunking, instead, first applies the transformer part of the embedding model to the entire text, or to the largest part of it possible. This generates a vector representation for each token that encompasses textual information from the entire text.
+To generate a single embedding for a text, many embedding models apply _mean pooling_ to these token representations to output a single vector. Late Chunking instead applies mean pooling to smaller segments of this sequence of token vectors, producing embeddings for each chunk that take into account the entire text.
+
+## The Effect of Context-Sensitive Chunking
+
+This has immediately measurable, concrete effects on retrieval. For example, in the case of "the city" and "Berlin" in a Wikipedia article, the vectors representing "the city" contain information connecting it to the previous mention of "Berlin", making it a much better match for queries involving that city name.
+
+You can see this in the numerical results below, which compare the embedding of the string "Berlin" to various sentences from the article about Berlin. The column "Traditional Similarity" contains the similarity values using _a priori_ chunking, and "Late Chunking Similarity" the values with context-sensitive chunking.
+
+| Text                                                                                                                                   | Traditional Similarity | Late Chunking Similarity |
+|----------------------------------------------------------------------------------------------------------------------------------------|------------------------|--------------------------|
+| Berlin is the capital and largest city of Germany, both by area and by population.                                                    | 0.84862185             | 0.849546                 |
+| Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. | 0.7084338              | 0.82489026               |
+| The city is also one of the states of Germany, and is the third smallest state in the country in terms of area.                       | 0.7534553              | 0.84980094               |
+
+As you can see, the similarity scores for the first chunk, which contains "Berlin", are very close to each other.
+For the other two chunks they differ significantly, as late chunking dramatically improves matching on sentences that do not explicitly use the word "Berlin" but contain anaphoric references to it.
+
+## Evaluation on Retrieval Tasks
+
+
+To verify the effectiveness of this approach beyond a few toy examples, we tested it with some of the retrieval benchmarks from [BeIR](https://github.com/beir-cellar/beir).
+Those retrieval tasks consist of a query set, a corpus of text documents, and a QRels file that stores information about the IDs of documents that are relevant for each query.
+To identify the relevant documents for a query, one can chunk the documents, encode them into an embedding index, and determine for each query embedding the most similar chunks (kNN).
+As each chunk corresponds to a document, one can convert the kNN ranking of chunks into a kNN ranking of documents (for documents occurring multiple times in the ranking, only the first occurrence is retained).
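+The snippet below is a minimal sketch of this chunk-to-document conversion step. The `doc_id~chunk_index` ID format mirrors the convention used by the evaluation code in this repository; the function and variable names are illustrative:
+
+```python
+def chunk_ranking_to_doc_ranking(ranked_chunks):
+    """Collapse a ranked list of (chunk_id, score) pairs into a document
+    ranking, keeping only the first (best-scoring) chunk per document."""
+    doc_scores = {}
+    for chunk_id, score in ranked_chunks:
+        doc_id = chunk_id.rsplit('~', 1)[0]
+        if doc_id not in doc_scores:  # first occurrence of a document wins
+            doc_scores[doc_id] = score
+    return sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
+
+
+ranked_chunks = [('doc1~2', 0.91), ('doc3~0', 0.88), ('doc1~0', 0.75)]
+print(chunk_ranking_to_doc_ranking(ranked_chunks))
+# [('doc1', 0.91), ('doc3', 0.88)]
+```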
+After that, one can compare the resulting ranking with the ranking corresponding to the ground-truth QRels file and calculate retrieval metrics like nDCG@10.
+We ran this evaluation for various BeIR datasets with traditional chunking and our novel late chunking method.
+To split texts into chunks, we chose a straightforward method that splits the texts into chunks of 256 tokens each.
+Both the traditional and late chunking tests used the [jina-embeddings-v2-small-en](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) model.
+
+| Dataset   | AVG Document Length (characters) | Traditional Chunking (nDCG@10) | Late Chunking (nDCG@10) | No Chunking (nDCG@10) |
+|-----------|----------------------------------|--------------------------------|-------------------------|-----------------------|
+| SciFact   | 1498.4                           | 64.20%                         | **66.10%**              | 63.89%                |
+| TRECCOVID | 1116.7                           | 63.36%                         | 64.70%                  | **65.18%**            |
+| FiQA2018  | 767.2                            | 33.25%                         | **33.84%**              | 33.43%                |
+| NFCorpus  | 1589.8                           | 23.46%                         | 29.98%                  | **30.40%**            |
+| Quora     | 62.2                             | 87.19%                         | 87.19%                  | 87.19%                |
+
+In all cases except Quora, where the scores are unchanged, late chunking improved the score over traditional chunking. In some cases it also outperforms encoding the whole document into a single embedding, while for other datasets no chunking performs best; however, this is only an option if one does not need to rank individual chunks. One can also see that greater average document length correlates with greater improvement in the nDCG scores through late chunking.
+
+To reproduce the evaluation, you can install the dependencies with `pip install .` and run the following script for the tasks "SciFactChunked", "TRECCOVIDChunked", "FiQA2018Chunked", "NFCorpusChunked", and "QuoraChunked":
+
+```bash
+python3 run_chunked_eval.py --task-name {TASK_NAME}
+```
diff --git a/LLMs/RAG/late-chunking/chunked_pooling/__init__.py b/LLMs/RAG/late-chunking/chunked_pooling/__init__.py
new file mode 100644
index 0000000..e72500d
--- /dev/null
+++ b/LLMs/RAG/late-chunking/chunked_pooling/__init__.py
@@ -0,0 +1,56 @@
+def chunk_by_sentences(input_text: str, tokenizer: callable):
+    """
+    Split the input text into sentences using the tokenizer
+    :param input_text: The text snippet to split into sentences
+    :param tokenizer: The tokenizer to use
+    :return: A tuple containing the list of text chunks and their corresponding token spans
+    """
+    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
+    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
+    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
+    token_offsets = inputs['offset_mapping'][0]
+    token_ids = inputs['input_ids'][0]
+    chunk_positions = [
+        (i, int(start + 1))
+        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
+        if token_id == punctuation_mark_id
+        and (
+            token_offsets[i + 1][0] - token_offsets[i][1] > 0
+            or token_ids[i + 1] == sep_id
+        )
+    ]
+    chunks = [
+        input_text[x[1] : y[1]]
+        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
+    ]
+    span_annotations = [
+        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
+    ]
+    return chunks, span_annotations
+
+
+def chunked_pooling(
+    model_output: 'BatchEncoding', span_annotation: list, max_length=None
+):
+    """Apply mean pooling to the token embeddings of each annotated span (late chunking)."""
+    token_embeddings = model_output[0]
+    outputs = []
+    for embeddings, annotations in zip(token_embeddings, span_annotation):
+        if (
+            max_length is not None
+        ):  # remove annotations which go beyond the max-length of the model
+            annotations = [
+                (start, min(end, max_length - 1))
+                for (start, end) in annotations
if start < (max_length - 1) + ] + pooled_embeddings = [ + embeddings[start:end].sum(dim=0) / (end - start) + for start, end in annotations + if (end - start) >= 1 + ] + pooled_embeddings = [ + embedding.float().detach().cpu().numpy() for embedding in pooled_embeddings + ] + outputs.append(pooled_embeddings) + + return outputs diff --git a/LLMs/RAG/late-chunking/chunked_pooling/chunked_eval_tasks.py b/LLMs/RAG/late-chunking/chunked_pooling/chunked_eval_tasks.py new file mode 100644 index 0000000..8cd18fb --- /dev/null +++ b/LLMs/RAG/late-chunking/chunked_pooling/chunked_eval_tasks.py @@ -0,0 +1,616 @@ +import datasets +from mteb.abstasks.TaskMetadata import TaskMetadata + +from chunked_pooling.mteb_chunked_eval import AbsTaskChunkedRetrieval + + +class SciFactChunked(AbsTaskChunkedRetrieval): + metadata = TaskMetadata( + name='SciFactChunked', + dataset={ + 'path': 'mteb/scifact', + 'revision': '0228b52cf27578f30900b9e5271d331663a030d7', + 'name': 'SciFact', + }, + description=( + 'SciFact verifies scientific claims using evidence from the ' + 'research literature containing scientific paper abstracts.' + ), + reference='https://github.com/allenai/scifact', + type='Retrieval', + category='s2p', + eval_splits=['test'], + eval_langs=['eng-Latn'], + main_score='ndcg_at_10', + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + n_samples=None, + avg_character_length=None, + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class NarrativeQAChunked(AbsTaskChunkedRetrieval): + metadata = TaskMetadata( + name='NarrativeQAChunked', + dataset={ + 'path': 'narrativeqa', + 'revision': '2e643e7363944af1c33a652d1c87320d0871c4e4', + 'name': 'NarrativeQARetrieval', + }, + reference='https://metatext.io/datasets/narrativeqa', + description=( + 'NarrativeQA is a dataset for the task of question answering ' + 'on long narratives. It consists of realistic QA instances ' + 'collected from literature (fiction and non-fiction) ' + 'and movie scripts. 
' + ), + type='Retrieval', + category='s2p', + eval_splits=['test'], + eval_langs=['eng-Latn'], + main_score='ndcg_at_10', + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + n_samples=None, + avg_character_length=None, + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class NFCorpusChunked(AbsTaskChunkedRetrieval): + metadata = TaskMetadata( + name="NFCorpusChunked", + dataset={ + "path": "mteb/nfcorpus", + "revision": "ec0fa4fe99da2ff19ca1214b7966684033a58814", + 'name': 'NFCorpus', + }, + description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval", + reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + n_samples=None, + avg_character_length=None, + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class QuoraChunked(AbsTaskChunkedRetrieval): + metadata = TaskMetadata( + name="QuoraChunked", + dataset={ + "path": "mteb/quora", + "revision": "e4e08e0b7dbe3c8700f0daef558ff32256715259", + "name": "QuoraRetrieval", + }, + description=( + "QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a" + " question, find other (duplicate) questions." + ), + reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", + type="Retrieval", + category="s2s", + eval_splits=["dev", "test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + n_samples=None, + avg_character_length=None, + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class FiQA2018Chunked(AbsTaskChunkedRetrieval): + metadata = TaskMetadata( + name="FiQA2018Chunked", + description="Financial Opinion Mining and Question Answering", + reference="https://sites.google.com/view/fiqa/", + dataset={ + "path": "mteb/fiqa", + "revision": "27a168819829fe9bcd655c2df245fb19452e8e06", + 'name': 'FiQA2018', + }, + type="Retrieval", + category="s2p", + eval_splits=["train", "dev", "test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + n_samples=None, + avg_character_length=None, + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class TRECCOVIDChunked(AbsTaskChunkedRetrieval): + metadata = TaskMetadata( + name='TRECCOVIDChunked', + description=( + 'TRECCOVID is an ad-hoc search challenge based on the ' + 'COVID-19 dataset containing scientific articles ' + 'related to the COVID-19 pandemic.' 
+ ), + reference='https://ir.nist.gov/covidSubmit/index.html', + dataset={ + 'path': 'mteb/trec-covid', + 'revision': 'bb9466bac8153a0349341eb1b22e06409e78ef4e', + 'name': 'TRECCOVID', + }, + type='Retrieval', + category='s2p', + eval_splits=['test'], + eval_langs=['eng-Latn'], + main_score='ndcg_at_10', + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + n_samples=None, + avg_character_length=None, + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class LEMBWikimQARetrievalChunked(AbsTaskChunkedRetrieval): + """ + modified from https://github.com/embeddings-benchmark/mteb/blob/main/mteb/tasks/Retrieval/eng/LEMBWikimQARetrieval.py + """ + + _EVAL_SPLIT = "test" + + metadata = TaskMetadata( + name="LEMBWikimQARetrievalChunked", + dataset={ + "path": "dwzhu/LongEmbed", + "revision": "10039a580487dacecf79db69166e17ace3ede392", + "name": "LEMBWikimQARetrieval", + }, + reference="https://huggingface.co/datasets/dwzhu/LongEmbed", + description=("2wikimqa subset of dwzhu/LongEmbed dataset."), + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("1950-01-01", "2019-12-31"), + domains=None, + socioeconomic_status=None, + n_samples=None, + avg_character_length=None, + form=None, + text_creation=None, + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + @inproceedings{ho2020constructing, + title={Constructing A Multi-hop QA Dataset for Comprehensive Evaluation of Reasoning Steps}, + author={Ho, Xanh and Nguyen, Anh-Khoa Duong and Sugawara, Saku and Aizawa, Akiko}, + booktitle={Proceedings of the 28th International Conference on Computational Linguistics}, + pages={6609--6625}, + year={2020} + } + """, + descriptive_stats={ + "n_samples": {_EVAL_SPLIT: 500}, + "avg_character_length": { + "test": { + "average_document_length": 37445.60333333333, + "average_query_length": 67.57, + "num_documents": 300, + "num_queries": 300, + "average_relevant_docs_per_query": 1.0, + } + }, + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + dataset_dict = {**self.metadata.dataset} + dataset_dict['name'] = '2wikimqa' + + query_list = datasets.load_dataset(**dataset_dict)["queries"] + queries = {row["qid"]: row["text"] for row in query_list} + + corpus_list = datasets.load_dataset(**dataset_dict)["corpus"] + corpus = {row["doc_id"]: {"text": row["text"]} for row in corpus_list} + + qrels_list = datasets.load_dataset(**dataset_dict)["qrels"] + qrels = {row["qid"]: {row["doc_id"]: 1} for row in qrels_list} + + self.corpus = {self._EVAL_SPLIT: corpus} + self.queries = {self._EVAL_SPLIT: queries} + self.relevant_docs = {self._EVAL_SPLIT: qrels} + + self.data_loaded = True + + +class LEMBSummScreenFDRetrievalChunked(AbsTaskChunkedRetrieval): + """ + modified from https://github.com/embeddings-benchmark/mteb/blob/main/mteb/tasks/Retrieval/eng/LEMBWikimQARetrieval.py + """ + + _EVAL_SPLIT = "test" + + metadata = TaskMetadata( + name="LEMBSummScreenFDRetrievalChunked", + dataset={ + "path": "dwzhu/LongEmbed", + "revision": "10039a580487dacecf79db69166e17ace3ede392", + "name": "LEMBSummScreenFDRetrieval", + }, + reference="https://huggingface.co/datasets/dwzhu/LongEmbed", + description=("summ_screen_fd subset of 
dwzhu/LongEmbed dataset."), + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("1950-01-01", "2019-12-31"), + domains=None, + socioeconomic_status=None, + n_samples=None, + avg_character_length=None, + form=None, + text_creation=None, + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + @inproceedings{ho2020constructing, + title={Constructing A Multi-hop QA Dataset for Comprehensive Evaluation of Reasoning Steps}, + author={Ho, Xanh and Nguyen, Anh-Khoa Duong and Sugawara, Saku and Aizawa, Akiko}, + booktitle={Proceedings of the 28th International Conference on Computational Linguistics}, + pages={6609--6625}, + year={2020} + } + """, + descriptive_stats={ + "n_samples": {_EVAL_SPLIT: 500}, + "avg_character_length": { + "test": { + "average_document_length": 30854.327, + "average_query_length": 591.49, + "num_documents": 300, + "num_queries": 300, + "average_relevant_docs_per_query": 1.0, + } + }, + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + dataset_dict = {**self.metadata.dataset} + dataset_dict['name'] = 'summ_screen_fd' + + query_list = datasets.load_dataset(**dataset_dict)["queries"] + queries = {row["qid"]: row["text"] for row in query_list} + + corpus_list = datasets.load_dataset(**dataset_dict)["corpus"] + corpus = {row["doc_id"]: {"text": row["text"]} for row in corpus_list} + + qrels_list = datasets.load_dataset(**dataset_dict)["qrels"] + qrels = {row["qid"]: {row["doc_id"]: 1} for row in qrels_list} + + self.corpus = {self._EVAL_SPLIT: corpus} + self.queries = {self._EVAL_SPLIT: queries} + self.relevant_docs = {self._EVAL_SPLIT: qrels} + + self.data_loaded = True + + +class LEMBQMSumRetrievalChunked(AbsTaskChunkedRetrieval): + """ + modified from https://github.com/embeddings-benchmark/mteb/blob/main/mteb/tasks/Retrieval/eng/LEMBWikimQARetrieval.py + """ + + _EVAL_SPLIT = "test" + + metadata = TaskMetadata( + name="LEMBQMSumRetrievalChunked", + dataset={ + "path": "dwzhu/LongEmbed", + "revision": "10039a580487dacecf79db69166e17ace3ede392", + "name": "LEMBQMSumRetrieval", + }, + reference="https://huggingface.co/datasets/dwzhu/LongEmbed", + description=("qmsum subset of dwzhu/LongEmbed dataset."), + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("1950-01-01", "2019-12-31"), + domains=None, + socioeconomic_status=None, + n_samples=None, + avg_character_length=None, + form=None, + text_creation=None, + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + @inproceedings{ho2020constructing, + title={Constructing A Multi-hop QA Dataset for Comprehensive Evaluation of Reasoning Steps}, + author={Ho, Xanh and Nguyen, Anh-Khoa Duong and Sugawara, Saku and Aizawa, Akiko}, + booktitle={Proceedings of the 28th International Conference on Computational Linguistics}, + pages={6609--6625}, + year={2020} + } + """, + descriptive_stats={ + "n_samples": {_EVAL_SPLIT: 500}, + "avg_character_length": { + "test": { + "average_document_length": 53335.817, + "average_query_length": 433.50, + "num_documents": 300, + "num_queries": 300, + "average_relevant_docs_per_query": 1.0, + } + }, + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: 
+ return + + dataset_dict = {**self.metadata.dataset} + dataset_dict['name'] = 'qmsum' + + query_list = datasets.load_dataset(**dataset_dict)["queries"] + queries = {row["qid"]: row["text"] for row in query_list} + + corpus_list = datasets.load_dataset(**dataset_dict)["corpus"] + corpus = {row["doc_id"]: {"text": row["text"]} for row in corpus_list} + + qrels_list = datasets.load_dataset(**dataset_dict)["qrels"] + qrels = {row["qid"]: {row["doc_id"]: 1} for row in qrels_list} + + self.corpus = {self._EVAL_SPLIT: corpus} + self.queries = {self._EVAL_SPLIT: queries} + self.relevant_docs = {self._EVAL_SPLIT: qrels} + + self.data_loaded = True + + +class LEMBNeedleRetrievalChunked(AbsTaskChunkedRetrieval): + """ + modified from https://github.com/embeddings-benchmark/mteb/blob/main/mteb/tasks/Retrieval/eng/LEMBNeedleRetrieval.py + """ + + _EVAL_SPLIT = [ + "test_256", + "test_512", + "test_1024", + "test_2048", + "test_4096", + "test_8192", + "test_16384", + "test_32768", + ] + + metadata = TaskMetadata( + name="LEMBNeedleRetrievalChunked", + dataset={ + "path": "dwzhu/LongEmbed", + "revision": "6e346642246bfb4928c560ee08640dc84d074e8c", + "name": "needle", + }, + reference="https://huggingface.co/datasets/dwzhu/LongEmbed", + description=("needle subset of dwzhu/LongEmbed dataset."), + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=_EVAL_SPLIT, + eval_langs=["eng-Latn"], + main_score="ndcg_at_1", + date=("2000-01-01", "2023-12-31"), + domains=["Academic", "Blog", "Written"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + @article{zhu2024longembed, + title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, + author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, + journal={arXiv preprint arXiv:2404.12096}, + year={2024} + } + """, + descriptive_stats={ + "n_samples": { + "test_256": 150, + "test_512": 150, + "test_1024": 150, + "test_2048": 150, + "test_4096": 150, + "test_8192": 150, + "test_16384": 150, + "test_32768": 150, + }, + "avg_character_length": { + "test_256": { + "average_document_length": 1013.22, + "average_query_length": 60.48, + "num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + "test_512": { + "average_document_length": 2009.96, + "average_query_length": 57.3, + "num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + "test_1024": { + "average_document_length": 4069.9, + "average_query_length": 58.28, + "num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + "test_2048": { + "average_document_length": 8453.82, + "average_query_length": 59.92, + "num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + "test_4096": { + "average_document_length": 17395.8, + "average_query_length": 55.86, + "num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + "test_8192": { + "average_document_length": 35203.82, + "average_query_length": 59.6, + "num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + "test_16384": { + "average_document_length": 72054.8, + "average_query_length": 59.12, + "num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + "test_32768": { + "average_document_length": 141769.8, + "average_query_length": 58.34, + 
"num_documents": 100, + "num_queries": 50, + "average_relevant_docs_per_query": 1.0, + }, + }, + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = {} + self.queries = {} + self.relevant_docs = {} + + for split in self._EVAL_SPLIT: + context_length = int(split.split("_")[1]) + query_list = datasets.load_dataset(**self.metadata_dict["dataset"])[ + "queries" + ] # dict_keys(['qid', 'text']) + query_list = query_list.filter( + lambda x: x["context_length"] == context_length + ) + queries = {row["qid"]: row["text"] for row in query_list} + + corpus_list = datasets.load_dataset(**self.metadata_dict["dataset"])[ + "corpus" + ] # dict_keys(['doc_id', 'text']) + corpus_list = corpus_list.filter( + lambda x: x["context_length"] == context_length + ) + corpus = {row["doc_id"]: {"text": row["text"]} for row in corpus_list} + + qrels_list = datasets.load_dataset(**self.metadata_dict["dataset"])[ + "qrels" + ] # dict_keys(['qid', 'doc_id']) + qrels_list = qrels_list.filter( + lambda x: x["context_length"] == context_length + ) + qrels = {row["qid"]: {row["doc_id"]: 1} for row in qrels_list} + + self.corpus[split] = corpus + self.queries[split] = queries + self.relevant_docs[split] = qrels + + self.data_loaded = True diff --git a/LLMs/RAG/late-chunking/chunked_pooling/chunking.py b/LLMs/RAG/late-chunking/chunked_pooling/chunking.py new file mode 100644 index 0000000..4585aa7 --- /dev/null +++ b/LLMs/RAG/late-chunking/chunked_pooling/chunking.py @@ -0,0 +1,159 @@ +import bisect +import logging +from typing import Dict, List, Optional, Tuple, Union + +from llama_index.core.node_parser import SemanticSplitterNodeParser +from llama_index.core.schema import Document +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from transformers import AutoTokenizer + +# Set the logging level to WARNING to suppress INFO and DEBUG messages +logging.getLogger('sentence_transformers').setLevel(logging.WARNING) + +CHUNKING_STRATEGIES = ['semantic', 'fixed', 'sentences'] + + +class Chunker: + def __init__( + self, + chunking_strategy: str, + ): + if chunking_strategy not in CHUNKING_STRATEGIES: + raise ValueError("Unsupported chunking strategy: ", chunking_strategy) + self.chunking_strategy = chunking_strategy + self.embed_model = None + self.embedding_model_name = None + + def _setup_semantic_chunking(self, embedding_model_name): + if embedding_model_name: + self.embedding_model_name = embedding_model_name + + self.embed_model = HuggingFaceEmbedding( + model_name=self.embedding_model_name, + trust_remote_code=True, + embed_batch_size=1, + ) + self.splitter = SemanticSplitterNodeParser( + embed_model=self.embed_model, + show_progress=False, + ) + + def chunk_semantically( + self, + text: str, + tokenizer: 'AutoTokenizer', + embedding_model_name: Optional[str] = None, + ) -> List[Tuple[int, int]]: + if self.embed_model is None: + self._setup_semantic_chunking(embedding_model_name) + + # Get semantic nodes + nodes = [ + (node.start_char_idx, node.end_char_idx) + for node in self.splitter.get_nodes_from_documents( + [Document(text=text)], show_progress=False + ) + ] + + # Tokenize the entire text + tokens = tokenizer.encode_plus( + text, + return_offsets_mapping=True, + add_special_tokens=False, + padding=True, + truncation=True, + ) + token_offsets = tokens.offset_mapping + + chunk_spans = [] + + for char_start, char_end in nodes: + # Convert char indices to token indices + start_chunk_index = bisect.bisect_left( + [offset[0] for offset in token_offsets], char_start + 
) + end_chunk_index = bisect.bisect_right( + [offset[1] for offset in token_offsets], char_end + ) + + # Add the chunk span if it's within the tokenized text + if start_chunk_index < len(token_offsets) and end_chunk_index <= len( + token_offsets + ): + chunk_spans.append((start_chunk_index, end_chunk_index)) + else: + break + + return chunk_spans + + def chunk_by_tokens( + self, + text: str, + chunk_size: int, + tokenizer: 'AutoTokenizer', + ) -> List[Tuple[int, int, int]]: + tokens = tokenizer.encode_plus( + text, return_offsets_mapping=True, add_special_tokens=False + ) + token_offsets = tokens.offset_mapping + + chunk_spans = [] + for i in range(0, len(token_offsets), chunk_size): + chunk_end = min(i + chunk_size, len(token_offsets)) + if chunk_end - i > 0: + chunk_spans.append((i, chunk_end)) + + return chunk_spans + + def chunk_by_sentences( + self, + text: str, + n_sentences: int, + tokenizer: 'AutoTokenizer', + ) -> List[Tuple[int, int, int]]: + tokens = tokenizer.encode_plus( + text, return_offsets_mapping=True, add_special_tokens=False + ) + token_offsets = tokens.offset_mapping + + chunk_spans = [] + chunk_start = 0 + count_chunks = 0 + for i in range(0, len(token_offsets)): + if tokens.tokens(0)[i] in ('.', '!', '?') and ( + (len(tokens.tokens(0)) == i + 1) + or (tokens.token_to_chars(i).end != tokens.token_to_chars(i + 1).start) + ): + count_chunks += 1 + if count_chunks == n_sentences: + chunk_spans.append((chunk_start, i + 1)) + chunk_start = i + 1 + count_chunks = 0 + if len(tokens.tokens(0)) - chunk_start > 1: + chunk_spans.append((chunk_start, len(tokens.tokens(0)))) + return chunk_spans + + def chunk( + self, + text: str, + tokenizer: 'AutoTokenizer', + chunking_strategy: str = None, + chunk_size: Optional[int] = None, + n_sentences: Optional[int] = None, + embedding_model_name: Optional[str] = None, + ): + chunking_strategy = chunking_strategy or self.chunking_strategy + if chunking_strategy == "semantic": + return self.chunk_semantically( + text, + embedding_model_name=embedding_model_name, + tokenizer=tokenizer, + ) + elif chunking_strategy == "fixed": + if chunk_size < 4: + raise ValueError("Chunk size must be >= 4.") + return self.chunk_by_tokens(text, chunk_size, tokenizer) + elif chunking_strategy == "sentences": + return self.chunk_by_sentences(text, n_sentences, tokenizer) + else: + raise ValueError("Unsupported chunking strategy") diff --git a/LLMs/RAG/late-chunking/chunked_pooling/mteb_chunked_eval.py b/LLMs/RAG/late-chunking/chunked_pooling/mteb_chunked_eval.py new file mode 100644 index 0000000..df5beb7 --- /dev/null +++ b/LLMs/RAG/late-chunking/chunked_pooling/mteb_chunked_eval.py @@ -0,0 +1,441 @@ +import logging +from typing import Any, Optional + +import numpy as np +import torch +from mteb.abstasks import AbsTask +from mteb.evaluation.evaluators import RetrievalEvaluator +from mteb.load_results.mteb_results import ScoresDict +from mteb.tasks import Retrieval +from tqdm import tqdm + +from chunked_pooling import chunked_pooling +from chunked_pooling.chunking import Chunker + +logger = logging.getLogger(__name__) + + +class AbsTaskChunkedRetrieval(AbsTask): + def __init__( + self, + chunking_strategy: str = None, + chunked_pooling_enabled: bool = False, + tokenizer: Optional[Any] = None, + prune_size: Optional[int] = None, + chunk_size: Optional[int] = None, + n_sentences: Optional[int] = None, + model_has_instructions: bool = False, + embedding_model_name: Optional[str] = None, # for semantic chunking + truncate_max_length: Optional[int] = 8192, + 
long_late_chunking_embed_size: Optional[int] = 0, + long_late_chunking_overlap_size: Optional[int] = 512, + **kwargs, + ): + super().__init__(**kwargs) + try: + self.retrieval_task = getattr( + Retrieval, + self.metadata_dict['dataset'].get('name', None) + or self.metadata_dict.get('name'), + )() + except: + logger.warning('Could not initialize retrieval_task') + self.chunking_strategy = chunking_strategy + self.chunker = Chunker(self.chunking_strategy) + self.chunked_pooling_enabled = chunked_pooling_enabled + self.tokenizer = tokenizer + self.prune_size = prune_size + self.model_has_instructions = model_has_instructions + self.chunking_args = { + 'chunk_size': chunk_size, + 'n_sentences': n_sentences, + 'embedding_model_name': embedding_model_name, + } + self.truncate_max_length = ( + truncate_max_length if truncate_max_length > 0 else None + ) + + self.long_late_chunking_embed_size = long_late_chunking_embed_size + self.long_late_chunking_overlap_size = long_late_chunking_overlap_size + + def load_data(self, **kwargs): + self.retrieval_task.load_data(**kwargs) + self.corpus = self.retrieval_task.corpus + self.queries = self.retrieval_task.queries + self.relevant_docs = self.retrieval_task.relevant_docs + # prune dataset + if self.prune_size: + self.queries, self.corpus, self.relevant_docs = self._prune( + self.queries, self.corpus, self.relevant_docs, self.prune_size + ) + + def calculate_metadata_metrics(self): + self.retrieval_task.calculate_metadata_metrics() + + def evaluate( + self, model, split: str = "test", encode_kwargs: dict[str, Any] = {}, **kwargs + ) -> dict[str, ScoresDict]: + scores: dict[str, ScoresDict] = {} + hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"] + + for hf_subset in hf_subsets: + logger.info(f"Subset: {hf_subset}") + + if hf_subset == "default": + corpus, queries, relevant_docs = ( + self.corpus[split], + self.queries[split], + self.relevant_docs[split], + ) + else: + corpus, queries, relevant_docs = ( + self.corpus[hf_subset][split], + self.queries[hf_subset][split], + self.relevant_docs[hf_subset][split], + ) + + scores[hf_subset] = self._evaluate_monolingual( + model, + corpus, + queries, + relevant_docs, + hf_subset, + encode_kwargs=encode_kwargs, + **kwargs, + ) + + return scores + + def _truncate_documents(self, corpus): + for k, v in corpus.items(): + title_tokens = 0 + if 'title' in v: + tokens = self.tokenizer( + v['title'] + ' ', + return_offsets_mapping=True, + max_length=self.truncate_max_length, + ) + title_tokens = len(tokens.input_ids) + tokens = self.tokenizer( + v['text'], + return_offsets_mapping=True, + max_length=self.truncate_max_length - title_tokens, + ) + last_token_span = tokens.offset_mapping[-2] + v['text'] = v['text'][: last_token_span[1]] + return corpus + + def _embed_with_overlap(self, model, model_inputs): + len_tokens = len(model_inputs["input_ids"][0]) + + if len_tokens > self.long_late_chunking_embed_size: + indices = [] + for i in range( + 0, + len_tokens, + self.long_late_chunking_embed_size + - self.long_late_chunking_overlap_size, + ): + start = i + end = min(i + self.long_late_chunking_embed_size, len_tokens) + indices.append((start, end)) + else: + indices = [(0, len_tokens)] + + outputs = [] + for start, end in indices: + batch_inputs = {k: v[:, start:end] for k, v in model_inputs.items()} + + with torch.no_grad(): + model_output = model(**batch_inputs) + + if start > 0: + outputs.append( + model_output[0][:, self.long_late_chunking_overlap_size :] + ) + else: + 
outputs.append(model_output[0]) + + return torch.cat(outputs, dim=1).to(model.device) + + def _evaluate_monolingual( + self, + model, + corpus, + queries, + relevant_docs, + lang=None, + batch_size=1, + encode_kwargs=None, + **kwargs, + ): + if self.truncate_max_length: + corpus = self._truncate_documents(corpus) + # split corpus into chunks + if not self.chunked_pooling_enabled: + corpus = self._apply_chunking(corpus, self.tokenizer) + max_chunks = max([len(x) for x in corpus.values()]) + corpus = self._flatten_chunks(corpus) + k_values = self._calculate_k_values(max_chunks) + # determine the maximum number of documents to consider in a ranking + max_k = int(max(k_values) / max_chunks) + retriever = RetrievalEvaluator( + model, + k_values=k_values, + encode_kwargs=(encode_kwargs or dict()), + **kwargs, + ) + results = retriever(corpus, queries) + else: + query_ids = list(queries.keys()) + query_texts = [queries[k] for k in query_ids] + if hasattr(model, 'encode_queries'): + query_embs = model.encode_queries(query_texts) + else: + query_embs = model.encode(query_texts) + + corpus_ids = list(corpus.keys()) + corpus_texts = [ + ( + f"{corpus[k]['title']} {corpus[k]['text']}" + if 'title' in corpus[k] + else corpus[k]['text'] + ) + for k in corpus_ids + ] + + chunk_annotations = self._calculate_annotations(model, corpus_texts) + + corpus_embs = [] + with torch.no_grad(): + for inputs in tqdm( + self._batch_inputs( + list(zip(corpus_texts, chunk_annotations)), + batch_size=batch_size, + ), + total=(len(corpus_texts) // batch_size), + ): + if self.model_has_instructions: + instr = model.get_instructions()[1] + else: + instr = '' + text_inputs = [instr + x[0] for x in inputs] + annotations = [x[1] for x in inputs] + model_inputs = self.tokenizer( + text_inputs, + return_tensors='pt', + padding=True, + truncation=self.truncate_max_length is not None, + max_length=self.truncate_max_length, + ) + if model.device.type == 'cuda': + model_inputs = { + k: v.to(model.device) for k, v in model_inputs.items() + } + + if self.long_late_chunking_embed_size > 0: + model_outputs = self._embed_with_overlap(model, model_inputs) + output_embs = chunked_pooling( + [model_outputs], annotations, max_length=None + ) + else: # truncation + model_outputs = model(**model_inputs) + output_embs = chunked_pooling( + model_outputs, + annotations, + max_length=self.truncate_max_length, + ) + corpus_embs.extend(output_embs) + + max_chunks = max([len(x) for x in corpus_embs]) + k_values = self._calculate_k_values(max_chunks) + # determine the maximum number of documents to consider in a ranking + max_k = int(max(k_values) / max_chunks) + ( + chunk_id_list, + doc_to_chunk, + flattened_corpus_embs, + ) = self.flatten_corpus_embs(corpus_embs, corpus_ids) + similarity_matrix = np.dot(query_embs, flattened_corpus_embs.T) + results = self.get_results( + chunk_id_list, k_values, query_ids, similarity_matrix + ) + + doc_results = self.get_doc_results(results) + + ndcg, _map, recall, precision, _ = RetrievalEvaluator.evaluate( + relevant_docs, + doc_results, + [k for k in k_values if k <= max_k], + ignore_identical_ids=kwargs.get('ignore_identical_ids', True), + ) + mrr, _ = RetrievalEvaluator.evaluate_custom( + relevant_docs, + doc_results, + [k for k in k_values if k <= max_k], + 'mrr', + ) + scores = { + **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, + **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, + **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, + 
**{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, + **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()}, + } + self._add_main_score(scores) + return scores + + def _add_main_score(self, scores: ScoresDict) -> None: + scores["main_score"] = scores[self.metadata.main_score] + + def get_results(self, chunk_id_list, k_values, query_ids, similarity_matrix): + results = {} + for i, query_id in enumerate(query_ids): + query_results = {} + for idx, score in enumerate(similarity_matrix[i]): + chunk_id = chunk_id_list[idx] + query_results[chunk_id] = score + # Sort results by score and only keep the top k scores + sorted_query_results = dict( + sorted(query_results.items(), key=lambda item: item[1], reverse=True)[ + : max(k_values) + ] + ) + results[query_id] = sorted_query_results + return results + + def flatten_corpus_embs(self, corpus_embs, corpus_ids): + doc_to_chunk = {} + flattened_corpus_embs = [] + chunk_id_list = [] + for doc_id, emb in zip(corpus_ids, corpus_embs): + for i, chunk in enumerate(emb): + flattened_corpus_embs.append(chunk) + doc_to_chunk[f"{doc_id}~{i}"] = doc_id + chunk_id_list.append(f"{doc_id}~{i}") + flattened_corpus_embs = np.vstack(flattened_corpus_embs) + flattened_corpus_embs = self._normalize(flattened_corpus_embs) + return chunk_id_list, doc_to_chunk, flattened_corpus_embs + + @staticmethod + def get_doc_results(results): + doc_results = dict() + for q, result_chunks in results.items(): + docs = dict() + for c_id, score in result_chunks.items(): + d_id = '~'.join(c_id.split('~')[:-1]) + if (d_id not in docs) or (score > docs[d_id]): + docs[d_id] = float(score) + doc_results[q] = docs + return doc_results + + def _calculate_k_values(self, max_chunks): + k_values = [1, 3, 5, 10, 20] + n = 2 + while 10**n < 100 * max_chunks: + k_values.append(10**n) + n += 1 + return k_values + + def _apply_chunking(self, corpus, tokenizer): + chunked_corpus = dict() + for k, v in corpus.items(): + text = f"{v['title']} {v['text']}" if 'title' in v else v['text'] + current_doc = [] + chunk_annotations = self.chunker.chunk( + text, + tokenizer, + chunking_strategy=self.chunking_strategy, + **self.chunking_args, + ) + tokens = tokenizer.encode_plus(text, add_special_tokens=False) + for start_token_idx, end_token_idx in chunk_annotations: + text_chunk = tokenizer.decode( + tokens.encodings[0].ids[start_token_idx:end_token_idx] + ) + current_doc.append({'text': text_chunk}) + chunked_corpus[k] = current_doc + return chunked_corpus + + def _calculate_annotations(self, model, corpus_texts): + if self.model_has_instructions: + instr = model.get_instructions()[1] + instr_tokens = self.tokenizer(instr, add_special_tokens=False) + n_instruction_tokens = len(instr_tokens[0]) + else: + n_instruction_tokens = 0 + chunk_annotations = [ + self._extend_special_tokens( + self.chunker.chunk( + text, + self.tokenizer, + chunking_strategy=self.chunking_strategy, + **self.chunking_args, + ), + n_instruction_tokens=n_instruction_tokens, + ) + for text in corpus_texts + ] + return chunk_annotations + + @staticmethod + def _flatten_chunks(chunked_corpus): + flattened_corpus = dict() + for k, li in chunked_corpus.items(): + for i, c in enumerate(li): + flattened_corpus[f'{k}~{i}'] = c + + return flattened_corpus + + @staticmethod + def _normalize(x): + return x / np.linalg.norm(x, axis=1)[:, None] + + @staticmethod + def _batch_inputs(li, batch_size): + for i in range(0, len(li), batch_size): + yield li[i : i + batch_size] + + @staticmethod + def _extend_special_tokens( + 
annotations, n_instruction_tokens=0, include_prefix=True, include_sep=True + ): + """Extends the spans because of additional special tokens, e.g. the CLS token + which are not considered by the chunker. + """ + new_annotations = [] + for i in range(len(annotations)): + add_left_offset = 1 if (not include_prefix) or int(i > 0) else 0 + left_offset = 1 + n_instruction_tokens + left = ( + annotations[i][0] + add_left_offset * left_offset + ) # move everything by one for [CLS] + + add_sep = 1 if include_sep and ((i + 1) == len(annotations)) else 0 + right_offset = left_offset + add_sep + right = ( + annotations[i][1] + right_offset + ) # move everything by one for [CLS] and the last one for [SEP] + + new_annotations.append((left, right)) + return new_annotations + + @staticmethod + def _prune(queries, corpus, relevant_docs, prune_size): + new_queries = {'test': {}} + new_corpus = {'test': {}} + new_relevant_docs = {'test': {}} + for i, key in enumerate(relevant_docs['test']): + if i >= prune_size: + break + new_relevant_docs['test'][key] = relevant_docs['test'][key] + for x in relevant_docs['test'][key]: + new_corpus['test'][x] = corpus['test'][x] + new_queries['test'][key] = queries['test'][key] + return new_queries, new_corpus, new_relevant_docs + + def _calculate_metrics_from_split(*args, **kwargs): + pass + + def _evaluate_subset(*args, **kwargs): + pass diff --git a/LLMs/RAG/late-chunking/chunked_pooling/wrappers.py b/LLMs/RAG/late-chunking/chunked_pooling/wrappers.py new file mode 100644 index 0000000..e1ec46f --- /dev/null +++ b/LLMs/RAG/late-chunking/chunked_pooling/wrappers.py @@ -0,0 +1,167 @@ +import os +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from sentence_transformers import SentenceTransformer +from transformers import AutoModel +from transformers.modeling_outputs import BaseModelOutputWithPooling + + +def construct_document(doc): + if isinstance(doc, str): + return doc + elif 'title' in doc: + return f'{doc["title"]} {doc["text"].strip()}' + else: + return doc['text'].strip() + + +class JinaEmbeddingsV3Wrapper(nn.Module): + def __init__( + self, model_name, tasks=['retrieval.query', 'retrieval.passage'], **model_kwargs + ): + super().__init__() + self._model = AutoModel.from_pretrained( + model_name, trust_remote_code=True, **model_kwargs + ) + self.tasks = tasks + + def encode_queries( + self, + sentences: Union[str, List[str]], + *args, + task: Optional[str] = None, + **kwargs, + ): + return self._model.encode(sentences, *args, task=self.tasks[0], **kwargs) + + def encode_corpus( + self, + sentences: Union[str, List[str]], + *args, + **kwargs, + ): + _sentences = [construct_document(sentence) for sentence in sentences] + return self._model.encode(_sentences, *args, task=self.tasks[1], **kwargs) + + def get_instructions(self): + return [self._model._task_instructions[x] for x in self.tasks] + + def forward(self, *args, **kwargs): + task_id = self._model._adaptation_map[self.tasks[1]] + num_examples = kwargs['input_ids'].shape[0] + adapter_mask = torch.full( + (num_examples,), task_id, dtype=torch.int32, device=self._model.device + ) + return self._model.forward(*args, adapter_mask=adapter_mask, **kwargs) + + @property + def device(self): + return self._model.device + + @staticmethod + def has_instructions(): + return True + + +class NomicAIWrapper(nn.Module): + def __init__(self, model_name, **model_kwargs): + super().__init__() + self._model = SentenceTransformer( + model_name, trust_remote_code=True, **model_kwargs + ) + 
self.instructions = ['search_query: ', 'search_document: '] + + def get_instructions(self): + return self.instructions + + def forward(self, *args, **kwargs): + model_output = self._model.forward(kwargs) + base_model_output = BaseModelOutputWithPooling( + last_hidden_state=model_output['token_embeddings'], + pooler_output=model_output['sentence_embedding'], + attentions=model_output['attention_mask'], + ) + return base_model_output + + def encode_queries( + self, + sentences: Union[str, List[str]], + *args, + **kwargs, + ): + return self._model.encode( + [self.instructions[0] + s for s in sentences], *args, **kwargs + ) + + def encode_corpus( + self, + sentences: Union[str, List[str]], + *args, + **kwargs, + ): + return self._model.encode( + [self.instructions[1] + construct_document(s) for s in sentences], + *args, + **kwargs, + ) + + @property + def device(self): + return self._model.device + + @staticmethod + def has_instructions(): + return True + + +MODEL_WRAPPERS = { + 'jinaai/jina-embeddings-v3': JinaEmbeddingsV3Wrapper, + 'sentence-transformers/all-MiniLM-L6-v2': SentenceTransformer, + 'nomic-ai/nomic-embed-text-v1': NomicAIWrapper, +} + +MODELS_WITHOUT_PROMPT_NAME_ARG = [ + 'jinaai/jina-embeddings-v2-small-en', + 'jinaai/jina-embeddings-v2-base-en', + 'jinaai/jina-embeddings-v3', +] + + +def remove_unsupported_kwargs(original_encode): + def wrapper(self, *args, **kwargs): + # Remove 'prompt_name' from kwargs if present + kwargs.pop('prompt_name', None) + kwargs.pop('request_qid', None) + return original_encode(self, *args, **kwargs) + + return wrapper + + +def load_model(model_name, model_weights=None, **model_kwargs): + if model_name in MODEL_WRAPPERS: + model = MODEL_WRAPPERS[model_name](model_name, **model_kwargs) + if hasattr(MODEL_WRAPPERS[model_name], 'has_instructions'): + has_instructions = MODEL_WRAPPERS[model_name].has_instructions() + else: + has_instructions = False + else: + model = AutoModel.from_pretrained(model_name, trust_remote_code=True) + has_instructions = False + + if model_weights and os.path.exists(model_weights): + model._model.load_state_dict(torch.load(model_weights, device=model.device)) + + # encode functions of various models do not support all sentence transformers kwargs parameter + if model_name in MODELS_WITHOUT_PROMPT_NAME_ARG: + ENCODE_FUNC_NAMES = ['encode', 'encode_queries', 'encode_corpus'] + for func_name in ENCODE_FUNC_NAMES: + if hasattr(model, func_name): + setattr( + model, + func_name, + remove_unsupported_kwargs(getattr(model, func_name)), + ) + + return model, has_instructions diff --git a/LLMs/RAG/late-chunking/examples.ipynb b/LLMs/RAG/late-chunking/examples.ipynb new file mode 100644 index 0000000..78923d1 --- /dev/null +++ b/LLMs/RAG/late-chunking/examples.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e1173893c4f0ea56", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# Chunked Pooling\n", + "This notebooks explains how the chunked pooling can be implemented. 
First you need to install the requirements: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d02a920f-cde0-4035-9834-49b087aab5cc", + "metadata": { + "is_executing": true + }, + "outputs": [], + "source": [ + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "58a8fbc1e477db48", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "Then we load a model which we want to use for the embedding. We choose `jinaai/jina-embeddings-v2-base-en` but any other model which supports mean pooling is possible. However, models with a large maximum context-length are preferred." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1380abf7acde9517", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/michael/workspace/chunked-pooling/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from transformers import AutoModel\n", + "from transformers import AutoTokenizer\n", + "\n", + "from chunked_pooling import chunked_pooling, chunk_by_sentences\n", + "\n", + "# load model and tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)\n", + "model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2cc0c1162797ffb0", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "Now we define the text which we want to encode and split it into chunks. The `chunk_by_sentences` function also returns the span annotations. Those specify the number of tokens per chunk which is needed for the chunked pooling." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8ef392f3437ef82e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chunks:\n", + "- \"Berlin is the capital and largest city of Germany, both by area and by population.\"\n", + "- \" Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits.\"\n", + "- \" The city is also one of the states of Germany, and is the third smallest state in the country in terms of area.\"\n" + ] + } + ], + "source": [ + "input_text = \"Berlin is the capital and largest city of Germany, both by area and by population. Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. 
The city is also one of the states of Germany, and is the third smallest state in the country in terms of area.\"\n", + "\n", + "# determine chunks\n", + "chunks, span_annotations = chunk_by_sentences(input_text, tokenizer)\n", + "print('Chunks:\\n- \"' + '\"\\n- \"'.join(chunks) + '\"')\n" + ] + }, + { + "cell_type": "markdown", + "id": "9ac41fd1f0560da7", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "Now we encode the chunks with the traditional and the context-sensitive chunked pooling method:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "abe3d93b9e6609b9", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# chunk before\n", + "embeddings_traditional_chunking = model.encode(chunks)\n", + "\n", + "# chunk afterwards (context-sensitive chunked pooling)\n", + "inputs = tokenizer(input_text, return_tensors='pt')\n", + "model_output = model(**inputs)\n", + "embeddings = chunked_pooling(model_output, [span_annotations])[0]" + ] + }, + { + "cell_type": "markdown", + "id": "e84b1b9d48cb6367", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "Finally, we compare the similarity of the word \"Berlin\" with the chunks. The similarity should be higher for the context-sensitive chunked pooling method:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "da0cec59a3ece76", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "similarity_new(\"Berlin\", \"Berlin is the capital and largest city of Germany, both by area and by population.\"): 0.849546\n", + "similarity_trad(\"Berlin\", \"Berlin is the capital and largest city of Germany, both by area and by population.\"): 0.84862185\n", + "similarity_new(\"Berlin\", \" Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits.\"): 0.82489026\n", + "similarity_trad(\"Berlin\", \" Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits.\"): 0.7084338\n", + "similarity_new(\"Berlin\", \" The city is also one of the states of Germany, and is the third smallest state in the country in terms of area.\"): 0.84980094\n", + "similarity_trad(\"Berlin\", \" The city is also one of the states of Germany, and is the third smallest state in the country in terms of area.\"): 0.7534553\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))\n", + "\n", + "berlin_embedding = model.encode('Berlin')\n", + "\n", + "for chunk, new_embedding, trad_embeddings in zip(chunks, embeddings, embeddings_traditional_chunking):\n", + " print(f'similarity_new(\"Berlin\", \"{chunk}\"):', cos_sim(berlin_embedding, new_embedding))\n", + " print(f'similarity_trad(\"Berlin\", \"{chunk}\"):', cos_sim(berlin_embedding, trad_embeddings))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + 
} + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/LLMs/RAG/late-chunking/explanatory_contextual_retrieval.py b/LLMs/RAG/late-chunking/explanatory_contextual_retrieval.py new file mode 100644 index 0000000..269b518 --- /dev/null +++ b/LLMs/RAG/late-chunking/explanatory_contextual_retrieval.py @@ -0,0 +1,197 @@ +# experiments/explanatory_contextual_retrieval.py +# +# a simple example with a trivial piece of text to showcase the late chunking method against +# contextual retrieval method. contextual retrieval manually inserts context to each +# chunk, i.e. forces context to be around each chunk. so works as a good comparison +# to late chunking to see if the similarities are similar (which they appear to be) + +from chunked_pooling.wrappers import load_model +from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForCausalLM +import torch +import numpy as np + +import chunked_pooling +from chunked_pooling import chunked_pooling +from chunked_pooling.chunking import Chunker + +from typing import List, Tuple +from transformers import AutoModel, AutoTokenizer, pipeline + +import requests +import os + +def request_anthropic_api(prompt: str): + url = "https://api.anthropic.com/v1/messages" + headers = { + "x-api-key": os.getenv("ANTHROPIC_API_KEY"), + "anthropic-version": "2023-06-01", + "content-type": "application/json" + } + data = { + "model": "claude-3-haiku-20240307", + "max_tokens": 2048, + "messages": [ + {"role": "user", "content": prompt} + ] + } + response = requests.post(url, headers=headers, json=data) + return response.json()["content"][0]["text"] + +def setup_local_llm(llm_name): + + model = AutoModelForCausalLM.from_pretrained(llm_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(llm_name, trust_remote_code=True) + + def llm(prompt): + messages = [{"role": "user", "content": prompt}] + inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") + inputs = inputs.to(model.device) + outputs = model.generate(inputs, max_new_tokens=512) + text_output = tokenizer.batch_decode(outputs)[0] + if "<|assistant|>" in text_output: + text_output = text_output.split("<|assistant|>")[1].strip() + return text_output + + return llm + +def cosine_similarity(vector1, vector2): + vector1_norm = vector1 / np.linalg.norm(vector1) + vector2_norm = vector2 / np.linalg.norm(vector2) + return np.dot(vector1_norm, vector2_norm) + +class LateChunkingEmbedder: + + def __init__(self, + model: AutoModel, + tokenizer: AutoTokenizer, + chunking_strategy: str = "sentences", + n_sentences: int = 1 + ): + + self.model = model + self.tokenizer = tokenizer + + self.chunker = Chunker(chunking_strategy = chunking_strategy) + self.n_sentences = n_sentences + + + def run(self, document: str): + annotations = [self.chunker.chunk(text=document, tokenizer=self.tokenizer, n_sentences=self.n_sentences)] + model_inputs = self.tokenizer( + document, + return_tensors='pt', + padding=True, + truncation=True, + max_length=8192, + ) + model_outputs = self.model(**model_inputs) + self.output_embs = chunked_pooling( + model_outputs, annotations, max_length=8192, + )[0] + return self.output_embs + + def query(self, query: str): + if "output_embs" not in dir(self): + raise ValueError("no embeddings calculated, use .run(document) to create chunk embeddings") + query_embedding = self.model.encode(query) + similarities = [] + for emb in self.output_embs: + similarities.append(cosine_similarity(query_embedding, emb)) + + return similarities + + +class 
ContextualRetrievalEmbedder(): + def __init__(self, + model: AutoModel, + tokenizer: AutoTokenizer, + llm_name: str = "microsoft/Phi-3.5-mini-instruct", + chunking_strategy: str = "fixed" + ): + + self.llm = setup_local_llm(llm_name) + # self.llm = request_anthropic_api + + self.prompt = """ + + {{WHOLE_DOCUMENT}} + + Here is the chunk we want to situate within the whole document + + {{CHUNK_CONTENT}} + + Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else. + """.strip() + + self.model = model + self.tokenizer = tokenizer + + self.chunker = Chunker(chunking_strategy = chunking_strategy) + + + def _add_context(self, chunk: str, document: str): + prompt = self.prompt.replace("{{WHOLE_DOCUMENT}}", document).replace("{{CHUNK_CONTENT}}", chunk) + extra_context = self.llm(prompt) + return extra_context + " " + chunk + + def _tokens_to_text(self, text: str, annotations: List[Tuple[int, int]]): + tokens = self.tokenizer.encode_plus( + text, return_offsets_mapping=True, add_special_tokens=False + ) + token_offsets = tokens.offset_mapping + chunks = [] + for start, end in annotations: + chunk = text[token_offsets[start][0]:token_offsets[end-1][1]] + chunks.append(chunk) + return chunks + + def run(self, document: str): + annotations = [self.chunker.chunk(text=document, tokenizer=self.tokenizer, n_sentences=1)] + self.chunks = self._tokens_to_text(text=document, annotations=annotations[0]) + self.chunks = [self._add_context(chunk, document) for chunk in self.chunks] + + model_outputs = self.model.encode(self.chunks) + self.output_embs = [model_outputs[i, :] for i in range(len(self.chunks))] + return self.output_embs + + def query(self, query: str): + if "output_embs" not in dir(self): + raise ValueError("no embeddings calculated, use .run(document) to create chunk embeddings") + query_embedding = self.model.encode(query) + similarities = [] + for emb in self.output_embs: + similarities.append(cosine_similarity(query_embedding, emb)) + + return similarities + + + +if __name__ == "__main__": + + text = """ + The recent SEC filing provided insights into ACME Corp's performance for Q2 2023. + It highlighted a 3% revenue growth over the previous quarter. + The company, which had a revenue of $314 million in the prior quarter, showed steady progress. + They attributed this growth to strategic initiatives and operational efficiencies. + The report emphasized the company's resilience and ability to navigate market challenges, reflecting positively on their financial health and future prospects. 
+ """.strip().replace("\n", "") + + llm_model_name = "microsoft/Phi-3.5-mini-instruct" + embedding_model_name = "jinaai/jina-embeddings-v2-small-en" + + embedding_model, has_instructions = load_model(embedding_model_name) + embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, trust_remote_code=True) + + cr = ContextualRetrievalEmbedder(embedding_model, embedding_tokenizer, llm_model_name, chunking_strategy="sentences") + cr.run(text); + cr_cosine_similarities = cr.query("What is ACME Corp's revenue growth for Q2 2023?") + + lc = LateChunkingEmbedder(embedding_model, embedding_tokenizer) + lc.run(text) + lc_cosine_similarities = lc.query("What is ACME Corp's revenue growth for Q2 2023?") + + # import pandas as pd + for i, (cr_similarity, lc_similarity) in enumerate(zip(cr_cosine_similarities, lc_cosine_similarities)): + print(f"{text.split('.')[:-1][i].strip()}") + print(f"Similarities: Contextual Retrieval: {cr_similarity:.4f} | Late Chunking: {lc_similarity:.4f}") + print("") \ No newline at end of file diff --git a/LLMs/RAG/late-chunking/img/context-problem.png b/LLMs/RAG/late-chunking/img/context-problem.png new file mode 100644 index 0000000..c3139c6 Binary files /dev/null and b/LLMs/RAG/late-chunking/img/context-problem.png differ diff --git a/LLMs/RAG/late-chunking/img/method.png b/LLMs/RAG/late-chunking/img/method.png new file mode 100644 index 0000000..9749cf6 Binary files /dev/null and b/LLMs/RAG/late-chunking/img/method.png differ diff --git a/LLMs/RAG/late-chunking/img/rag.png b/LLMs/RAG/late-chunking/img/rag.png new file mode 100644 index 0000000..0876a7f Binary files /dev/null and b/LLMs/RAG/late-chunking/img/rag.png differ diff --git a/LLMs/RAG/late-chunking/pyproject.toml b/LLMs/RAG/late-chunking/pyproject.toml new file mode 100644 index 0000000..29676ec --- /dev/null +++ b/LLMs/RAG/late-chunking/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "late_chunking" +requires-python = "~=3.8" +dependencies = [ + "jupyterlab==4.2.5", + "transformers==4.43.4", + "torch==2.4.0", + "mteb==1.14.20", + "datasets==2.19.1", + "llama-index-embeddings-huggingface==0.3.1", + "llama-index==0.11.10", + "click==8.1.7", + "einops==0.6.1", +] +version = "0.0.0" + +[project.optional-dependencies] +dev = [ + "pytest~=7.3.2", + "black==23.3.0", + "isort==5.12.0", + "ruff==0.0.265", +] + +[tool.setuptools.packages.find] +include = ["chunked_pooling"] diff --git a/LLMs/RAG/late-chunking/run_chunked_eval.py b/LLMs/RAG/late-chunking/run_chunked_eval.py new file mode 100644 index 0000000..95de94a --- /dev/null +++ b/LLMs/RAG/late-chunking/run_chunked_eval.py @@ -0,0 +1,175 @@ +import click +import torch.cuda +from mteb import MTEB +from transformers import AutoModel, AutoTokenizer + +from chunked_pooling.chunked_eval_tasks import * +from chunked_pooling.wrappers import load_model + +DEFAULT_CHUNKING_STRATEGY = 'fixed' +DEFAULT_CHUNK_SIZE = 256 +DEFAULT_N_SENTENCES = 5 +BATCH_SIZE = 1 +DEFAULT_LONG_LATE_CHUNKING_OVERLAP_SIZE = 256 +DEFAULT_LONG_LATE_CHUNKING_EMBED_SIZE = 0 # set to 0 to disable long late chunking +DEFAULT_TRUNCATE_MAX_LENGTH = None + + +@click.command() +@click.option( + '--model-name', + default='jinaai/jina-embeddings-v2-small-en', + help='The name of the model to use.', +) +@click.option( + '--model-weights', + default=None, + help='The path to the model weights to use, e.g. 
in case of finetuning.', +) +@click.option( + '--strategy', + default=DEFAULT_CHUNKING_STRATEGY, + help='The chunking strategy to be applied.', +) +@click.option( + '--task-name', default='SciFactChunked', help='The evaluation task to perform.' +) +@click.option( + '--eval-split', default='test', help='The name of the evaluation split in the task.' +) +@click.option( + '--chunking-model', + default=None, + required=False, + help='The name of the model used for semantic chunking.', +) +@click.option( + '--truncate-max-length', + default=DEFAULT_TRUNCATE_MAX_LENGTH, + type=int, + help='Maximum number of tokens; by default, truncation to 8192 tokens. If None, Long Late Chunking algorithm should be enabled.', +) +@click.option( + '--chunk-size', + default=DEFAULT_CHUNK_SIZE, + type=int, + help='Number of tokens per chunk for fixed strategy.', +) +@click.option( + '--n-sentences', + default=DEFAULT_N_SENTENCES, + type=int, + help='Number of sentences per chunk for sentence strategy.', +) +@click.option( + '--long-late-chunking-embed-size', + default=DEFAULT_LONG_LATE_CHUNKING_EMBED_SIZE, + type=int, + help='Number of tokens per chunk for fixed strategy.', +) +@click.option( + '--long-late-chunking-overlap-size', + default=DEFAULT_LONG_LATE_CHUNKING_OVERLAP_SIZE, + type=int, + help='Token length of the embeddings that come before/after soft boundaries (i.e. overlapping embeddings). Above zero, overlap is used between neighbouring embeddings.', +) +def main( + model_name, + model_weights, + strategy, + task_name, + eval_split, + chunking_model, + truncate_max_length, + chunk_size, + n_sentences, + long_late_chunking_embed_size, + long_late_chunking_overlap_size, +): + try: + task_cls = globals()[task_name] + except: + raise ValueError(f'Unknown task name: {task_name}') + + if truncate_max_length is not None and (long_late_chunking_embed_size > 0): + truncate_max_length = None + print( + f'Truncation is disabled because Long Late Chunking algorithm is enabled.' 
+ ) + + model, has_instructions = load_model(model_name, model_weights) + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + chunking_args = { + 'chunk_size': chunk_size, + 'n_sentences': n_sentences, + 'chunking_strategy': strategy, + 'model_has_instructions': has_instructions, + 'embedding_model_name': chunking_model if chunking_model else model_name, + } + + if torch.cuda.is_available(): + model = model.cuda() + + model.eval() + + # Evaluate with late chunking + tasks = [ + task_cls( + chunked_pooling_enabled=True, + tokenizer=tokenizer, + prune_size=None, + truncate_max_length=truncate_max_length, + long_late_chunking_embed_size=long_late_chunking_embed_size, + long_late_chunking_overlap_size=long_late_chunking_overlap_size, + **chunking_args, + ) + ] + + evaluation = MTEB( + tasks=tasks, + chunked_pooling_enabled=True, + tokenizer=tokenizer, + prune_size=None, + **chunking_args, + ) + evaluation.run( + model, + output_folder='results-chunked-pooling', + eval_splits=[eval_split], + overwrite_results=True, + batch_size=BATCH_SIZE, + encode_kwargs={'batch_size': BATCH_SIZE}, + ) + + # Encode without late chunking + tasks = [ + task_cls( + chunked_pooling_enabled=False, + tokenizer=tokenizer, + prune_size=None, + truncate_max_length=truncate_max_length, + **chunking_args, + ) + ] + + evaluation = MTEB( + tasks=tasks, + chunked_pooling_enabled=False, + tokenizer=tokenizer, + prune_size=None, + **chunking_args, + ) + evaluation.run( + model, + output_folder='results-normal-pooling', + eval_splits=[eval_split], + overwrite_results=True, + batch_size=BATCH_SIZE, + encode_kwargs={'batch_size': BATCH_SIZE}, + ) + + +if __name__ == '__main__': + main() diff --git a/LLMs/RAG/late-chunking/tests/__init__.py b/LLMs/RAG/late-chunking/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/LLMs/RAG/late-chunking/tests/conftest.py b/LLMs/RAG/late-chunking/tests/conftest.py new file mode 100644 index 0000000..ce76ce7 --- /dev/null +++ b/LLMs/RAG/late-chunking/tests/conftest.py @@ -0,0 +1,47 @@ +import pytest +from mteb.abstasks.TaskMetadata import TaskMetadata + +from chunked_pooling.mteb_chunked_eval import AbsTaskChunkedRetrieval + + +class DummyTask(AbsTaskChunkedRetrieval): + metadata = TaskMetadata( + dataset={ + 'path': '~', + 'revision': '', + }, + name='dummy', + description='', + type='Retrieval', + category='s2p', + reference=None, + eval_splits=[], + eval_langs=[], + main_score='ndcg_at_10', + date=None, + form=None, + domains=None, + task_subtypes=None, + license=None, + socioeconomic_status=None, + annotations_creators=None, + dialect=None, + text_creation=None, + bibtex_citation=None, + n_samples=None, + avg_character_length=None, + ) + + def load_data(): + pass + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +@pytest.fixture() +def dummy_task_factory(): + def _create_dummy_task(*args, **kwargs): + return DummyTask(*args, **kwargs) + + return _create_dummy_task diff --git a/LLMs/RAG/late-chunking/tests/test_api.py b/LLMs/RAG/late-chunking/tests/test_api.py new file mode 100644 index 0000000..30962a1 --- /dev/null +++ b/LLMs/RAG/late-chunking/tests/test_api.py @@ -0,0 +1,100 @@ +import os +import numpy as np +from transformers import AutoModel, AutoTokenizer + +from chunked_pooling import chunked_pooling +from chunked_pooling.wrappers import load_model +from chunked_pooling.mteb_chunked_eval import AbsTaskChunkedRetrieval + +MODEL_NAME = 'jinaai/jina-embeddings-v3' + +# Define Text and Chunk +CHUNKS = ["Organic 
skincare", "for sensitive skin", "with aloe vera and chamomile"] +FULL_TEXT = ' '.join(CHUNKS) + + +def load_api_results(): + import requests + + url = 'https://api.jina.ai/v1/embeddings' + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {os.environ["JINA_API_TOKEN"]}', + } + data = { + "model": "jina-embeddings-v3", + "task": "retrieval.passage", + "dimensions": 1024, + "late_chunking": True, + "embedding_type": "float", + "input": CHUNKS, + } + response = requests.post(url, headers=headers, json=data) + data = response.json() + return [np.array(x['embedding']) for x in data['data']] + + +def calculate_annotations(model, boundary_cues, model_has_instructions, tokenizer): + if model_has_instructions: + instr = model.get_instructions()[1] + instr_tokens = tokenizer(instr, add_special_tokens=False) + n_instruction_tokens = len(instr_tokens[0]) + else: + n_instruction_tokens = 0 + chunk_annotations = [ + AbsTaskChunkedRetrieval._extend_special_tokens( + annotations, + n_instruction_tokens=n_instruction_tokens, + include_prefix=True, + include_sep=True, + ) + for annotations in boundary_cues + ] + return chunk_annotations + + +def test_compare_v3_api_embeddings(): + # Load Model + model, has_instr = load_model(MODEL_NAME, use_flash_attn=False) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + + # Determine Boundary Cues + tokenization = tokenizer( + FULL_TEXT, return_offsets_mapping=True, add_special_tokens=False + ) + boundary_cues = [] + chunk_i = 0 + last_cue = 0 + last_end = 0 + for i, (start, end) in enumerate(tokenization.offset_mapping): + if end >= (last_end + len(CHUNKS[chunk_i])): + boundary_cues.append((last_cue, i + 1)) + chunk_i += 1 + last_cue = i + 1 + last_end = end + extended_boundary_cues = calculate_annotations( + model, [boundary_cues], has_instr, tokenizer + ) + + # Append Instruction for Retrieval Task + instr = model.get_instructions()[1] + text_inputs = [instr + FULL_TEXT] + model_inputs = tokenizer( + text_inputs, + return_tensors='pt', + padding=True, + truncation=True, + max_length=8192, + ) + model_outputs = model(**model_inputs) + + # Apply Late Chunking + output_embs = chunked_pooling( + model_outputs, extended_boundary_cues, max_length=8192 + )[0] + api_embs = load_api_results() + for local_emb, api_emb in zip(output_embs, api_embs): + local_emb_norm = local_emb / np.linalg.norm(local_emb) + api_emb_norm = api_emb / np.linalg.norm(api_emb) + assert np.allclose(local_emb_norm, api_emb_norm, rtol=1e-02, atol=1e-02) + assert 1.0 - np.dot(local_emb_norm, api_emb_norm) < 1e-3 diff --git a/LLMs/RAG/late-chunking/tests/test_chunking_methods.py b/LLMs/RAG/late-chunking/tests/test_chunking_methods.py new file mode 100644 index 0000000..02c3e17 --- /dev/null +++ b/LLMs/RAG/late-chunking/tests/test_chunking_methods.py @@ -0,0 +1,182 @@ +import pytest +from transformers import AutoTokenizer + +from chunked_pooling.chunking import CHUNKING_STRATEGIES, Chunker +from chunked_pooling.mteb_chunked_eval import AbsTaskChunkedRetrieval + +EXAMPLE_TEXT_1 = "Berlin is the capital and largest city of Germany, both by area and by population. Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. The city is also one of the states of Germany, and is the third smallest state in the country in terms of area." 
+PUNCTATIONS = ('.', '!', '?') + + +@pytest.mark.parametrize("n_sentences", [1, 2, 3, 4]) +def test_chunk_by_sentences(n_sentences): + strategy = 'sentences' + model_name = 'jinaai/jina-embeddings-v2-small-en' + chunker = Chunker(chunking_strategy=strategy) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + boundary_cues = chunker.chunk( + text=EXAMPLE_TEXT_1, + tokenizer=tokenizer, + chunking_strategy=strategy, + n_sentences=n_sentences, + ) + extended_boundary_cues = AbsTaskChunkedRetrieval._extend_special_tokens( + boundary_cues + ) + model_inputs = tokenizer( + EXAMPLE_TEXT_1, + return_tensors='pt', + padding=True, + truncation=True, + max_length=8192, + ) + + # check that the cues start with 0 and end with the last token + assert extended_boundary_cues[0][0] == 0 + assert len(model_inputs.tokens()) == extended_boundary_cues[-1][1] + + # check that all chunks but the last one end with a punctuation + assert all( + model_inputs.tokens()[x:y][-1] in PUNCTATIONS + for (x, y) in extended_boundary_cues[:-1] + ) + + # check that the last chunk ends with a "[SEP]" token + last_cue = extended_boundary_cues[-1] + assert model_inputs.tokens()[last_cue[0] : last_cue[1]][-1] == "[SEP]" + + # check that the boundary cues are continuous (no token is missing) + assert all( + [ + extended_boundary_cues[i][1] == extended_boundary_cues[i + 1][0] + for i in range(len(extended_boundary_cues) - 1) + ] + ) + + +@pytest.mark.parametrize( + "boundary_cues", [[(0, 17), (17, 44), (44, 69)], [(0, 44), (44, 69)]] +) +def test_token_equivalence(boundary_cues): + model_name = 'jinaai/jina-embeddings-v2-small-en' + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokens = tokenizer.encode_plus( + EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True + ) + for start_token_idx, end_token_idx in boundary_cues: + decoded_text_chunk = tokenizer.decode( + tokens.input_ids[start_token_idx:end_token_idx] + ) + + original_text_chunk = EXAMPLE_TEXT_1[ + tokens.offset_mapping[start_token_idx][0] : tokens.offset_mapping[ + end_token_idx - 1 + ][1] + ] + chunk_tokens_original = tokenizer.encode_plus(original_text_chunk) + chunk_tokens_decoded = tokenizer.encode_plus(decoded_text_chunk) + assert chunk_tokens_original == chunk_tokens_decoded + + +def test_chunker_initialization(): + for strategy in CHUNKING_STRATEGIES: + chunker = Chunker(chunking_strategy=strategy) + assert chunker.chunking_strategy == strategy + + +def test_invalid_chunking_strategy(): + with pytest.raises(ValueError): + Chunker(chunking_strategy="invalid") + + +def test_chunk_by_tokens(): + chunker = Chunker(chunking_strategy="fixed") + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + chunks = chunker.chunk(EXAMPLE_TEXT_1, tokenizer=tokenizer, chunk_size=10) + assert len(chunks) > 1 + for start, end in chunks: + assert end - start <= 10 + + +@pytest.mark.parametrize( + 'model_name', + ['jinaai/jina-embeddings-v2-small-en', 'sentence-transformers/all-MiniLM-L6-v2'], +) +def test_chunk_semantically(model_name): + chunker = Chunker(chunking_strategy="semantic") + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokens = tokenizer.encode_plus( + EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True + ) + boundary_cues = chunker.chunk( + EXAMPLE_TEXT_1, + tokenizer=tokenizer, + chunking_strategy='semantic', + embedding_model_name=model_name, + ) + + # check if it returns boundary cues + assert len(boundary_cues) > 0 + + # test if bounaries are at the 
end of sentences + for start_token_idx, end_token_idx in boundary_cues: + assert ( + EXAMPLE_TEXT_1[tokens.offset_mapping[end_token_idx - 1][0]] in PUNCTATIONS + ) + decoded_text_chunk = tokenizer.decode( + tokens.input_ids[start_token_idx:end_token_idx] + ) + + # check that the boundary cues are continuous (no token is missing) + assert all( + [ + boundary_cues[i][1] == boundary_cues[i + 1][0] + for i in range(len(boundary_cues) - 1) + ] + ) + + +def test_empty_input(): + chunker = Chunker(chunking_strategy="fixed") + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + chunks = chunker.chunk("", tokenizer=tokenizer, chunk_size=10) + assert len(chunks) == 0 + + +def test_input_shorter_than_chunk_size(): + short_text = "Short text." + chunker = Chunker(chunking_strategy="fixed") + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + chunks = chunker.chunk(short_text, tokenizer=tokenizer, chunk_size=20) + assert len(chunks) == 1 + + +@pytest.mark.parametrize("chunk_size", [10, 20, 50]) +def test_various_chunk_sizes(chunk_size): + chunker = Chunker(chunking_strategy="fixed") + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + chunks = chunker.chunk(EXAMPLE_TEXT_1, tokenizer=tokenizer, chunk_size=chunk_size) + assert len(chunks) > 0 + for start, end in chunks: + assert end - start <= chunk_size + + +def test_chunk_method_with_different_strategies(): + chunker = Chunker(chunking_strategy="fixed") + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + fixed_chunks = chunker.chunk(EXAMPLE_TEXT_1, tokenizer=tokenizer, chunk_size=10) + semantic_chunks = chunker.chunk( + EXAMPLE_TEXT_1, + tokenizer=tokenizer, + chunking_strategy="semantic", + embedding_model_name='jinaai/jina-embeddings-v2-small-en', + ) + assert fixed_chunks != semantic_chunks + + +def test_chunk_by_sentences_different_n(): + chunker = Chunker(chunking_strategy="sentences") + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + chunks_1 = chunker.chunk(EXAMPLE_TEXT_1, tokenizer=tokenizer, n_sentences=1) + chunks_2 = chunker.chunk(EXAMPLE_TEXT_1, tokenizer=tokenizer, n_sentences=2) + assert len(chunks_1) > len(chunks_2) diff --git a/LLMs/RAG/late-chunking/tests/test_v3.py b/LLMs/RAG/late-chunking/tests/test_v3.py new file mode 100644 index 0000000..0038f2b --- /dev/null +++ b/LLMs/RAG/late-chunking/tests/test_v3.py @@ -0,0 +1,22 @@ +from transformers import AutoTokenizer + +from run_chunked_eval import DEFAULT_CHUNK_SIZE, load_model + +MODEL_NAME = 'jinaai/jina-embeddings-v3' + + +def test_instruction_handling(dummy_task_factory): + model, has_instructions = load_model(MODEL_NAME) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) + task = dummy_task_factory( + chunking_strategy='fixed', + chunk_size=DEFAULT_CHUNK_SIZE, + tokenizer=tokenizer, + model_has_instructions=has_instructions, + ) + n_instruction_tokens = len( + tokenizer(model.get_instructions()[1], add_special_tokens=False)['input_ids'] + ) + annotations_one_token = task._calculate_annotations(model, ['A'])[0] + assert len(annotations_one_token) == 1 + assert annotations_one_token[0] == (0, n_instruction_tokens + 3)
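+    # The expected span (0, n_instruction_tokens + 3) presumably covers the leading special
+    # token, the single token for the text 'A', and the trailing special token, in addition
+    # to the retrieval instruction prefix.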