Merge branch 'main' into copyright

IntelLabs · Jan 9, 2025 · dc834d6
2 parents ec8b290 + 78ea246
commit dc834d6
Show file tree

Hide file tree

Showing 24 changed files with 83 additions and 12 deletions.
diff --git a/file_utils.py b/file_utils.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import pickle
 

diff --git a/plots/plot_gold-search-recall.py b/plots/plot_gold-search-recall.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np

diff --git a/plots/plot_ndoc-recall.py b/plots/plot_ndoc-recall.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np

diff --git a/plots/plot_noise_percentile.py b/plots/plot_noise_percentile.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import numpy as np
 import matplotlib.pyplot as plt
 import plot_utils

diff --git a/plots/plot_utils.py b/plots/plot_utils.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import numpy as np
 import os

diff --git a/preprocessing/alce/convert_alce_colbert.py b/preprocessing/alce/convert_alce_colbert.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 from file_utils import load_json
 import convert_alce_utils
 

diff --git a/preprocessing/alce/convert_alce_dense.py b/preprocessing/alce/convert_alce_dense.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 from file_utils import load_json, save_json, save_jsonl
 import convert_alce_utils
 

diff --git a/preprocessing/alce/convert_alce_utils.py b/preprocessing/alce/convert_alce_utils.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 from tqdm import tqdm
 import pandas as pd
 

diff --git a/preprocessing/convert_nq_dense.py b/preprocessing/convert_nq_dense.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import datasets
 import json

diff --git a/preprocessing/create_groundtruth_calibration.py b/preprocessing/create_groundtruth_calibration.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 """
 After exhaustive search with a flat search index has been run, you will have files that contain the nearest neighbors
 for every single query.

diff --git a/preprocessing/sample_retrieved_neighbors.py b/preprocessing/sample_retrieved_neighbors.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import argparse
 from tqdm import tqdm

diff --git a/preprocessing/set_gold_recall.py b/preprocessing/set_gold_recall.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import argparse
 from tqdm import tqdm

diff --git a/reader/compute_ci.py b/reader/compute_ci.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import numpy as np
 import pathlib

diff --git a/reader/eval.py b/reader/eval.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 # Some of this code is based on prior work under the MIT License:
 #   Copyright (c) 2023 Princeton Natural Language Processing
 #   Copyright (c) Carnegie Mellon University 

diff --git a/reader/eval_per_query.py b/reader/eval_per_query.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import collections
 from collections import Counter

diff --git a/reader/plot_per_k.py b/reader/plot_per_k.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import os
 import logging

diff --git a/reader/run.py b/reader/run.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 # Some of this code is based on prior work under the MIT License:
 #   Copyright (c) 2023 Princeton Natural Language Processing
 

diff --git a/reader/utils.py b/reader/utils.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 # Some of this code is based on prior work under the MIT License:
 #   Copyright (c) 2023 Princeton Natural Language Processing
 

diff --git a/retriever/eval.py b/retriever/eval.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 # Some of this code is based on prior work under the MIT License:
 #   Copyright (c) 2023 Princeton Natural Language Processing
 

diff --git a/retriever/index.py b/retriever/index.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import numpy as np
 import os

diff --git a/retriever/ret_utils.py b/retriever/ret_utils.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import numpy as np
 import torch
 import os

diff --git a/retriever/run.py b/retriever/run.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 # Some of this code is based on prior work under the MIT License:
 #   Copyright (c) 2023 Princeton Natural Language Processing
 
@@ -62,14 +65,14 @@ def dense_random_retrieval(
 
     import gc
 
-    if not load_search_results:
+    if load_search_results is None:
         index_path = os.path.join(INDEX_PATH, 'dense', embed_file.split(".fvecs")[0])
         vec_file = os.path.join(VEC_PATH, embed_file)
         # Optimal configuration is to set the number of threads to the batch size
         index_kwargs.update({'num_threads': num_threads})
 
         logger.info('Start indexing...')
-        search_index_og = index.dense_build_index(
+        search_index = index.dense_build_index(
             index_path,
             vec_file,
             index_fn,
@@ -80,11 +83,11 @@ def dense_random_retrieval(
         )
         logger.info('Done indexing')
 
-        if embed_model_type == 'st':
-            import sentence_transformers as st
-            embed_model = st.SentenceTransformer(embed_model_name)
-        else:
-            raise NotImplementedError('Need to implement alternate type of embedding model')
+    if embed_model_type == 'st':
+        import sentence_transformers as st
+        embed_model = st.SentenceTransformer(embed_model_name)
+    else:
+        raise NotImplementedError('Need to implement alternate type of embedding model')
 
     logger.info('Embedding and batching queries...')
 
@@ -99,20 +102,19 @@ def dense_random_retrieval(
         logger.info(f"Batch size: {len(queries)}")
         query_data = query_data_batches[batch_id]
 
-        if load_search_results:
-            batch_load_results = load_search_results.replace("*", str(batch_id))
-            k_neighbors, dist_neighbors = load_pickle(batch_load_results, logger)
-        else: 
+        if load_search_results is None: 
             query_embs = embed_model.encode(queries)
 
-            search_index = search_index_og
             logger.info(f"Start searching for {k} neighbors per query...")
             k_neighbors, dist_neighbors = search_index.search(query_embs, k)
             logger.info('Done searching')
 
             # Save direct search outputs before writing to JSON
             save_pickle([k_neighbors, dist_neighbors], f'{doc_dataset}_tmp_batch-{batch_id}.pkl', logger)
             gc.collect()
+        elif load_search_results:
+            batch_load_results = load_search_results.replace("*", str(batch_id))
+            k_neighbors, dist_neighbors = load_pickle(batch_load_results, logger)
 
         logger.info('Loading text corpus and document titles to associate with neighbors')
         corpus = datasets.load_dataset("json", data_files=corpus_file)

diff --git a/retriever/run_colbert.py b/retriever/run_colbert.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import sys
 import logging

diff --git a/utils.py b/utils.py
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 class InvalidArgument(Exception):
     """raise when user input arguments are invalid"""
     pass