|
2 | 2 | import pathlib
|
3 | 3 | import shutil
|
4 | 4 | from pathlib import Path
|
| 5 | +from unittest.mock import MagicMock, patch |
5 | 6 |
|
6 | 7 | import numpy as np
|
7 | 8 | import pandas as pd
|
|
57 | 58 | )
|
58 | 59 |
|
59 | 60 |
|
@pytest.fixture
def mock_dependencies():
    """Patch the collaborators used by the metrics calculators and yield the mocks.

    All patches stay active until the requesting test has finished. The yielded
    dict maps a ``"mock_<name>"`` key to the mock that replaces ``<name>``.
    """
    with patch(
        "unstructured.metrics.evaluate.calculate_accuracy"
    ) as accuracy, patch(
        "unstructured.metrics.evaluate.calculate_percent_missing_text"
    ) as percent_missing_text, patch.object(
        TextExtractionMetricsCalculator, "_get_ccts"
    ) as get_ccts, patch(
        "unstructured.metrics.evaluate.get_element_type_frequency"
    ) as element_type_frequency, patch(
        "unstructured.metrics.evaluate.calculate_element_type_percent_match"
    ) as element_type_percent_match, patch(
        "unstructured.metrics.evaluate._read_text_file"
    ) as read_text_file, patch.object(
        Path, "exists"
    ) as path_exists, patch(
        "unstructured.metrics.evaluate.TableEvalProcessor.from_json_files"
    ) as table_eval_processor_from_json_files, patch.object(
        TableStructureMetricsCalculator, "supported_metric_names"
    ) as supported_metric_names:
        # Canned results: fixed values for the scalar metrics ...
        accuracy.return_value = 0.5
        percent_missing_text.return_value = 0.5
        element_type_percent_match.return_value = 0.5
        get_ccts.return_value = ["output_cct", "source_cct"]
        supported_metric_names.return_value = ["table_level_acc"]
        path_exists.return_value = True

        # ... and two-item sequences for helpers that are called once for the
        # output document and once for the source document.
        element_type_frequency.side_effect = [{"ele1": 1}, {"ele2": 3}]
        read_text_file.side_effect = ["output_text", "source_text"]

        # The table-eval processor factory is left unconfigured on purpose;
        # tests that need it set up its return value themselves.
        yield {
            "mock_calculate_accuracy": accuracy,
            "mock_calculate_percent_missing_text": percent_missing_text,
            "mock_get_ccts": get_ccts,
            "mock_get_element_type_frequency": element_type_frequency,
            "mock_read_text_file": read_text_file,
            "mock_calculate_element_type_percent_match": element_type_percent_match,
            "mock_table_eval_processor_from_json_files": table_eval_processor_from_json_files,
            "mock_supported_metric_names": supported_metric_names,
            "mock_path_exists": path_exists,
        }
| 105 | + |
| 106 | + |
60 | 107 | @pytest.fixture()
|
61 | 108 | def _cleanup_after_test():
|
62 | 109 | """Fixture for removing side-effects of running tests in this file."""
|
@@ -139,6 +186,114 @@ def test_process_document_returns_the_correct_amount_of_values(
|
139 | 186 | assert len(output_list) == expected_length
|
140 | 187 |
|
141 | 188 |
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("2310.03502text_to_image_synthesis1-7.pdf.txt"),
            {"document_type": "txt"},
        ),
    ],
)
def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The second item of the processed row is ".pdf" and each text helper runs once."""
    testing_root = Path(TESTING_FILE_DIR)
    calculator = calculator_class(
        documents_dir=testing_root / output_dirname,
        ground_truths_dir=testing_root / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    assert row[1] == ".pdf"
    # Each mocked text-extraction helper must have been used exactly once.
    for mock_name in (
        "mock_calculate_accuracy",
        "mock_calculate_percent_missing_text",
        "mock_get_ccts",
    ):
        assert mock_dependencies[mock_name].call_count == 1
| 218 | + |
| 219 | + |
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("tablib-627mTABLES-2310.07875-p7.pdf.json"),
            {},
        ),
    ],
)
def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The second item of the processed row is ".pdf" and the processor is built once.

    The table-eval processor factory is mocked, so the report returned by
    ``process_file()`` is a ``MagicMock`` carrying canned metric values.
    """
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    # Pin the calculator's private paths explicitly so the test does not depend
    # on whatever the constructor derived from the directories.
    calculator._ground_truths_dir = source_dir
    calculator._documents_dir = output_dir
    calculator._ground_truth_paths = [source_dir / path]

    mock_report = MagicMock()
    mock_report.total_predicted_tables = 3
    # The attribute name has to match the metric name advertised by the mocked
    # `supported_metric_names` ("table_level_acc"). A misspelled attribute would
    # go unnoticed because MagicMock auto-creates any attribute that is read.
    # NOTE(review): presumably the calculator reads metrics off the report by
    # name - confirm against `_process_document`.
    mock_report.table_level_acc = 0.83
    mock_table_eval_processor_from_json_files = mock_dependencies[
        "mock_table_eval_processor_from_json_files"
    ]
    mock_table_eval_processor_from_json_files.return_value.process_file.return_value = mock_report

    output_list = calculator._process_document(path)

    assert output_list[1] == ".pdf"
    assert mock_table_eval_processor_from_json_files.call_count == 1
| 262 | + |
| 263 | + |
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form.1987.pdf.json"),
            {},
        ),
    ],
)
def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The second item of the processed row is ".pdf" and the helpers run as expected."""
    testing_root = Path(TESTING_FILE_DIR)
    calculator = calculator_class(
        documents_dir=testing_root / output_dirname,
        ground_truths_dir=testing_root / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    assert row[1] == ".pdf"
    # Reading and frequency counting happen twice; the match is computed once.
    expected_call_counts = (
        ("mock_read_text_file", 2),
        ("mock_get_element_type_frequency", 2),
        ("mock_calculate_element_type_percent_match", 1),
    )
    for mock_name, expected_calls in expected_call_counts:
        assert mock_dependencies[mock_name].call_count == expected_calls
| 295 | + |
| 296 | + |
142 | 297 | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
143 | 298 | @pytest.mark.usefixtures("_cleanup_after_test")
|
144 | 299 | def test_text_extraction_evaluation_type_txt():
|
|
0 commit comments