From f9bbb9ab33c0702ee25e9f1d10db072c55d972cd Mon Sep 17 00:00:00 2001
From: Aditya Mulik
Date: Fri, 15 May 2026 22:39:00 -0400
Subject: [PATCH] feat: Add training data transparency flags with HF Hub verification

---
Review notes (placed after the `---` separator, so not part of the commit message):
- Line structure was restored from a whitespace-collapsed copy of this patch.
  Indentation of unchanged context lines is reconstructed and may differ from
  the target file -- TODO confirm, or apply with `--ignore-whitespace` if a
  hunk is rejected.
- Amended in review: flattened the warning branch into if/elif/else, added
  "trainingDataStatus" to mapped_fields, made dataset normalization handle
  lists of dicts and unsupported shapes, logged failed Hub lookups instead of
  swallowing them silently, and extended the tests accordingly.
- The post-image blob hashes on the `index` lines predate these amendments.

 src/models/service.py |  73 ++++++++++++++++++++++-
 tests/test_service.py | 129 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/src/models/service.py b/src/models/service.py
index 02781ed..23c58c0 100644
--- a/src/models/service.py
+++ b/src/models/service.py
@@ -661,6 +661,29 @@ def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]
                 props.append({"name": "genai:aibom:modelcard:quantizationFileType", "value": str(q_dict["file_type"])})
             taxonomy_mapped_keys.append("quantization")
 
+        # Training Data Completeness Check
+        has_training_data = self._verify_datasets_available(metadata)
+        props.append({"name": "genai:aibom:trainingDataAvailable", "value": "true" if has_training_data else "false"})
+
+        # Add status note about dataset verification
+        if has_training_data:
+            props.append({
+                "name": "genai:aibom:trainingDataStatus",
+                "value": "Training datasets verified: Dataset(s) exist and are accessible on Hugging Face Hub."
+            })
+        elif metadata.get("datasets"):
+            # Dataset referenced but not found/verified
+            props.append({
+                "name": "genai:aibom:trainingDataWarning",
+                "value": "Training datasets were referenced but could not be verified on Hugging Face Hub. Dataset may not exist, be disabled, or be inaccessible."
+            })
+        else:
+            # No dataset info at all
+            props.append({
+                "name": "genai:aibom:trainingDataWarning",
+                "value": "Training data information is missing or not documented. This limits transparency and auditability of the model."
+            })
+
         # Basic Fields we've already mapped to structured homes
         mapped_fields = [
             "primaryPurpose", "typeOfModel", "suppliedBy", "intendedUse",
@@ -668,7 +691,8 @@ def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]
             "pipeline_tag", "name", "author", "license", "description", "commit",
             "bomFormat", "specVersion", "version", "licenses", "external_references",
             "tags", "library_name", "paper", "downloadLocation",
-            "gguf_filename", "gguf_license", "model_type", "architectures"
+            "gguf_filename", "gguf_license", "model_type", "architectures",
+            "trainingDataAvailable", "trainingDataStatus", "trainingDataWarning"
         ] + taxonomy_mapped_keys
 
         for k, v in metadata.items():
@@ -719,3 +743,50 @@ def _infer_io_formats(self, task: str) -> tuple:
             return (["csv", "json"], ["string", "number"])
 
         return ([], [])
+
+    def _verify_datasets_available(self, metadata: Dict[str, Any]) -> bool:
+        """Return True if at least one referenced training dataset is verified on HF Hub.
+
+        ``metadata["datasets"]`` may be a dataset id string, a dict with a "name" key,
+        or a list/tuple/set mixing both forms. Blank ids and the "unknown" placeholder
+        (any case, surrounding whitespace ignored) are skipped; any other shape is
+        treated as "no datasets".
+        """
+        datasets = metadata.get("datasets")
+        if not datasets:
+            return False
+
+        # Normalize to a collection of entries
+        if isinstance(datasets, (str, dict)):
+            datasets = [datasets]
+        elif not isinstance(datasets, (list, tuple, set)):
+            return False
+
+        # Dict entries carry the dataset id under "name"
+        names = [d.get("name") if isinstance(d, dict) else d for d in datasets]
+
+        # Filter out empty/placeholder values
+        valid = [
+            n.strip() for n in names
+            if isinstance(n, str) and n.strip() and n.strip().lower() != "unknown"
+        ]
+
+        # any() short-circuits, so Hub lookups stop at the first verified dataset
+        return any(self._verify_dataset_exists_on_hf(d) for d in valid)
+
+    def _verify_dataset_exists_on_hf(self, dataset_id: str) -> bool:
+        """Check if dataset exists and is accessible on HF Hub.
+
+        Best-effort: any exception raised by the lookup is logged and reported as
+        "not verified" instead of propagating to the caller.
+        """
+        try:
+            info = self.hf_api.dataset_info(repo_id=dataset_id)
+        except Exception as exc:
+            # Local import: this patch does not touch the module's import block
+            import logging
+            logging.getLogger(__name__).warning(
+                "Could not verify dataset %r on Hugging Face Hub: %s", dataset_id, exc
+            )
+            return False
+        return info is not None and not getattr(info, 'disabled', False)
diff --git a/tests/test_service.py b/tests/test_service.py
index fcb8cf5..ec51915 100644
--- a/tests/test_service.py
+++ b/tests/test_service.py
@@ -119,5 +119,134 @@ def test_generate_purl_no_namespace(self):
         purl = self.service._generate_purl("model", "1.0")
         self.assertEqual(purl, "pkg:huggingface/model@1.0")
 
+    @patch("src.models.service.calculate_completeness_score")
+    @patch("src.models.service.EnhancedExtractor")
+    def test_training_data_flag_with_datasets(self, mock_extractor_cls, mock_score):
+        """Test that trainingDataAvailable flag is set to true when datasets are present"""
+        # Setup
+        mock_extractor = mock_extractor_cls.return_value
+        metadata_with_data = {
+            "name": "test-model",
+            "datasets": ["dataset1", "dataset2"],
+            "commit": "123456"
+        }
+        mock_extractor.extract_metadata.return_value = metadata_with_data
+        mock_extractor.extraction_results = {}
+        mock_score.return_value = {"total_score": 50}
+
+        self.service.hf_api.model_info.return_value = MagicMock(sha="123456")
+
+        # Mock dataset verification
+        with patch.object(self.service, '_verify_dataset_exists_on_hf', return_value=True):
+            # Action
+            aibom = self.service.generate_aibom("owner/model")
+
+        # Verify
+        model_card = aibom["components"][0].get("modelCard", {})
+        properties = model_card.get("properties", [])
+
+        # Find the trainingDataAvailable property
+        training_flag = next((p for p in properties if p["name"] == "genai:aibom:trainingDataAvailable"), None)
+        self.assertIsNotNone(training_flag)
+        self.assertEqual(training_flag["value"], "true")
+
+        # Verify no warning
+        warning = next((p for p in properties if p["name"] == "genai:aibom:trainingDataWarning"), None)
+        self.assertIsNone(warning)
+
+    @patch("src.models.service.calculate_completeness_score")
+    @patch("src.models.service.EnhancedExtractor")
+    def test_training_data_flag_without_datasets(self, mock_extractor_cls, mock_score):
+        """Test that trainingDataAvailable flag is set to false and warning is added when datasets are missing"""
+        # Setup
+        mock_extractor = mock_extractor_cls.return_value
+        metadata_no_data = {
+            "name": "test-model",
+            "commit": "123456"
+            # No datasets key
+        }
+        mock_extractor.extract_metadata.return_value = metadata_no_data
+        mock_extractor.extraction_results = {}
+        mock_score.return_value = {"total_score": 50}
+
+        self.service.hf_api.model_info.return_value = MagicMock(sha="123456")
+
+        # Action
+        aibom = self.service.generate_aibom("owner/model")
+
+        # Verify
+        model_card = aibom["components"][0].get("modelCard", {})
+        properties = model_card.get("properties", [])
+
+        # Find the trainingDataAvailable property
+        training_flag = next((p for p in properties if p["name"] == "genai:aibom:trainingDataAvailable"), None)
+        self.assertIsNotNone(training_flag)
+        self.assertEqual(training_flag["value"], "false")
+
+        # Verify warning is present
+        warning = next((p for p in properties if p["name"] == "genai:aibom:trainingDataWarning"), None)
+        self.assertIsNotNone(warning)
+        self.assertIn("Training data information is missing", warning["value"])
+
+    def test_verify_datasets_available_with_valid_datasets(self):
+        """Test dataset verification with valid datasets"""
+        # Mock the HF API call
+        with patch.object(self.service, '_verify_dataset_exists_on_hf', return_value=True):
+            # List of valid datasets
+            metadata = {"datasets": ["dataset1", "dataset2"]}
+            self.assertTrue(self.service._verify_datasets_available(metadata))
+
+            # Single string dataset
+            metadata = {"datasets": "valid_dataset"}
+            self.assertTrue(self.service._verify_datasets_available(metadata))
+
+            # Dict format with name
+            metadata = {"datasets": {"name": "my_dataset", "url": "https://example.com"}}
+            self.assertTrue(self.service._verify_datasets_available(metadata))
+
+            # List of dicts with name
+            metadata = {"datasets": [{"name": "my_dataset"}]}
+            self.assertTrue(self.service._verify_datasets_available(metadata))
+
+    def test_verify_datasets_available_with_empty_datasets(self):
+        """Test dataset verification with empty or invalid datasets"""
+        # Empty list
+        metadata = {"datasets": []}
+        self.assertFalse(self.service._verify_datasets_available(metadata))
+
+        # List with empty strings
+        metadata = {"datasets": ["", " ", ""]}
+        self.assertFalse(self.service._verify_datasets_available(metadata))
+
+        # Unknown placeholder
+        metadata = {"datasets": ["unknown"]}
+        self.assertFalse(self.service._verify_datasets_available(metadata))
+
+        # Placeholder with different case / surrounding whitespace
+        metadata = {"datasets": [" Unknown "]}
+        self.assertFalse(self.service._verify_datasets_available(metadata))
+
+        # Unsupported (non-iterable) value
+        metadata = {"datasets": 42}
+        self.assertFalse(self.service._verify_datasets_available(metadata))
+
+        # No datasets key
+        metadata = {"name": "test-model"}
+        self.assertFalse(self.service._verify_datasets_available(metadata))
+
+    def test_verify_dataset_exists_on_hf(self):
+        """Test Hub lookup results for accessible, disabled and failing datasets"""
+        # Accessible dataset
+        with patch.object(self.service.hf_api, "dataset_info", return_value=MagicMock(disabled=False)):
+            self.assertTrue(self.service._verify_dataset_exists_on_hf("owner/dataset"))
+
+        # Disabled dataset
+        with patch.object(self.service.hf_api, "dataset_info", return_value=MagicMock(disabled=True)):
+            self.assertFalse(self.service._verify_dataset_exists_on_hf("owner/dataset"))
+
+        # Lookup failure is reported as "not verified"
+        with patch.object(self.service.hf_api, "dataset_info", side_effect=Exception("not found")):
+            self.assertFalse(self.service._verify_dataset_exists_on_hf("owner/dataset"))
+
 if __name__ == '__main__':
     unittest.main()