
Commit 68f6639

Valentijn Scholten (valentijnscholten) authored
Deduplicate findings in batches (#13491)
* initial batching code
* fix dedupe_inside_engagement
* all tests working incl sarif with internal dupes
* cleanup
* deduplication: add more importer unit tests
* deduplication: add more importer unit tests
* deduplication: log hash_code_fields_always
* view_finding: show unique_id_from_tool with hash_code
* view_finding: show unique_id_from_tool with hash_code
* uncomment tests
* add more assessments
* fix duplicate finding links
* split per algo, move into new file
* align logging
* better method name and param order
* ruff apps.py
* update task/query counts
* update comments, parameters names
* finetune uidorhash logic
* fix tests to import from deduplication.py
* ruff unit tests
* simplify base queryset building
* deduplication logic: add cross scanner unique_id tests
* hook old per finding dedupe to batch dedupe code
* fix and make uid_or_hash_code matching identical to old dedupe
* UNIQUE_ID_OR_HASH_CODE: dont stop after one candidate
* UNIQUE_ID_OR_HASH_CODE: dont stop after one candidate in Batch mode
* uid_or_hash_code: fix self/older check
* notifications test: replace hardcoded ids with references
* optimize prefetching
* update query counts in test
* complete merge
* add more logging is_older, dedupe_eng_mismatch
* support FINDING_DEDUPE_METHOD
* add support for FINDING_DEDUPE_BATCH_METHOD
* simplify
* update log line
* make batch size a setting
* add false positive history to new batch post process task
* commands: add command to clear celery queue
* update dedupe command to use batch mode
* default to batch_mode for dedupe command
* do not deduplicate duplicates
* improve logging
* prefetch better in dedupe command
* dedupe command: max batch size 1000
* remove leftover method
* reimport: support pro hash method
* finalize return statement
* ruff

---------

Co-authored-by: Valentijn Scholten <[email protected]>
1 parent 19dc283 commit 68f6639

File tree

12 files changed: +948 -464 lines changed


dojo/finding/deduplication.py

Lines changed: 564 additions & 0 deletions
Large diffs are not rendered by default.
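
The new dojo/finding/deduplication.py itself is not rendered in this view, so the following is only an illustrative sketch of the approach described by the commit message and by the comment in helper.py below ("Batch dedupe with single queries per algorithm"): group a batch of findings per deduplication algorithm and resolve each group together instead of one finding at a time. The dedupe_group callable and the grouping key are assumptions, not the actual contents of the file.

from collections import defaultdict

def dedupe_batch_sketch(findings, dedupe_group):
    # Group the batch by the deduplication algorithm configured for each finding's
    # test (hash_code, unique_id_from_tool, unique_id_or_hash_code, legacy).
    per_algorithm = defaultdict(list)
    for finding in findings:
        per_algorithm[finding.test.deduplication_algorithm].append(finding)

    # One pass per algorithm: the caller-supplied dedupe_group can fetch duplicate
    # candidates for the whole group in a single query instead of per finding.
    for algorithm, batch in per_algorithm.items():
        dedupe_group(algorithm, batch)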

dojo/finding/helper.py

Lines changed: 58 additions & 1 deletion
@@ -17,6 +17,11 @@
 from dojo.decorators import dojo_async_task, dojo_model_from_id, dojo_model_to_id
 from dojo.endpoint.utils import save_endpoints_to_add
 from dojo.file_uploads.helper import delete_related_files
+from dojo.finding.deduplication import (
+    dedupe_batch_of_findings,
+    do_dedupe_finding,
+    get_finding_models_for_deduplication,
+)
 from dojo.models import (
     Endpoint,
     Endpoint_Status,
@@ -35,7 +40,6 @@
 from dojo.utils import (
     calculate_grade,
     close_external_issue,
-    do_dedupe_finding,
     do_false_positive_history,
     get_current_user,
     mass_model_updater,
@@ -457,6 +461,59 @@ def post_process_finding_save_internal(finding, dedupe_option=True, rules_option
             jira_helper.push_to_jira(finding.finding_group)
 
 
+@dojo_async_task(signature=True)
+@app.task
+def post_process_findings_batch_signature(finding_ids, *args, dedupe_option=True, rules_option=True, product_grading_option=True,
+                                          issue_updater_option=True, push_to_jira=False, user=None, **kwargs):
+    return post_process_findings_batch(finding_ids, dedupe_option, rules_option, product_grading_option,
+                                       issue_updater_option, push_to_jira, user, **kwargs)
+
+
+@dojo_async_task
+@app.task
+def post_process_findings_batch(finding_ids, *args, dedupe_option=True, rules_option=True, product_grading_option=True,
+                                issue_updater_option=True, push_to_jira=False, user=None, **kwargs):
+
+    if not finding_ids:
+        return
+
+    system_settings = System_Settings.objects.get()
+
+    # use list() to force a complete query execution and related objects to be loaded once
+    findings = get_finding_models_for_deduplication(finding_ids)
+
+    if not findings:
+        logger.debug(f"no findings found for batch deduplication with IDs: {finding_ids}")
+        return
+
+    # Batch dedupe with single queries per algorithm; fallback to per-finding for anything else
+    if dedupe_option and system_settings.enable_deduplication:
+        dedupe_batch_of_findings(findings)
+
+    if system_settings.false_positive_history:
+        # Only perform false positive history if deduplication is disabled
+        if system_settings.enable_deduplication:
+            deduplicationLogger.warning("skipping false positive history because deduplication is also enabled")
+        else:
+            for finding in findings:
+                do_false_positive_history(finding, *args, **kwargs)
+
+    # Non-status changing tasks
+    if issue_updater_option:
+        for finding in findings:
+            tool_issue_updater.async_tool_issue_update(finding)
+
+    if product_grading_option and system_settings.enable_product_grade:
+        calculate_grade(findings[0].test.engagement.product)
+
+    if push_to_jira:
+        for finding in findings:
+            if finding.has_jira_issue or not finding.finding_group:
+                jira_helper.push_to_jira(finding)
+            else:
+                jira_helper.push_to_jira(finding.finding_group)
+
+
 @receiver(pre_delete, sender=Finding)
 def finding_pre_delete(sender, instance, **kwargs):
     logger.debug("finding pre_delete: %d", instance.id)

dojo/importers/default_importer.py

Lines changed: 31 additions & 29 deletions
@@ -1,5 +1,6 @@
 import logging
 
+from django.conf import settings
 from django.core.files.uploadedfile import TemporaryUploadedFile
 from django.core.serializers import serialize
 from django.db.models.query_utils import Q
@@ -157,10 +158,9 @@ def process_findings(
         parsed_findings: list[Finding],
         **kwargs: dict,
     ) -> list[Finding]:
-        # Progressive batching for chord execution
-        post_processing_task_signatures = []
-        current_batch_number = 1
-        max_batch_size = 1024
+        # Batched post-processing (no chord): dispatch a task per 1000 findings or on final finding
+        batch_finding_ids: list[int] = []
+        batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
 
         """
         Saves findings in memory that were parsed from the scan report into the database.
@@ -237,32 +237,34 @@ def process_findings(
                 finding = self.process_vulnerability_ids(finding)
                 # Categorize this finding as a new one
                 new_findings.append(finding)
-                # all data is already saved on the finding, we only need to trigger post processing
-
-                # We create a signature for the post processing task so we can decide to apply it async or sync
+                # all data is already saved on the finding, we only need to trigger post processing in batches
                 push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by)
-                post_processing_task_signature = finding_helper.post_process_finding_save_signature(
-                    finding,
-                    dedupe_option=True,
-                    rules_option=True,
-                    product_grading_option=False,
-                    issue_updater_option=True,
-                    push_to_jira=push_to_jira,
-                )
-
-                post_processing_task_signatures.append(post_processing_task_signature)
-
-                # Check if we should launch a chord (batch full or end of findings)
-                if we_want_async(async_user=self.user) and post_processing_task_signatures:
-                    post_processing_task_signatures, current_batch_number, _ = self.maybe_launch_post_processing_chord(
-                        post_processing_task_signatures,
-                        current_batch_number,
-                        max_batch_size,
-                        is_final_finding,
-                    )
-                else:
-                    # Execute task immediately for synchronous processing
-                    post_processing_task_signature()
+                batch_finding_ids.append(finding.id)
+
+                # If batch is full or we're at the end, dispatch one batched task
+                if len(batch_finding_ids) >= batch_max_size or is_final_finding:
+                    finding_ids_batch = list(batch_finding_ids)
+                    batch_finding_ids.clear()
+                    if we_want_async(async_user=self.user):
+                        finding_helper.post_process_findings_batch_signature(
+                            finding_ids_batch,
+                            dedupe_option=True,
+                            rules_option=True,
+                            product_grading_option=True,
+                            issue_updater_option=True,
+                            push_to_jira=push_to_jira,
+                        )()
+                    else:
+                        finding_helper.post_process_findings_batch(
+                            finding_ids_batch,
+                            dedupe_option=True,
+                            rules_option=True,
+                            product_grading_option=True,
+                            issue_updater_option=True,
+                            push_to_jira=push_to_jira,
+                        )
+
+                # No chord: tasks are dispatched immediately above per batch
 
         for (group_name, findings) in group_names_to_findings_dict.items():
             finding_helper.add_findings_to_auto_group(
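
Because the importer resolves the batch size with getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000), the value can be tuned per deployment. A sketch of such an override in a local settings module (the file name and the value 500 are assumptions):

# local_settings.py (sketch): override the import/reimport dedupe batch size.
# The importer falls back to 1000 when this setting is absent; smaller batches
# mean more, but lighter, post-processing tasks.
IMPORT_REIMPORT_DEDUPE_BATCH_SIZE = 500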

dojo/importers/default_reimporter.py

Lines changed: 33 additions & 27 deletions
@@ -183,9 +183,7 @@ def process_findings(
         self.unchanged_items = []
         self.group_names_to_findings_dict = {}
         # Progressive batching for chord execution
-        post_processing_task_signatures = []
-        current_batch_number = 1
-        max_batch_size = 1024
+        # No chord: we dispatch per 1000 findings or on the final finding
 
         logger.debug(f"starting reimport of {len(parsed_findings) if parsed_findings else 0} items.")
         logger.debug("STEP 1: looping over findings from the reimported report and trying to match them to existing findings")
@@ -205,6 +203,9 @@
                 continue
             cleaned_findings.append(sanitized)
 
+        batch_finding_ids: list[int] = []
+        batch_max_size = 1000
+
         for idx, unsaved_finding in enumerate(cleaned_findings):
             is_final = idx == len(cleaned_findings) - 1
             # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report)
@@ -255,31 +256,34 @@
                     finding,
                     unsaved_finding,
                 )
-                # all data is already saved on the finding, we only need to trigger post processing
-
-                # Execute post-processing task immediately if async, otherwise execute synchronously
+                # all data is already saved on the finding, we only need to trigger post processing in batches
                 push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by)
-
-                post_processing_task_signature = finding_helper.post_process_finding_save_signature(
-                    finding,
-                    dedupe_option=True,
-                    rules_option=True,
-                    product_grading_option=False,
-                    issue_updater_option=True,
-                    push_to_jira=push_to_jira,
-                )
-                post_processing_task_signatures.append(post_processing_task_signature)
-
-                # Check if we should launch a chord (batch full or end of findings)
-                if we_want_async(async_user=self.user) and post_processing_task_signatures:
-                    post_processing_task_signatures, current_batch_number, _ = self.maybe_launch_post_processing_chord(
-                        post_processing_task_signatures,
-                        current_batch_number,
-                        max_batch_size,
-                        is_final,
-                    )
-                else:
-                    post_processing_task_signature()
+                batch_finding_ids.append(finding.id)
+
+                # If batch is full or we're at the end, dispatch one batched task
+                if len(batch_finding_ids) >= batch_max_size or is_final:
+                    finding_ids_batch = list(batch_finding_ids)
+                    batch_finding_ids.clear()
+                    if we_want_async(async_user=self.user):
+                        finding_helper.post_process_findings_batch_signature(
+                            finding_ids_batch,
+                            dedupe_option=True,
+                            rules_option=True,
+                            product_grading_option=True,
+                            issue_updater_option=True,
+                            push_to_jira=push_to_jira,
+                        )()
+                    else:
+                        finding_helper.post_process_findings_batch(
+                            finding_ids_batch,
+                            dedupe_option=True,
+                            rules_option=True,
+                            product_grading_option=True,
+                            issue_updater_option=True,
+                            push_to_jira=push_to_jira,
+                        )
+
+                # No chord: tasks are dispatched immediately above per batch
 
         self.to_mitigate = (set(self.original_items) - set(self.reactivated_items) - set(self.unchanged_items))
         # due to #3958 we can have duplicates inside the same report
@@ -779,4 +783,6 @@ def calculate_unsaved_finding_hash_code(
         self,
         unsaved_finding: Finding,
     ) -> str:
+        # this is overridden in Pro, but will still call this via super()
+        deduplicationLogger.debug("Calculating hash code for unsaved finding")
        return unsaved_finding.compute_hash_code()
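
The new comment on calculate_unsaved_finding_hash_code notes that the method is overridden in Pro but still reached via super(). A hypothetical sketch of that override pattern, assuming the base class in this file is DefaultReImporter (the subclass name and any extra behavior are illustrative):

class ProStyleReImporter(DefaultReImporter):
    def calculate_unsaved_finding_hash_code(self, unsaved_finding):
        # A subclass can add its own hashing behavior and still fall back to the
        # base implementation shown above via super().
        return super().calculate_unsaved_finding_hash_code(unsaved_finding)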
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+import logging
+
+from django.core.management.base import BaseCommand
+
+from dojo.celery import app
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    help = "Clear (purge) all tasks from Celery queues"
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--queue",
+            type=str,
+            help="Specific queue name to clear (default: all queues)",
+        )
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="Show what would be cleared without actually clearing",
+        )
+        parser.add_argument(
+            "--force",
+            action="store_true",
+            help="Skip confirmation prompt (use with caution)",
+        )
+
+    def handle(self, *args, **options):
+        queue_name = options["queue"]
+        dry_run = options["dry_run"]
+        force = options["force"]
+
+        # Get connection to broker
+        with app.connection() as conn:
+            # Get all queues or specific queue
+            if queue_name:
+                queues = [queue_name]
+                self.stdout.write(f"Targeting queue: {queue_name}")
+            else:
+                # Get all active queues from the broker
+                inspector = app.control.inspect()
+                active_queues = inspector.active_queues()
+                if active_queues:
+                    # Extract unique queue names from all workers
+                    queues = set()
+                    for worker_queues in active_queues.values():
+                        queues.update(queue_info["name"] for queue_info in worker_queues)
+                    queues = list(queues)
+                else:
+                    # Fallback: try common default queue
+                    queues = ["celery"]
+                self.stdout.write(f"Found {len(queues)} queue(s) to process")
+
+            if not queues:
+                self.stdout.write(self.style.WARNING("No queues found to clear"))
+                return
+
+            # Show what will be cleared
+            total_purged = 0
+            for queue in queues:
+                try:
+                    # Get queue length using channel
+                    with conn.channel() as channel:
+                        _, message_count, _ = channel.queue_declare(queue=queue, passive=True)
+                except Exception as e:
+                    logger.debug(f"Could not get message count for queue {queue}: {e}")
+                    message_count = "unknown"
+
+                if dry_run:
+                    self.stdout.write(
+                        self.style.WARNING(f"  Would purge {message_count} messages from queue: {queue}"),
+                    )
+                else:
+                    self.stdout.write(f"  Queue '{queue}': {message_count} messages")
+
+            if dry_run:
+                self.stdout.write(self.style.SUCCESS("\nDry run complete. Use without --dry-run to actually purge."))
+                return
+
+            # Confirmation prompt
+            if not force:
+                self.stdout.write(
+                    self.style.WARNING(
+                        f"\nThis will permanently delete all messages from {len(queues)} queue(s).",
+                    ),
+                )
+                confirm = input("Are you sure you want to continue? (yes/no): ")
+                if confirm.lower() not in {"yes", "y"}:
+                    self.stdout.write(self.style.ERROR("Operation cancelled."))
+                    return
+
+            # Purge queues using direct channel purge
+            self.stdout.write("\nPurging queues...")
+            for queue in queues:
+                try:
+                    with conn.channel() as channel:
+                        purged_count = channel.queue_purge(queue=queue)
+                        total_purged += purged_count
+                    self.stdout.write(
+                        self.style.SUCCESS(f"  ✓ Purged {purged_count} messages from queue: {queue}"),
+                    )
+                except Exception as e:
+                    self.stdout.write(
+                        self.style.ERROR(f"  ✗ Failed to purge queue '{queue}': {e}"),
+                    )
+                    logger.error(f"Error purging queue {queue}: {e}")
+
+        if total_purged > 0:
+            self.stdout.write(
+                self.style.SUCCESS(f"\nSuccessfully purged {total_purged} message(s) from {len(queues)} queue(s)."),
+            )
+        else:
+            self.stdout.write(self.style.WARNING("\nNo messages were purged (queues may have been empty)."))
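
A usage sketch for the command above via Django's call_command. The command name clear_celery_queue is assumed from the commit message ("commands: add command to clear celery queue"), since the file path is not rendered here; the queue, dry_run and force options map to the arguments defined in add_arguments.

from django.core.management import call_command

# Preview what would be purged from all discovered queues (assumed command name).
call_command("clear_celery_queue", dry_run=True)

# Purge a single queue without the confirmation prompt.
call_command("clear_celery_queue", queue="celery", force=True)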
