From 74f15923d08ca72669fbb0dcf8f6478458091348 Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Tue, 18 Nov 2025 10:50:53 +0100 Subject: [PATCH 01/15] Antithesis poc --- dd-trace-core/build.gradle | 3 + .../common/writer/PayloadDispatcherImpl.java | 55 ++++++++++++++++ .../trace/common/writer/RemoteWriter.java | 38 ++++++++++++ .../common/writer/ddagent/DDAgentApi.java | 62 +++++++++++++++++++ telemetry/build.gradle.kts | 4 ++ .../datadog/telemetry/TelemetryClient.java | 58 +++++++++++++++++ .../datadog/telemetry/TelemetryRouter.java | 39 ++++++++++++ 7 files changed, 259 insertions(+) diff --git a/dd-trace-core/build.gradle b/dd-trace-core/build.gradle index 7b111ed4e38..110a0f380e4 100644 --- a/dd-trace-core/build.gradle +++ b/dd-trace-core/build.gradle @@ -80,6 +80,9 @@ dependencies { implementation group: 'com.google.re2j', name: 're2j', version: '1.7' + // Antithesis SDK for assertions and property testing + implementation group: 'com.antithesis', name: 'antithesis-sdk-java', version: '0.1.5' + compileOnly group: 'com.github.spotbugs', name: 'spotbugs-annotations', version: '4.2.0' // We have autoservices defined in test subtree, looks like we need this to be able to properly rebuild this diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java index a0011216770..296cb14dc51 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java @@ -1,5 +1,8 @@ package datadog.trace.common.writer; +import com.antithesis.sdk.Assert; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.communication.monitor.Monitoring; import datadog.communication.monitor.Recording; import datadog.communication.serialization.ByteBufferConsumer; @@ -57,6 +60,16 @@ public Collection getApis() { @Override public void onDroppedTrace(int spanCount) { + // Antithesis: Assert that traces should not be dropped before sending + ObjectNode dropDetails = JsonNodeFactory.instance.objectNode(); + dropDetails.put("span_count", spanCount); + dropDetails.put("total_dropped_traces", droppedTraceCount.sum() + 1); + dropDetails.put("total_dropped_spans", droppedSpanCount.sum() + spanCount); + + Assert.unreachable( + "Traces should not be dropped before attempting to send - indicates buffer overflow or backpressure", + dropDetails); + droppedSpanCount.add(spanCount); droppedTraceCount.increment(); } @@ -103,18 +116,60 @@ public void accept(int messageCount, ByteBuffer buffer) { // the packer calls this when the buffer is full, // or when the packer is flushed at a heartbeat if (messageCount > 0) { + // Antithesis: Verify that we're attempting to send traces + Assert.reachable("Trace sending code path is exercised", null); + Assert.sometimes( + messageCount > 0, + "Traces are being sent to the API", + null); + batchTimer.reset(); Payload payload = newPayload(messageCount, buffer); final int sizeInBytes = payload.sizeInBytes(); healthMetrics.onSerialize(sizeInBytes); RemoteApi.Response response = api.sendSerializedTraces(payload); mapper.reset(); + + // Antithesis: Assert that trace sending should always succeed + ObjectNode sendDetails = JsonNodeFactory.instance.objectNode(); + sendDetails.put("trace_count", messageCount); + sendDetails.put("payload_size_bytes", sizeInBytes); + sendDetails.put("success", 
response.success()); + if (response.exception() != null) { + sendDetails.put("exception", response.exception().getClass().getName()); + sendDetails.put("exception_message", response.exception().getMessage()); + } + if (response.status() != null) { + sendDetails.put("http_status", response.status()); + } + + Assert.always( + response.success(), + "Trace sending to API should always succeed - no traces should be lost", + sendDetails); + if (response.success()) { if (log.isDebugEnabled()) { log.debug("Successfully sent {} traces to the API", messageCount); } healthMetrics.onSend(messageCount, sizeInBytes, response); } else { + // Antithesis: This code path should be unreachable if traces are never lost + ObjectNode failureDetails = JsonNodeFactory.instance.objectNode(); + failureDetails.put("trace_count", messageCount); + failureDetails.put("payload_size_bytes", sizeInBytes); + if (response.exception() != null) { + failureDetails.put("exception", response.exception().getClass().getName()); + failureDetails.put("exception_message", response.exception().getMessage()); + } + if (response.status() != null) { + failureDetails.put("http_status", response.status()); + } + + Assert.unreachable( + "Trace sending failure path should never be reached - indicates traces are being lost", + failureDetails); + if (log.isDebugEnabled()) { log.debug( "Failed to send {} traces of size {} bytes to the API", messageCount, sizeInBytes); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java index 90008cad0a0..6528ccfe953 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java @@ -3,6 +3,9 @@ import static datadog.trace.api.sampling.PrioritySampling.UNSET; import static java.util.concurrent.TimeUnit.MINUTES; +import com.antithesis.sdk.Assert; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.trace.core.DDSpan; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.relocate.api.RatelimitedLogger; @@ -67,9 +70,32 @@ protected RemoteWriter( @Override public void write(final List trace) { + // Antithesis: Assert that we should never attempt to write when writer is closed + ObjectNode writeAttemptDetails = JsonNodeFactory.instance.objectNode(); + writeAttemptDetails.put("writer_closed", closed); + writeAttemptDetails.put("trace_size", trace.size()); + writeAttemptDetails.put("has_traces", !trace.isEmpty()); + + Assert.always( + !closed, + "Writer should never be closed when attempting to write traces", + writeAttemptDetails); + if (closed) { // We can't add events after shutdown otherwise it will never complete shutting down. 
log.debug("Dropped due to shutdown: {}", trace); + + // Antithesis: Track when traces are dropped due to writer being closed + ObjectNode shutdownDetails = JsonNodeFactory.instance.objectNode(); + shutdownDetails.put("trace_size", trace.size()); + shutdownDetails.put("span_count", trace.stream().mapToInt(List::size).sum()); + shutdownDetails.put("reason", "writer_closed_during_shutdown"); + + Assert.sometimes( + closed && !trace.isEmpty(), + "Traces are dropped due to writer shutdown - tracking shutdown behavior", + shutdownDetails); + handleDroppedTrace(trace); } else { if (trace.isEmpty()) { @@ -91,6 +117,18 @@ public void write(final List trace) { handleDroppedTrace(trace); break; case DROPPED_BUFFER_OVERFLOW: + // Antithesis: Buffer overflow should NEVER happen - this indicates a serious problem + ObjectNode overflowDetails = JsonNodeFactory.instance.objectNode(); + overflowDetails.put("trace_size", trace.size()); + overflowDetails.put("span_count", trace.stream().mapToInt(List::size).sum()); + overflowDetails.put("sampling_priority", samplingPriority); + overflowDetails.put("buffer_capacity", traceProcessingWorker.getCapacity()); + overflowDetails.put("reason", "buffer_overflow_backpressure"); + + Assert.unreachable( + "Buffer overflow should never occur - traces are being dropped due to backpressure", + overflowDetails); + if (log.isDebugEnabled()) { log.debug("Dropped due to a buffer overflow: {}", trace); } else { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java index 645bbc4b9e9..e28f32a99bc 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java @@ -2,6 +2,9 @@ import static datadog.communication.http.OkHttpUtils.prepareRequest; +import com.antithesis.sdk.Assert; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.squareup.moshi.JsonAdapter; import com.squareup.moshi.Moshi; import com.squareup.moshi.Types; @@ -89,11 +92,30 @@ public void addResponseListener(final RemoteResponseListener listener) { public Response sendSerializedTraces(final Payload payload) { final int sizeInBytes = payload.sizeInBytes(); + + // Antithesis: Track that agent API send is being exercised + Assert.reachable("DDAgentApi trace sending is exercised", null); + Assert.sometimes( + payload.traceCount() > 0, + "Traces are being sent through DDAgentApi", + null); + String tracesEndpoint = featuresDiscovery.getTraceEndpoint(); if (null == tracesEndpoint) { featuresDiscovery.discoverIfOutdated(); tracesEndpoint = featuresDiscovery.getTraceEndpoint(); if (null == tracesEndpoint) { + // Antithesis: Agent should always be detectable + ObjectNode agentDetectionDetails = JsonNodeFactory.instance.objectNode(); + agentDetectionDetails.put("trace_count", payload.traceCount()); + agentDetectionDetails.put("payload_size_bytes", sizeInBytes); + agentDetectionDetails.put("agent_url", agentUrl.toString()); + agentDetectionDetails.put("failure_reason", "agent_not_detected"); + + Assert.unreachable( + "Datadog agent should always be detected - agent communication failure", + agentDetectionDetails); + log.error("No datadog agent detected"); countAndLogFailedSend(payload.traceCount(), sizeInBytes, null, null); return Response.failed(404); @@ -122,7 +144,34 @@ public Response sendSerializedTraces(final Payload 
payload) { try (final Recording recording = sendPayloadTimer.start(); final okhttp3.Response response = httpClient.newCall(request).execute()) { handleAgentChange(response.header(DATADOG_AGENT_STATE)); + + // Antithesis: Track HTTP response status and assert success + ObjectNode httpResponseDetails = JsonNodeFactory.instance.objectNode(); + httpResponseDetails.put("trace_count", payload.traceCount()); + httpResponseDetails.put("payload_size_bytes", sizeInBytes); + httpResponseDetails.put("http_status", response.code()); + httpResponseDetails.put("http_message", response.message()); + httpResponseDetails.put("success", response.code() == 200); + httpResponseDetails.put("agent_url", tracesUrl.toString()); + + Assert.always( + response.code() == 200, + "HTTP response from Datadog agent should always be 200 - API communication failure", + httpResponseDetails); + if (response.code() != 200) { + // Antithesis: Mark non-200 path as unreachable + ObjectNode errorDetails = JsonNodeFactory.instance.objectNode(); + errorDetails.put("trace_count", payload.traceCount()); + errorDetails.put("payload_size_bytes", sizeInBytes); + errorDetails.put("http_status", response.code()); + errorDetails.put("http_message", response.message()); + errorDetails.put("failure_reason", "http_error_response"); + + Assert.unreachable( + "Non-200 HTTP response from agent indicates API failure - traces may be lost", + errorDetails); + agentErrorCounter.incrementErrorCount(response.message(), payload.traceCount()); countAndLogFailedSend(payload.traceCount(), sizeInBytes, response, null); return Response.failed(response.code()); @@ -146,6 +195,19 @@ public Response sendSerializedTraces(final Payload payload) { } } } catch (final IOException e) { + // Antithesis: Network failures should not occur + ObjectNode networkErrorDetails = JsonNodeFactory.instance.objectNode(); + networkErrorDetails.put("trace_count", payload.traceCount()); + networkErrorDetails.put("payload_size_bytes", sizeInBytes); + networkErrorDetails.put("exception_type", e.getClass().getName()); + networkErrorDetails.put("exception_message", e.getMessage()); + networkErrorDetails.put("agent_url", agentUrl.toString()); + networkErrorDetails.put("failure_reason", "network_io_exception"); + + Assert.unreachable( + "Network/IO exceptions should not occur when sending to agent - indicates connectivity issues", + networkErrorDetails); + countAndLogFailedSend(payload.traceCount(), sizeInBytes, null, e); return Response.failed(e); } diff --git a/telemetry/build.gradle.kts b/telemetry/build.gradle.kts index 1b66facc063..76b8f303f9b 100644 --- a/telemetry/build.gradle.kts +++ b/telemetry/build.gradle.kts @@ -34,6 +34,10 @@ dependencies { implementation(libs.slf4j) implementation(project(":internal-api")) + + // Antithesis SDK for assertions and property testing + implementation(group = "com.antithesis", name = "antithesis-sdk-java", version = "0.1.5") + implementation(group = "com.fasterxml.jackson.core", name = "jackson-databind", version = "2.15.2") compileOnly(project(":dd-java-agent:agent-tooling")) testImplementation(project(":dd-java-agent:agent-tooling")) diff --git a/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java b/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java index c13411e0e69..e4bd38f911f 100644 --- a/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java +++ b/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java @@ -1,5 +1,8 @@ package datadog.telemetry; +import com.antithesis.sdk.Assert; +import 
com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.communication.http.HttpRetryPolicy; import datadog.communication.http.OkHttpUtils; import datadog.trace.api.Config; @@ -94,14 +97,50 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { Request httpRequest = httpRequestBuilder.build(); String requestType = httpRequest.header(DD_TELEMETRY_REQUEST_TYPE); + + // Antithesis: Track telemetry sending attempts + Assert.reachable("Telemetry sending is exercised", null); try (okhttp3.Response response = OkHttpUtils.sendWithRetries(okHttpClient, httpRetryPolicy, httpRequest)) { + + // Antithesis: Assert that all telemetry requests should succeed + ObjectNode telemetryResponseDetails = JsonNodeFactory.instance.objectNode(); + telemetryResponseDetails.put("request_type", requestType != null ? requestType : "unknown"); + telemetryResponseDetails.put("http_status", response.code()); + telemetryResponseDetails.put("http_message", response.message()); + telemetryResponseDetails.put("url", url.toString()); + telemetryResponseDetails.put("success", response.isSuccessful()); + if (response.code() == 404) { + // Antithesis: Track 404 - endpoint disabled scenario + ObjectNode notFoundDetails = JsonNodeFactory.instance.objectNode(); + notFoundDetails.put("request_type", requestType != null ? requestType : "unknown"); + notFoundDetails.put("url", url.toString()); + notFoundDetails.put("reason", "endpoint_disabled_404"); + + Assert.sometimes( + response.code() == 404, + "Telemetry endpoint returns 404 - endpoint may be disabled", + notFoundDetails); + log.debug("Telemetry endpoint is disabled, dropping {} message.", requestType); return Result.NOT_FOUND; } + if (!response.isSuccessful()) { + // Antithesis: Telemetry should not fail - data should be retried/buffered + ObjectNode failureDetails = JsonNodeFactory.instance.objectNode(); + failureDetails.put("request_type", requestType != null ? requestType : "unknown"); + failureDetails.put("http_status", response.code()); + failureDetails.put("http_message", response.message()); + failureDetails.put("url", url.toString()); + failureDetails.put("reason", "http_error_response"); + + Assert.unreachable( + "Telemetry HTTP request failed - telemetry data should not be dropped, should retry", + failureDetails); + log.debug( "Telemetry message {} failed with: {} {}.", requestType, @@ -109,11 +148,30 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { response.message()); return Result.FAILURE; } + + // Antithesis: Assert success + Assert.always( + response.isSuccessful(), + "Telemetry requests should always succeed - no telemetry data should be lost", + telemetryResponseDetails); + } catch (InterruptedIOException e) { log.debug("Telemetry message {} sending interrupted: {}.", requestType, e.toString()); return Result.INTERRUPTED; } catch (IOException e) { + // Antithesis: Network failures should not cause telemetry loss + ObjectNode ioErrorDetails = JsonNodeFactory.instance.objectNode(); + ioErrorDetails.put("request_type", requestType != null ? 
requestType : "unknown"); + ioErrorDetails.put("exception_type", e.getClass().getName()); + ioErrorDetails.put("exception_message", e.getMessage()); + ioErrorDetails.put("url", url.toString()); + ioErrorDetails.put("reason", "network_io_exception"); + + Assert.unreachable( + "Telemetry network/IO failure - telemetry data should not be dropped, should retry", + ioErrorDetails); + log.debug("Telemetry message {} failed with exception: {}.", requestType, e.toString()); return Result.FAILURE; } diff --git a/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java b/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java index 1636f865def..dfd4e35569c 100644 --- a/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java +++ b/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java @@ -1,5 +1,8 @@ package datadog.telemetry; +import com.antithesis.sdk.Assert; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import javax.annotation.Nullable; import okhttp3.HttpUrl; @@ -41,8 +44,33 @@ public TelemetryClient.Result sendRequest(TelemetryRequest request) { // interrupted request is most likely due to telemetry system shutdown, // we do not want to log errors and reattempt in this case && result != TelemetryClient.Result.INTERRUPTED; + + // Antithesis: Track telemetry routing and failover behavior + ObjectNode routingDetails = JsonNodeFactory.instance.objectNode(); + routingDetails.put("result", result.toString()); + routingDetails.put("current_client", currentClient == agentClient ? "agent" : "intake"); + routingDetails.put("request_failed", requestFailed); + routingDetails.put("has_fallback", intakeClient != null); + routingDetails.put("url", currentClient.getUrl().toString()); + + Assert.always( + result == TelemetryClient.Result.SUCCESS || result == TelemetryClient.Result.INTERRUPTED, + "Telemetry routing should always succeed - failures indicate data loss without retry mechanism", + routingDetails); + if (currentClient == agentClient) { if (requestFailed) { + // Antithesis: Track agent telemetry failures + ObjectNode agentFailureDetails = JsonNodeFactory.instance.objectNode(); + agentFailureDetails.put("result", result.toString()); + agentFailureDetails.put("url", currentClient.getUrl().toString()); + agentFailureDetails.put("has_intake_fallback", intakeClient != null); + agentFailureDetails.put("reason", "agent_telemetry_failure"); + + Assert.unreachable( + "Agent telemetry endpoint failed - switching to intake but current request data is lost", + agentFailureDetails); + reportErrorOnce(currentClient.getUrl(), result); if (intakeClient != null) { log.info("Agent Telemetry endpoint failed. 
Telemetry will be sent to Intake."); @@ -52,6 +80,17 @@ public TelemetryClient.Result sendRequest(TelemetryRequest request) { } } else { if (requestFailed) { + // Antithesis: Track intake telemetry failures + ObjectNode intakeFailureDetails = JsonNodeFactory.instance.objectNode(); + intakeFailureDetails.put("result", result.toString()); + intakeFailureDetails.put("url", currentClient.getUrl().toString()); + intakeFailureDetails.put("will_fallback_to_agent", true); + intakeFailureDetails.put("reason", "intake_telemetry_failure"); + + Assert.unreachable( + "Intake telemetry endpoint failed - switching to agent but current request data is lost", + intakeFailureDetails); + reportErrorOnce(currentClient.getUrl(), result); } if ((agentSupportsTelemetryProxy && !useIntakeClientByDefault) || requestFailed) { From d7a5e5948cacbe66c34c492dadb41aed1041b88a Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Tue, 18 Nov 2025 10:58:45 +0100 Subject: [PATCH 02/15] Fix compilation? --- dd-trace-core/build.gradle | 6 ++++-- telemetry/build.gradle.kts | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dd-trace-core/build.gradle b/dd-trace-core/build.gradle index 110a0f380e4..215946beb6c 100644 --- a/dd-trace-core/build.gradle +++ b/dd-trace-core/build.gradle @@ -80,8 +80,10 @@ dependencies { implementation group: 'com.google.re2j', name: 're2j', version: '1.7' - // Antithesis SDK for assertions and property testing - implementation group: 'com.antithesis', name: 'antithesis-sdk-java', version: '0.1.5' + // Antithesis SDK for assertions and property testing (optional - only used when available) + compileOnly group: 'com.antithesis', name: 'antithesis-sdk-java', version: '0.1.5' + // Jackson for Antithesis assertion details (if not already available transitively) + compileOnly group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2' compileOnly group: 'com.github.spotbugs', name: 'spotbugs-annotations', version: '4.2.0' diff --git a/telemetry/build.gradle.kts b/telemetry/build.gradle.kts index 76b8f303f9b..34f945757d2 100644 --- a/telemetry/build.gradle.kts +++ b/telemetry/build.gradle.kts @@ -35,9 +35,9 @@ dependencies { implementation(project(":internal-api")) - // Antithesis SDK for assertions and property testing - implementation(group = "com.antithesis", name = "antithesis-sdk-java", version = "0.1.5") - implementation(group = "com.fasterxml.jackson.core", name = "jackson-databind", version = "2.15.2") + // Antithesis SDK for assertions and property testing (optional - only used when available) + compileOnly(group = "com.antithesis", name = "antithesis-sdk-java", version = "0.1.5") + compileOnly(group = "com.fasterxml.jackson.core", name = "jackson-databind", version = "2.15.2") compileOnly(project(":dd-java-agent:agent-tooling")) testImplementation(project(":dd-java-agent:agent-tooling")) From 17a3dd7a420093ab154dabc56838ad7938b48e87 Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Tue, 18 Nov 2025 11:08:24 +0100 Subject: [PATCH 03/15] fix compilation? 
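The coordinates used so far (com.antithesis:antithesis-sdk-java) do not resolve: the SDK is
published on Maven Central as com.antithesis:sdk. Switch to those coordinates and pick up the
current 1.4.5 release.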
--- dd-trace-core/build.gradle | 2 +- telemetry/build.gradle.kts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dd-trace-core/build.gradle b/dd-trace-core/build.gradle index 215946beb6c..2d17878939b 100644 --- a/dd-trace-core/build.gradle +++ b/dd-trace-core/build.gradle @@ -81,7 +81,7 @@ dependencies { implementation group: 'com.google.re2j', name: 're2j', version: '1.7' // Antithesis SDK for assertions and property testing (optional - only used when available) - compileOnly group: 'com.antithesis', name: 'antithesis-sdk-java', version: '0.1.5' + compileOnly group: 'com.antithesis', name: 'sdk', version: '1.4.5' // Jackson for Antithesis assertion details (if not already available transitively) compileOnly group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2' diff --git a/telemetry/build.gradle.kts b/telemetry/build.gradle.kts index 34f945757d2..13ac00810dc 100644 --- a/telemetry/build.gradle.kts +++ b/telemetry/build.gradle.kts @@ -36,7 +36,7 @@ dependencies { implementation(project(":internal-api")) // Antithesis SDK for assertions and property testing (optional - only used when available) - compileOnly(group = "com.antithesis", name = "antithesis-sdk-java", version = "0.1.5") + compileOnly(group = "com.antithesis", name = "sdk", version = "1.4.5") compileOnly(group = "com.fasterxml.jackson.core", name = "jackson-databind", version = "2.15.2") compileOnly(project(":dd-java-agent:agent-tooling")) From d8dcccb3214b8701c2a9703a1e75a29cdddbc2c8 Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Tue, 18 Nov 2025 12:10:57 +0100 Subject: [PATCH 04/15] fix compilation 3 --- .../common/writer/PayloadDispatcherImpl.java | 24 ++++++++----------- .../trace/common/writer/RemoteWriter.java | 2 -- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java index 296cb14dc51..40832deb243 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java @@ -135,13 +135,11 @@ public void accept(int messageCount, ByteBuffer buffer) { sendDetails.put("trace_count", messageCount); sendDetails.put("payload_size_bytes", sizeInBytes); sendDetails.put("success", response.success()); - if (response.exception() != null) { - sendDetails.put("exception", response.exception().getClass().getName()); - sendDetails.put("exception_message", response.exception().getMessage()); - } - if (response.status() != null) { - sendDetails.put("http_status", response.status()); - } + response.exception().ifPresent(ex -> { + sendDetails.put("exception", ex.getClass().getName()); + sendDetails.put("exception_message", ex.getMessage()); + }); + response.status().ifPresent(status -> sendDetails.put("http_status", status)); Assert.always( response.success(), @@ -158,13 +156,11 @@ public void accept(int messageCount, ByteBuffer buffer) { ObjectNode failureDetails = JsonNodeFactory.instance.objectNode(); failureDetails.put("trace_count", messageCount); failureDetails.put("payload_size_bytes", sizeInBytes); - if (response.exception() != null) { - failureDetails.put("exception", response.exception().getClass().getName()); - failureDetails.put("exception_message", response.exception().getMessage()); - } - if (response.status() != null) { - failureDetails.put("http_status", response.status()); - } + 
response.exception().ifPresent(ex -> { + failureDetails.put("exception", ex.getClass().getName()); + failureDetails.put("exception_message", ex.getMessage()); + }); + response.status().ifPresent(status -> failureDetails.put("http_status", status)); Assert.unreachable( "Trace sending failure path should never be reached - indicates traces are being lost", diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java index 6528ccfe953..b4bd70c2a2e 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java @@ -88,7 +88,6 @@ public void write(final List trace) { // Antithesis: Track when traces are dropped due to writer being closed ObjectNode shutdownDetails = JsonNodeFactory.instance.objectNode(); shutdownDetails.put("trace_size", trace.size()); - shutdownDetails.put("span_count", trace.stream().mapToInt(List::size).sum()); shutdownDetails.put("reason", "writer_closed_during_shutdown"); Assert.sometimes( @@ -120,7 +119,6 @@ public void write(final List trace) { // Antithesis: Buffer overflow should NEVER happen - this indicates a serious problem ObjectNode overflowDetails = JsonNodeFactory.instance.objectNode(); overflowDetails.put("trace_size", trace.size()); - overflowDetails.put("span_count", trace.stream().mapToInt(List::size).sum()); overflowDetails.put("sampling_priority", samplingPriority); overflowDetails.put("buffer_capacity", traceProcessingWorker.getCapacity()); overflowDetails.put("reason", "buffer_overflow_backpressure"); From 14f256834cb1f6b70ab3e68c61ed029b814d4c96 Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Tue, 18 Nov 2025 14:36:33 +0100 Subject: [PATCH 05/15] build with libraries --- dd-trace-core/build.gradle | 6 ++---- telemetry/build.gradle.kts | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/dd-trace-core/build.gradle b/dd-trace-core/build.gradle index 2d17878939b..e8435598169 100644 --- a/dd-trace-core/build.gradle +++ b/dd-trace-core/build.gradle @@ -80,10 +80,8 @@ dependencies { implementation group: 'com.google.re2j', name: 're2j', version: '1.7' - // Antithesis SDK for assertions and property testing (optional - only used when available) - compileOnly group: 'com.antithesis', name: 'sdk', version: '1.4.5' - // Jackson for Antithesis assertion details (if not already available transitively) - compileOnly group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2' + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation group: 'com.antithesis', name: 'sdk', version: '1.4.5' compileOnly group: 'com.github.spotbugs', name: 'spotbugs-annotations', version: '4.2.0' diff --git a/telemetry/build.gradle.kts b/telemetry/build.gradle.kts index 13ac00810dc..77c95d3a48d 100644 --- a/telemetry/build.gradle.kts +++ b/telemetry/build.gradle.kts @@ -35,9 +35,8 @@ dependencies { implementation(project(":internal-api")) - // Antithesis SDK for assertions and property testing (optional - only used when available) - compileOnly(group = "com.antithesis", name = "sdk", version = "1.4.5") - compileOnly(group = "com.fasterxml.jackson.core", name = "jackson-databind", version = "2.15.2") + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation(group = "com.antithesis", name = "sdk", version = "1.4.5") compileOnly(project(":dd-java-agent:agent-tooling")) 
testImplementation(project(":dd-java-agent:agent-tooling")) From e750f61cfc3f655bb152191a167d39b733821e5a Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Thu, 20 Nov 2025 10:31:03 +0100 Subject: [PATCH 06/15] Java Antithesis log before assertion --- .../datadog/trace/common/writer/PayloadDispatcherImpl.java | 5 +++++ .../main/java/datadog/trace/common/writer/RemoteWriter.java | 3 +++ .../datadog/trace/common/writer/ddagent/DDAgentApi.java | 6 ++++++ .../src/main/java/datadog/telemetry/TelemetryClient.java | 5 +++++ .../src/main/java/datadog/telemetry/TelemetryRouter.java | 3 +++ 5 files changed, 22 insertions(+) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java index 40832deb243..a1edf9c236b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java @@ -66,6 +66,7 @@ public void onDroppedTrace(int spanCount) { dropDetails.put("total_dropped_traces", droppedTraceCount.sum() + 1); dropDetails.put("total_dropped_spans", droppedSpanCount.sum() + spanCount); + log.debug("ANTITHESIS_ASSERT: Traces dropped before sending (unreachable) - span_count: {}, total_dropped: {}", spanCount, droppedTraceCount.sum() + 1); Assert.unreachable( "Traces should not be dropped before attempting to send - indicates buffer overflow or backpressure", dropDetails); @@ -117,7 +118,9 @@ public void accept(int messageCount, ByteBuffer buffer) { // or when the packer is flushed at a heartbeat if (messageCount > 0) { // Antithesis: Verify that we're attempting to send traces + log.debug("ANTITHESIS_ASSERT: Trace sending code path exercised (reachable) - message_count: {}", messageCount); Assert.reachable("Trace sending code path is exercised", null); + log.debug("ANTITHESIS_ASSERT: Checking if traces are being sent to API (sometimes) - message_count: {}", messageCount); Assert.sometimes( messageCount > 0, "Traces are being sent to the API", @@ -141,6 +144,7 @@ public void accept(int messageCount, ByteBuffer buffer) { }); response.status().ifPresent(status -> sendDetails.put("http_status", status)); + log.debug("ANTITHESIS_ASSERT: Checking trace sending success (always) - success: {}, trace_count: {}", response.success(), messageCount); Assert.always( response.success(), "Trace sending to API should always succeed - no traces should be lost", @@ -162,6 +166,7 @@ public void accept(int messageCount, ByteBuffer buffer) { }); response.status().ifPresent(status -> failureDetails.put("http_status", status)); + log.debug("ANTITHESIS_ASSERT: Trace sending failed (unreachable) - trace_count: {}, size: {} bytes", messageCount, sizeInBytes); Assert.unreachable( "Trace sending failure path should never be reached - indicates traces are being lost", failureDetails); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java index b4bd70c2a2e..fb7584fe48d 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java @@ -76,6 +76,7 @@ public void write(final List trace) { writeAttemptDetails.put("trace_size", trace.size()); writeAttemptDetails.put("has_traces", !trace.isEmpty()); + log.debug("ANTITHESIS_ASSERT: Checking writer not closed when writing (always) - closed: {}, 
trace_size: {}", closed, trace.size()); Assert.always( !closed, "Writer should never be closed when attempting to write traces", @@ -90,6 +91,7 @@ public void write(final List trace) { shutdownDetails.put("trace_size", trace.size()); shutdownDetails.put("reason", "writer_closed_during_shutdown"); + log.debug("ANTITHESIS_ASSERT: Traces dropped due to shutdown (sometimes) - closed: {}, trace_size: {}", closed, trace.size()); Assert.sometimes( closed && !trace.isEmpty(), "Traces are dropped due to writer shutdown - tracking shutdown behavior", @@ -123,6 +125,7 @@ public void write(final List trace) { overflowDetails.put("buffer_capacity", traceProcessingWorker.getCapacity()); overflowDetails.put("reason", "buffer_overflow_backpressure"); + log.debug("ANTITHESIS_ASSERT: Buffer overflow occurred (unreachable) - trace_size: {}, capacity: {}", trace.size(), traceProcessingWorker.getCapacity()); Assert.unreachable( "Buffer overflow should never occur - traces are being dropped due to backpressure", overflowDetails); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java index e28f32a99bc..3cf099b59d3 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java @@ -94,7 +94,9 @@ public Response sendSerializedTraces(final Payload payload) { final int sizeInBytes = payload.sizeInBytes(); // Antithesis: Track that agent API send is being exercised + log.debug("ANTITHESIS_ASSERT: Verifying DDAgentApi trace sending is exercised (reachable) with {} traces", payload.traceCount()); Assert.reachable("DDAgentApi trace sending is exercised", null); + log.debug("ANTITHESIS_ASSERT: Checking if traces are being sent through DDAgentApi (sometimes) - count: {}", payload.traceCount()); Assert.sometimes( payload.traceCount() > 0, "Traces are being sent through DDAgentApi", @@ -112,6 +114,7 @@ public Response sendSerializedTraces(final Payload payload) { agentDetectionDetails.put("agent_url", agentUrl.toString()); agentDetectionDetails.put("failure_reason", "agent_not_detected"); + log.debug("ANTITHESIS_ASSERT: Agent not detected (unreachable) - url: {}, traces: {}", agentUrl, payload.traceCount()); Assert.unreachable( "Datadog agent should always be detected - agent communication failure", agentDetectionDetails); @@ -154,6 +157,7 @@ public Response sendSerializedTraces(final Payload payload) { httpResponseDetails.put("success", response.code() == 200); httpResponseDetails.put("agent_url", tracesUrl.toString()); + log.debug("ANTITHESIS_ASSERT: Checking HTTP response status (always) - code: {}, traces: {}", response.code(), payload.traceCount()); Assert.always( response.code() == 200, "HTTP response from Datadog agent should always be 200 - API communication failure", @@ -168,6 +172,7 @@ public Response sendSerializedTraces(final Payload payload) { errorDetails.put("http_message", response.message()); errorDetails.put("failure_reason", "http_error_response"); + log.debug("ANTITHESIS_ASSERT: Non-200 HTTP response (unreachable) - code: {}, message: {}, traces: {}", response.code(), response.message(), payload.traceCount()); Assert.unreachable( "Non-200 HTTP response from agent indicates API failure - traces may be lost", errorDetails); @@ -204,6 +209,7 @@ public Response sendSerializedTraces(final Payload payload) { networkErrorDetails.put("agent_url", agentUrl.toString()); 
networkErrorDetails.put("failure_reason", "network_io_exception"); + log.debug("ANTITHESIS_ASSERT: Network/IO exception (unreachable) - type: {}, message: {}, traces: {}", e.getClass().getName(), e.getMessage(), payload.traceCount()); Assert.unreachable( "Network/IO exceptions should not occur when sending to agent - indicates connectivity issues", networkErrorDetails); diff --git a/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java b/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java index e4bd38f911f..34dd8f44d79 100644 --- a/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java +++ b/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java @@ -99,6 +99,7 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { String requestType = httpRequest.header(DD_TELEMETRY_REQUEST_TYPE); // Antithesis: Track telemetry sending attempts + log.debug("ANTITHESIS_ASSERT: Telemetry sending exercised (reachable) - request_type: {}", requestType); Assert.reachable("Telemetry sending is exercised", null); try (okhttp3.Response response = @@ -119,6 +120,7 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { notFoundDetails.put("url", url.toString()); notFoundDetails.put("reason", "endpoint_disabled_404"); + log.debug("ANTITHESIS_ASSERT: Telemetry endpoint 404 (sometimes) - request_type: {}, url: {}", requestType, url); Assert.sometimes( response.code() == 404, "Telemetry endpoint returns 404 - endpoint may be disabled", @@ -137,6 +139,7 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { failureDetails.put("url", url.toString()); failureDetails.put("reason", "http_error_response"); + log.debug("ANTITHESIS_ASSERT: Telemetry HTTP request failed (unreachable) - request_type: {}, status: {}", requestType, response.code()); Assert.unreachable( "Telemetry HTTP request failed - telemetry data should not be dropped, should retry", failureDetails); @@ -150,6 +153,7 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { } // Antithesis: Assert success + log.debug("ANTITHESIS_ASSERT: Checking telemetry request success (always) - successful: {}, request_type: {}", response.isSuccessful(), requestType); Assert.always( response.isSuccessful(), "Telemetry requests should always succeed - no telemetry data should be lost", @@ -168,6 +172,7 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { ioErrorDetails.put("url", url.toString()); ioErrorDetails.put("reason", "network_io_exception"); + log.debug("ANTITHESIS_ASSERT: Telemetry network/IO exception (unreachable) - request_type: {}, exception: {}", requestType, e.getClass().getName()); Assert.unreachable( "Telemetry network/IO failure - telemetry data should not be dropped, should retry", ioErrorDetails); diff --git a/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java b/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java index dfd4e35569c..0d4d7f86f52 100644 --- a/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java +++ b/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java @@ -53,6 +53,7 @@ public TelemetryClient.Result sendRequest(TelemetryRequest request) { routingDetails.put("has_fallback", intakeClient != null); routingDetails.put("url", currentClient.getUrl().toString()); + log.debug("ANTITHESIS_ASSERT: Checking telemetry routing success (always) - result: {}, client: {}", result, currentClient == agentClient ? 
"agent" : "intake"); Assert.always( result == TelemetryClient.Result.SUCCESS || result == TelemetryClient.Result.INTERRUPTED, "Telemetry routing should always succeed - failures indicate data loss without retry mechanism", @@ -67,6 +68,7 @@ public TelemetryClient.Result sendRequest(TelemetryRequest request) { agentFailureDetails.put("has_intake_fallback", intakeClient != null); agentFailureDetails.put("reason", "agent_telemetry_failure"); + log.debug("ANTITHESIS_ASSERT: Agent telemetry endpoint failed (unreachable) - result: {}, has_fallback: {}", result, intakeClient != null); Assert.unreachable( "Agent telemetry endpoint failed - switching to intake but current request data is lost", agentFailureDetails); @@ -87,6 +89,7 @@ public TelemetryClient.Result sendRequest(TelemetryRequest request) { intakeFailureDetails.put("will_fallback_to_agent", true); intakeFailureDetails.put("reason", "intake_telemetry_failure"); + log.debug("ANTITHESIS_ASSERT: Intake telemetry endpoint failed (unreachable) - result: {}, will_fallback: true", result); Assert.unreachable( "Intake telemetry endpoint failed - switching to agent but current request data is lost", intakeFailureDetails); From 83ce6693403102100bb3af692ffe56b5d18ecaaa Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Wed, 26 Nov 2025 17:32:42 +0100 Subject: [PATCH 07/15] Remove old assertions and include new ones --- .../debugger/agent/ConfigurationUpdater.java | 9 +++ .../controller/openjdk/OpenJdkController.java | 3 + .../profiling/controller/ProfilingSystem.java | 12 +++- .../profiling/ddprof/DatadogProfiler.java | 14 +++- .../profiling/uploader/ProfileUploader.java | 5 ++ .../profiling/agent/ProfilingAgent.java | 7 ++ .../common/writer/PayloadDispatcherImpl.java | 55 --------------- .../trace/common/writer/RemoteWriter.java | 39 ----------- .../common/writer/ddagent/DDAgentApi.java | 68 ------------------- .../DefaultConfigurationPoller.java | 11 ++- .../datadog/telemetry/TelemetryClient.java | 60 ---------------- .../datadog/telemetry/TelemetryRouter.java | 41 ----------- .../dependency/DependencyResolver.java | 15 ++++ 13 files changed, 73 insertions(+), 266 deletions(-) diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java index 755c15ea667..04572ac50dc 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java @@ -34,6 +34,7 @@ import java.util.stream.Collectors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.antithesis.sdk.Assert; /** * Handles configuration updates if required by installing a new ClassFileTransformer and triggering @@ -95,6 +96,8 @@ public void accept(Source source, Collection definiti applyNewConfiguration(newConfiguration); } catch (RuntimeException e) { ExceptionHelper.logException(LOGGER, e, "Error during accepting new debugger configuration:"); + LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.accept should sometimes throw a runtime exception (sometimes)"); + Assert.sometimes("ConfigurationUpdater.accept should sometimes throw a runtime exception"); throw e; } } @@ -143,9 +146,15 @@ private void applyNewConfiguration(Configuration newConfiguration) { currentConfiguration = newConfiguration; if (changes.hasProbeRelatedChanges()) { LOGGER.debug("Applying new probe configuration, changes: 
{}", changes); + LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.handleProbesChanges should sometimes be called (sometimes)"); + Assert.sometimes("ConfigurationUpdater.handleProbesChanges should sometimes be called"); handleProbesChanges(changes, newConfiguration); } + LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.applyNewConfiguration should always be successful (always)"); + Assert.always("ConfigurationUpdater.applyNewConfiguration should always be successful"); } finally { + LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.applyNewConfiguration should always be reachable (reachable)"); + Assert.reachable("ConfigurationUpdater.applyNewConfiguration should always be reachable"); configurationLock.unlock(); } } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java index b8e775d4fb1..d33d0e10438 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java @@ -50,6 +50,7 @@ import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.antithesis.sdk.Assert; /** * This is the implementation of the controller for OpenJDK. It should work for JDK 11+ today, and @@ -289,6 +290,8 @@ private static String getJfrRepositoryBase(ConfigProvider configProvider) { Files.createDirectories(repositoryPath); } catch (IOException e) { log.error("Failed to create JFR repository directory: {}", repositoryPath, e); + log.debug("ANTITHESIS_ASSERT: Failed to create JFR repository directory (unreachable)"); + Assert.unreachable("Failed to create JFR repository directory"); throw new IllegalStateException( "Failed to create JFR repository directory: " + repositoryPath, e); } diff --git a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java index 7f57b356d99..46f29c9c9d7 100644 --- a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java +++ b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java @@ -35,6 +35,7 @@ import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.antithesis.sdk.Assert; /** Sets up the profiling strategy and schedules the profiling recordings. 
*/ public final class ProfilingSystem { @@ -196,9 +197,12 @@ private void startProfilingRecording() { if (t != null) { if (t instanceof IllegalStateException && "Shutdown in progress".equals(t.getMessage())) { ProfilerFlareLogger.getInstance().log("Shutdown in progress, cannot start profiling"); + log.debug("ANTITHESIS_ASSERT: Shutdown in progress, cannot start profiling (sometimes)"); + Assert.sometimes("Shutdown in progress, cannot start profiling"); } else { ProfilerFlareLogger.getInstance().log("Failed to start profiling", t); - + log.debug("ANTITHESIS_ASSERT: Failed to start profiling (unreachable)", t); + Assert.unreachable("Failed to start profiling"); throw t instanceof RuntimeException ? (RuntimeException) t : new RuntimeException(t); } } @@ -275,6 +279,8 @@ public void snapshot(boolean onShutdown) { // the last recording end time plus one nano second. The reason for this is that when // JFR is filtering the stream it will only discard earlier chunks that have an end // time that is before (not before or equal to) the requested start time of the filter. + log.debug("ANTITHESIS_ASSERT: Snapshot created (always) - lastSnapshot != null: {}", (lastSnapshot != null)); + Assert.always(lastSnapshot != null, "Snapshot created"); lastSnapshot = recordingData.getEnd().plus(ONE_NANO); dataListener.onNewData(recordingType, recordingData, onShutdown); } else { @@ -282,6 +288,8 @@ public void snapshot(boolean onShutdown) { } } catch (final Exception e) { log.error(SEND_TELEMETRY, "Exception in profiling thread, continuing", e); + log.debug("ANTITHESIS_ASSERT: Exception in profiling thread, continuing (unreachable)", e); + Assert.unreachable("Exception in profiling thread, continuing"); } catch (final Throwable t) { /* Try to continue even after fatal exception. It seems to be useful to attempt to store profile when this happens. @@ -294,6 +302,8 @@ public void snapshot(boolean onShutdown) { } catch (final Throwable t2) { // This should almost never happen and there is not much we can do here in cases like // OutOfMemoryError, so we will just ignore this. 
+ log.debug("ANTITHESIS_ASSERT: Fatal exception in profiling thread, trying to continue (unreachable)"); + Assert.unreachable("Fatal exception in profiling thread, trying to continue"); } } } diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java index 0c889108d2d..45d693fee48 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java +++ b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java @@ -56,7 +56,7 @@ import javax.annotation.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - +import com.antithesis.sdk.Assert; /** * It is currently assumed that this class can be initialised early so that Datadog profiler's * thread filter captures all tracing activity, which means it must not be modified to depend on @@ -189,6 +189,8 @@ public OngoingRecording start() { return new DatadogProfilerRecording(this); } catch (IOException | IllegalStateException e) { log.debug("Failed to start Datadog profiler recording", e); + log.debug("ANTITHESIS_ASSERT: Failed to start Datadog profiler recording (unreachable)"); + Assert.unreachable("Failed to start Datadog profiler recording"); return null; } } @@ -203,12 +205,16 @@ public RecordingData stop(OngoingRecording recording) { void stopProfiler() { if (recordingFlag.compareAndSet(true, false)) { profiler.stop(); + log.debug("ANTITHESIS_ASSERT: Checking if profiling is still active after stop (sometimes) - active: {}", isActive()); + Assert.sometimes(isActive(),"Profiling is still active. Waiting to stop."); if (isActive()) { log.debug("Profiling is still active. 
Waiting to stop."); while (isActive()) { LockSupport.parkNanos(10_000_000L); } } + log.debug("ANTITHESIS_ASSERT: Profiling should be stopped (always) - active: {}", isActive()); + Assert.always(!isActive(),"Profiling is stopped"); } } @@ -222,6 +228,8 @@ public boolean isActive() { log.debug("Datadog Profiler Status = {}", status); return !status.contains("not active"); } catch (IOException ignored) { + log.debug("ANTITHESIS_ASSERT: Failed to get Datadog profiler status (unreachable)"); + Assert.unreachable("Failed to get Datadog profiler status"); } return false; } @@ -244,10 +252,14 @@ Path newRecording() throws IOException, IllegalStateException { log.warn("Unable to start Datadog profiler recording: {}", e.getMessage()); } recordingFlag.set(false); + log.debug("ANTITHESIS_ASSERT: Unable to start Datadog profiler recording (unreachable)"); + Assert.unreachable("Unable to start Datadog profiler recording"); throw e; } return recFile; } + log.debug("ANTITHESIS_ASSERT: Datadog profiler session has already been started (unreachable)"); + Assert.unreachable("Datadog profiler session has already been started"); throw new IllegalStateException("Datadog profiler session has already been started"); } diff --git a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java index ab588da6e1a..26dc07e0581 100644 --- a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java +++ b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java @@ -69,6 +69,7 @@ import okhttp3.ResponseBody; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.antithesis.sdk.Assert; /** The class for uploading profiles to the backend. */ public final class ProfileUploader { @@ -301,6 +302,8 @@ public void onFailure(final Call call, final IOException e) { // But, in any case, we have this safety-break in place to prevent blocking finishing the // sync request to a misbehaving server. if (handled.compareAndSet(false, true)) { + log.debug("ANTITHESIS_ASSERT: Upload timeout (unreachable)"); + Assert.unreachable("Upload timeout"); handleFailure(call, null, data, onCompletion); } } @@ -351,6 +354,8 @@ private void handleResponse( "Failed to upload profile, it's too big. 
Dumping information about the profile"); JfrCliHelper.invokeOn(data, ioLogger); } else { + log.debug("ANTITHESIS_ASSERT: Failed to upload profile (unreachable) - response code: {}", response.code()); + Assert.unreachable("Failed to upload profile"); ioLogger.error("Failed to upload profile", getLoggerResponse(response)); } } diff --git a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java index c73b618edb8..19a1c26323f 100644 --- a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java +++ b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java @@ -37,6 +37,7 @@ import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.antithesis.sdk.Assert; /** Profiling agent implementation */ public class ProfilingAgent { @@ -81,6 +82,8 @@ public void onNewData(RecordingType type, RecordingData data, boolean handleSync log.debug("Debug profile stored as {}", tmp); } catch (IOException e) { log.debug("Unable to write debug profile dump", e); + log.debug("ANTITHESIS_ASSERT: Unable to write debug profile dump (unreachable)"); + Assert.unreachable("Unable to write debug profile dump"); } } } @@ -169,11 +172,15 @@ public static synchronized boolean run(final boolean earlyStart, Instrumentation This means that if/when we implement functionality to manually shutdown profiler we would need to not forget to add code that removes this shutdown hook from JVM. */ + log.debug("ANTITHESIS_ASSERT: Shutdown hook added (always) - uploader != null: {}", (uploader != null)); + Assert.always(uploader!= null, "Shutdown hook added"); Runtime.getRuntime().addShutdownHook(new ShutdownHook(profiler, uploader)); } catch (final IllegalStateException ex) { // The JVM is already shutting down. 
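// The shutdown hook could not be registered, so the final profile upload may be skipped.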
} } catch (final UnsupportedEnvironmentException | ConfigurationException e) { + log.debug("ANTITHESIS_ASSERT: Failed to initialize profiling agent (unreachable)", e); + Assert.unreachable("Failed to initialize profiling agent!"); ProfilerFlareLogger.getInstance().log("Failed to initialize profiling agent!", e); ProfilerFlareReporter.reportInitializationException(e); } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java index a1edf9c236b..542d73f7cf0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java @@ -1,8 +1,5 @@ package datadog.trace.common.writer; -import com.antithesis.sdk.Assert; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.communication.monitor.Monitoring; import datadog.communication.monitor.Recording; import datadog.communication.serialization.ByteBufferConsumer; @@ -60,17 +57,6 @@ public Collection getApis() { @Override public void onDroppedTrace(int spanCount) { - // Antithesis: Assert that traces should not be dropped before sending - ObjectNode dropDetails = JsonNodeFactory.instance.objectNode(); - dropDetails.put("span_count", spanCount); - dropDetails.put("total_dropped_traces", droppedTraceCount.sum() + 1); - dropDetails.put("total_dropped_spans", droppedSpanCount.sum() + spanCount); - - log.debug("ANTITHESIS_ASSERT: Traces dropped before sending (unreachable) - span_count: {}, total_dropped: {}", spanCount, droppedTraceCount.sum() + 1); - Assert.unreachable( - "Traces should not be dropped before attempting to send - indicates buffer overflow or backpressure", - dropDetails); - droppedSpanCount.add(spanCount); droppedTraceCount.increment(); } @@ -117,15 +103,6 @@ public void accept(int messageCount, ByteBuffer buffer) { // the packer calls this when the buffer is full, // or when the packer is flushed at a heartbeat if (messageCount > 0) { - // Antithesis: Verify that we're attempting to send traces - log.debug("ANTITHESIS_ASSERT: Trace sending code path exercised (reachable) - message_count: {}", messageCount); - Assert.reachable("Trace sending code path is exercised", null); - log.debug("ANTITHESIS_ASSERT: Checking if traces are being sent to API (sometimes) - message_count: {}", messageCount); - Assert.sometimes( - messageCount > 0, - "Traces are being sent to the API", - null); - batchTimer.reset(); Payload payload = newPayload(messageCount, buffer); final int sizeInBytes = payload.sizeInBytes(); @@ -133,44 +110,12 @@ public void accept(int messageCount, ByteBuffer buffer) { RemoteApi.Response response = api.sendSerializedTraces(payload); mapper.reset(); - // Antithesis: Assert that trace sending should always succeed - ObjectNode sendDetails = JsonNodeFactory.instance.objectNode(); - sendDetails.put("trace_count", messageCount); - sendDetails.put("payload_size_bytes", sizeInBytes); - sendDetails.put("success", response.success()); - response.exception().ifPresent(ex -> { - sendDetails.put("exception", ex.getClass().getName()); - sendDetails.put("exception_message", ex.getMessage()); - }); - response.status().ifPresent(status -> sendDetails.put("http_status", status)); - - log.debug("ANTITHESIS_ASSERT: Checking trace sending success (always) - success: {}, trace_count: {}", response.success(), messageCount); - Assert.always( 
- response.success(), - "Trace sending to API should always succeed - no traces should be lost", - sendDetails); - if (response.success()) { if (log.isDebugEnabled()) { log.debug("Successfully sent {} traces to the API", messageCount); } healthMetrics.onSend(messageCount, sizeInBytes, response); } else { - // Antithesis: This code path should be unreachable if traces are never lost - ObjectNode failureDetails = JsonNodeFactory.instance.objectNode(); - failureDetails.put("trace_count", messageCount); - failureDetails.put("payload_size_bytes", sizeInBytes); - response.exception().ifPresent(ex -> { - failureDetails.put("exception", ex.getClass().getName()); - failureDetails.put("exception_message", ex.getMessage()); - }); - response.status().ifPresent(status -> failureDetails.put("http_status", status)); - - log.debug("ANTITHESIS_ASSERT: Trace sending failed (unreachable) - trace_count: {}, size: {} bytes", messageCount, sizeInBytes); - Assert.unreachable( - "Trace sending failure path should never be reached - indicates traces are being lost", - failureDetails); - if (log.isDebugEnabled()) { log.debug( "Failed to send {} traces of size {} bytes to the API", messageCount, sizeInBytes); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java index fb7584fe48d..90008cad0a0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java @@ -3,9 +3,6 @@ import static datadog.trace.api.sampling.PrioritySampling.UNSET; import static java.util.concurrent.TimeUnit.MINUTES; -import com.antithesis.sdk.Assert; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.trace.core.DDSpan; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.relocate.api.RatelimitedLogger; @@ -70,33 +67,9 @@ protected RemoteWriter( @Override public void write(final List trace) { - // Antithesis: Assert that we should never attempt to write when writer is closed - ObjectNode writeAttemptDetails = JsonNodeFactory.instance.objectNode(); - writeAttemptDetails.put("writer_closed", closed); - writeAttemptDetails.put("trace_size", trace.size()); - writeAttemptDetails.put("has_traces", !trace.isEmpty()); - - log.debug("ANTITHESIS_ASSERT: Checking writer not closed when writing (always) - closed: {}, trace_size: {}", closed, trace.size()); - Assert.always( - !closed, - "Writer should never be closed when attempting to write traces", - writeAttemptDetails); - if (closed) { // We can't add events after shutdown otherwise it will never complete shutting down. 
log.debug("Dropped due to shutdown: {}", trace); - - // Antithesis: Track when traces are dropped due to writer being closed - ObjectNode shutdownDetails = JsonNodeFactory.instance.objectNode(); - shutdownDetails.put("trace_size", trace.size()); - shutdownDetails.put("reason", "writer_closed_during_shutdown"); - - log.debug("ANTITHESIS_ASSERT: Traces dropped due to shutdown (sometimes) - closed: {}, trace_size: {}", closed, trace.size()); - Assert.sometimes( - closed && !trace.isEmpty(), - "Traces are dropped due to writer shutdown - tracking shutdown behavior", - shutdownDetails); - handleDroppedTrace(trace); } else { if (trace.isEmpty()) { @@ -118,18 +91,6 @@ public void write(final List trace) { handleDroppedTrace(trace); break; case DROPPED_BUFFER_OVERFLOW: - // Antithesis: Buffer overflow should NEVER happen - this indicates a serious problem - ObjectNode overflowDetails = JsonNodeFactory.instance.objectNode(); - overflowDetails.put("trace_size", trace.size()); - overflowDetails.put("sampling_priority", samplingPriority); - overflowDetails.put("buffer_capacity", traceProcessingWorker.getCapacity()); - overflowDetails.put("reason", "buffer_overflow_backpressure"); - - log.debug("ANTITHESIS_ASSERT: Buffer overflow occurred (unreachable) - trace_size: {}, capacity: {}", trace.size(), traceProcessingWorker.getCapacity()); - Assert.unreachable( - "Buffer overflow should never occur - traces are being dropped due to backpressure", - overflowDetails); - if (log.isDebugEnabled()) { log.debug("Dropped due to a buffer overflow: {}", trace); } else { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java index 3cf099b59d3..645bbc4b9e9 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/DDAgentApi.java @@ -2,9 +2,6 @@ import static datadog.communication.http.OkHttpUtils.prepareRequest; -import com.antithesis.sdk.Assert; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; import com.squareup.moshi.JsonAdapter; import com.squareup.moshi.Moshi; import com.squareup.moshi.Types; @@ -92,33 +89,11 @@ public void addResponseListener(final RemoteResponseListener listener) { public Response sendSerializedTraces(final Payload payload) { final int sizeInBytes = payload.sizeInBytes(); - - // Antithesis: Track that agent API send is being exercised - log.debug("ANTITHESIS_ASSERT: Verifying DDAgentApi trace sending is exercised (reachable) with {} traces", payload.traceCount()); - Assert.reachable("DDAgentApi trace sending is exercised", null); - log.debug("ANTITHESIS_ASSERT: Checking if traces are being sent through DDAgentApi (sometimes) - count: {}", payload.traceCount()); - Assert.sometimes( - payload.traceCount() > 0, - "Traces are being sent through DDAgentApi", - null); - String tracesEndpoint = featuresDiscovery.getTraceEndpoint(); if (null == tracesEndpoint) { featuresDiscovery.discoverIfOutdated(); tracesEndpoint = featuresDiscovery.getTraceEndpoint(); if (null == tracesEndpoint) { - // Antithesis: Agent should always be detectable - ObjectNode agentDetectionDetails = JsonNodeFactory.instance.objectNode(); - agentDetectionDetails.put("trace_count", payload.traceCount()); - agentDetectionDetails.put("payload_size_bytes", sizeInBytes); - agentDetectionDetails.put("agent_url", agentUrl.toString()); - 
agentDetectionDetails.put("failure_reason", "agent_not_detected"); - - log.debug("ANTITHESIS_ASSERT: Agent not detected (unreachable) - url: {}, traces: {}", agentUrl, payload.traceCount()); - Assert.unreachable( - "Datadog agent should always be detected - agent communication failure", - agentDetectionDetails); - log.error("No datadog agent detected"); countAndLogFailedSend(payload.traceCount(), sizeInBytes, null, null); return Response.failed(404); @@ -147,36 +122,7 @@ public Response sendSerializedTraces(final Payload payload) { try (final Recording recording = sendPayloadTimer.start(); final okhttp3.Response response = httpClient.newCall(request).execute()) { handleAgentChange(response.header(DATADOG_AGENT_STATE)); - - // Antithesis: Track HTTP response status and assert success - ObjectNode httpResponseDetails = JsonNodeFactory.instance.objectNode(); - httpResponseDetails.put("trace_count", payload.traceCount()); - httpResponseDetails.put("payload_size_bytes", sizeInBytes); - httpResponseDetails.put("http_status", response.code()); - httpResponseDetails.put("http_message", response.message()); - httpResponseDetails.put("success", response.code() == 200); - httpResponseDetails.put("agent_url", tracesUrl.toString()); - - log.debug("ANTITHESIS_ASSERT: Checking HTTP response status (always) - code: {}, traces: {}", response.code(), payload.traceCount()); - Assert.always( - response.code() == 200, - "HTTP response from Datadog agent should always be 200 - API communication failure", - httpResponseDetails); - if (response.code() != 200) { - // Antithesis: Mark non-200 path as unreachable - ObjectNode errorDetails = JsonNodeFactory.instance.objectNode(); - errorDetails.put("trace_count", payload.traceCount()); - errorDetails.put("payload_size_bytes", sizeInBytes); - errorDetails.put("http_status", response.code()); - errorDetails.put("http_message", response.message()); - errorDetails.put("failure_reason", "http_error_response"); - - log.debug("ANTITHESIS_ASSERT: Non-200 HTTP response (unreachable) - code: {}, message: {}, traces: {}", response.code(), response.message(), payload.traceCount()); - Assert.unreachable( - "Non-200 HTTP response from agent indicates API failure - traces may be lost", - errorDetails); - agentErrorCounter.incrementErrorCount(response.message(), payload.traceCount()); countAndLogFailedSend(payload.traceCount(), sizeInBytes, response, null); return Response.failed(response.code()); @@ -200,20 +146,6 @@ public Response sendSerializedTraces(final Payload payload) { } } } catch (final IOException e) { - // Antithesis: Network failures should not occur - ObjectNode networkErrorDetails = JsonNodeFactory.instance.objectNode(); - networkErrorDetails.put("trace_count", payload.traceCount()); - networkErrorDetails.put("payload_size_bytes", sizeInBytes); - networkErrorDetails.put("exception_type", e.getClass().getName()); - networkErrorDetails.put("exception_message", e.getMessage()); - networkErrorDetails.put("agent_url", agentUrl.toString()); - networkErrorDetails.put("failure_reason", "network_io_exception"); - - log.debug("ANTITHESIS_ASSERT: Network/IO exception (unreachable) - type: {}, message: {}, traces: {}", e.getClass().getName(), e.getMessage(), payload.traceCount()); - Assert.unreachable( - "Network/IO exceptions should not occur when sending to agent - indicates connectivity issues", - networkErrorDetails); - countAndLogFailedSend(payload.traceCount(), sizeInBytes, null, e); return Response.failed(e); } diff --git 
a/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java b/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java index 43863d1699b..6e21bb0c458 100644 --- a/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java +++ b/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java @@ -51,6 +51,7 @@ import okio.ByteString; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.antithesis.sdk.Assert; /** Handles polling debugger configuration from datadog agent/Remote Configuration */ public class DefaultConfigurationPoller @@ -281,8 +282,10 @@ private boolean initialize() { new PollerRequestFactory(config, tracerVersion, containerId, entityId, url, moshi); } catch (Exception e) { // We can't recover from this, so we'll not try to initialize again. - fatalOnInitialization = true; log.error("Remote configuration poller initialization failed", e); + log.debug("ANTITHESIS_ASSERT: Remote configuration poller initialization failed (unreachable)", e); + Assert.unreachable("Remote configuration poller initialization failed"); + fatalOnInitialization = true; } return true; } @@ -379,6 +382,8 @@ private void handleAgentResponse(ResponseBody body) { } catch (Exception e) { // no error can be reported, as we don't have the data client.state.targets_version avail ratelimitedLogger.warn("Error parsing remote config response", e); + log.debug("ANTITHESIS_ASSERT: Error parsing remote config response (unreachable)", e); + Assert.unreachable("Error parsing remote config response"); return; } @@ -446,6 +451,8 @@ private void runConfigurationEndListener( ConfigurationEndListener listener, List errors) { try { listener.onConfigurationEnd(); + log.debug("ANTITHESIS_ASSERT: Configuration end listener should always be reachable (reachable)"); + Assert.reachable("Configuration end listener should always be reachable"); } catch (ReportableException re) { errors.add(re); } catch (RuntimeException rte) { @@ -454,6 +461,8 @@ private void runConfigurationEndListener( // is about combining configuration from different products ratelimitedLogger.warn( "Error running configuration listener {}: {}", listener, rte.getMessage(), rte); + log.debug("ANTITHESIS_ASSERT: Error running configuration listener (unreachable)", rte); + Assert.unreachable("Error running configuration listener"); } } diff --git a/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java b/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java index 34dd8f44d79..c0f7f26bfe0 100644 --- a/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java +++ b/telemetry/src/main/java/datadog/telemetry/TelemetryClient.java @@ -1,8 +1,5 @@ package datadog.telemetry; -import com.antithesis.sdk.Assert; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.communication.http.HttpRetryPolicy; import datadog.communication.http.OkHttpUtils; import datadog.trace.api.Config; @@ -97,53 +94,16 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { Request httpRequest = httpRequestBuilder.build(); String requestType = httpRequest.header(DD_TELEMETRY_REQUEST_TYPE); - - // Antithesis: Track telemetry sending attempts - log.debug("ANTITHESIS_ASSERT: Telemetry sending exercised (reachable) - request_type: {}", requestType); - Assert.reachable("Telemetry sending is exercised", null); try 
(okhttp3.Response response = OkHttpUtils.sendWithRetries(okHttpClient, httpRetryPolicy, httpRequest)) { - - // Antithesis: Assert that all telemetry requests should succeed - ObjectNode telemetryResponseDetails = JsonNodeFactory.instance.objectNode(); - telemetryResponseDetails.put("request_type", requestType != null ? requestType : "unknown"); - telemetryResponseDetails.put("http_status", response.code()); - telemetryResponseDetails.put("http_message", response.message()); - telemetryResponseDetails.put("url", url.toString()); - telemetryResponseDetails.put("success", response.isSuccessful()); - if (response.code() == 404) { - // Antithesis: Track 404 - endpoint disabled scenario - ObjectNode notFoundDetails = JsonNodeFactory.instance.objectNode(); - notFoundDetails.put("request_type", requestType != null ? requestType : "unknown"); - notFoundDetails.put("url", url.toString()); - notFoundDetails.put("reason", "endpoint_disabled_404"); - - log.debug("ANTITHESIS_ASSERT: Telemetry endpoint 404 (sometimes) - request_type: {}, url: {}", requestType, url); - Assert.sometimes( - response.code() == 404, - "Telemetry endpoint returns 404 - endpoint may be disabled", - notFoundDetails); log.debug("Telemetry endpoint is disabled, dropping {} message.", requestType); return Result.NOT_FOUND; } if (!response.isSuccessful()) { - // Antithesis: Telemetry should not fail - data should be retried/buffered - ObjectNode failureDetails = JsonNodeFactory.instance.objectNode(); - failureDetails.put("request_type", requestType != null ? requestType : "unknown"); - failureDetails.put("http_status", response.code()); - failureDetails.put("http_message", response.message()); - failureDetails.put("url", url.toString()); - failureDetails.put("reason", "http_error_response"); - - log.debug("ANTITHESIS_ASSERT: Telemetry HTTP request failed (unreachable) - request_type: {}, status: {}", requestType, response.code()); - Assert.unreachable( - "Telemetry HTTP request failed - telemetry data should not be dropped, should retry", - failureDetails); - log.debug( "Telemetry message {} failed with: {} {}.", requestType, @@ -151,32 +111,12 @@ public Result sendHttpRequest(Request.Builder httpRequestBuilder) { response.message()); return Result.FAILURE; } - - // Antithesis: Assert success - log.debug("ANTITHESIS_ASSERT: Checking telemetry request success (always) - successful: {}, request_type: {}", response.isSuccessful(), requestType); - Assert.always( - response.isSuccessful(), - "Telemetry requests should always succeed - no telemetry data should be lost", - telemetryResponseDetails); } catch (InterruptedIOException e) { log.debug("Telemetry message {} sending interrupted: {}.", requestType, e.toString()); return Result.INTERRUPTED; } catch (IOException e) { - // Antithesis: Network failures should not cause telemetry loss - ObjectNode ioErrorDetails = JsonNodeFactory.instance.objectNode(); - ioErrorDetails.put("request_type", requestType != null ? 
requestType : "unknown"); - ioErrorDetails.put("exception_type", e.getClass().getName()); - ioErrorDetails.put("exception_message", e.getMessage()); - ioErrorDetails.put("url", url.toString()); - ioErrorDetails.put("reason", "network_io_exception"); - - log.debug("ANTITHESIS_ASSERT: Telemetry network/IO exception (unreachable) - request_type: {}, exception: {}", requestType, e.getClass().getName()); - Assert.unreachable( - "Telemetry network/IO failure - telemetry data should not be dropped, should retry", - ioErrorDetails); - log.debug("Telemetry message {} failed with exception: {}.", requestType, e.toString()); return Result.FAILURE; } diff --git a/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java b/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java index 0d4d7f86f52..801d6743ae7 100644 --- a/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java +++ b/telemetry/src/main/java/datadog/telemetry/TelemetryRouter.java @@ -1,8 +1,5 @@ package datadog.telemetry; -import com.antithesis.sdk.Assert; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import javax.annotation.Nullable; import okhttp3.HttpUrl; @@ -45,34 +42,8 @@ public TelemetryClient.Result sendRequest(TelemetryRequest request) { // we do not want to log errors and reattempt in this case && result != TelemetryClient.Result.INTERRUPTED; - // Antithesis: Track telemetry routing and failover behavior - ObjectNode routingDetails = JsonNodeFactory.instance.objectNode(); - routingDetails.put("result", result.toString()); - routingDetails.put("current_client", currentClient == agentClient ? "agent" : "intake"); - routingDetails.put("request_failed", requestFailed); - routingDetails.put("has_fallback", intakeClient != null); - routingDetails.put("url", currentClient.getUrl().toString()); - - log.debug("ANTITHESIS_ASSERT: Checking telemetry routing success (always) - result: {}, client: {}", result, currentClient == agentClient ? "agent" : "intake"); - Assert.always( - result == TelemetryClient.Result.SUCCESS || result == TelemetryClient.Result.INTERRUPTED, - "Telemetry routing should always succeed - failures indicate data loss without retry mechanism", - routingDetails); - if (currentClient == agentClient) { if (requestFailed) { - // Antithesis: Track agent telemetry failures - ObjectNode agentFailureDetails = JsonNodeFactory.instance.objectNode(); - agentFailureDetails.put("result", result.toString()); - agentFailureDetails.put("url", currentClient.getUrl().toString()); - agentFailureDetails.put("has_intake_fallback", intakeClient != null); - agentFailureDetails.put("reason", "agent_telemetry_failure"); - - log.debug("ANTITHESIS_ASSERT: Agent telemetry endpoint failed (unreachable) - result: {}, has_fallback: {}", result, intakeClient != null); - Assert.unreachable( - "Agent telemetry endpoint failed - switching to intake but current request data is lost", - agentFailureDetails); - reportErrorOnce(currentClient.getUrl(), result); if (intakeClient != null) { log.info("Agent Telemetry endpoint failed. 
Telemetry will be sent to Intake."); @@ -82,18 +53,6 @@ public TelemetryClient.Result sendRequest(TelemetryRequest request) { } } else { if (requestFailed) { - // Antithesis: Track intake telemetry failures - ObjectNode intakeFailureDetails = JsonNodeFactory.instance.objectNode(); - intakeFailureDetails.put("result", result.toString()); - intakeFailureDetails.put("url", currentClient.getUrl().toString()); - intakeFailureDetails.put("will_fallback_to_agent", true); - intakeFailureDetails.put("reason", "intake_telemetry_failure"); - - log.debug("ANTITHESIS_ASSERT: Intake telemetry endpoint failed (unreachable) - result: {}, will_fallback: true", result); - Assert.unreachable( - "Intake telemetry endpoint failed - switching to agent but current request data is lost", - intakeFailureDetails); - reportErrorOnce(currentClient.getUrl(), result); } if ((agentSupportsTelemetryProxy && !useIntakeClientByDefault) || requestFailed) { diff --git a/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java b/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java index 91ec8c52162..1abd3d18067 100644 --- a/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java +++ b/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java @@ -1,5 +1,8 @@ package datadog.telemetry.dependency; +import com.antithesis.sdk.Assert; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -41,6 +44,18 @@ static List internalResolve(final URI uri) throws IOException { return Collections.emptyList(); } if (metadata.isDirectory) { + // Antithesis: Track when dependencies are extracted from directories + ObjectNode directoryDetails = JsonNodeFactory.instance.objectNode(); + directoryDetails.put("uri", uri.toString()); + directoryDetails.put("scheme", scheme); + directoryDetails.put("is_directory", true); + + log.debug("ANTITHESIS_ASSERT: Directory dependency extraction attempted (sometimes) - uri: {}", uri); + Assert.sometimes( + metadata.isDirectory, + "Directory dependencies are encountered - tracking unsupported dependency type", + directoryDetails); + log.debug("Extracting dependencies from directories is not supported: {}", uri); return Collections.emptyList(); } From a56fe564bbd53d2fcc9fae52034ed54cb72aaabd Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Wed, 26 Nov 2025 17:46:47 +0100 Subject: [PATCH 08/15] fix compilation --- dd-java-agent/agent-debugger/build.gradle | 3 +++ dd-java-agent/agent-profiling/build.gradle | 3 +++ .../agent-profiling/profiling-controller-openjdk/build.gradle | 3 +++ .../agent-profiling/profiling-controller/build.gradle | 3 +++ dd-java-agent/agent-profiling/profiling-ddprof/build.gradle | 3 +++ dd-java-agent/agent-profiling/profiling-uploader/build.gradle | 3 +++ remote-config/remote-config-core/build.gradle.kts | 3 +++ 7 files changed, 21 insertions(+) diff --git a/dd-java-agent/agent-debugger/build.gradle b/dd-java-agent/agent-debugger/build.gradle index 0ef3992644f..4de4d80151c 100644 --- a/dd-java-agent/agent-debugger/build.gradle +++ b/dd-java-agent/agent-debugger/build.gradle @@ -44,6 +44,9 @@ dependencies { implementation libs.dogstatsd implementation libs.moshi + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation group: 'com.antithesis', name: 'sdk', version: '1.4.5' + testImplementation libs.asm.util testImplementation 
libs.bundles.junit5 testImplementation libs.junit.jupiter.params diff --git a/dd-java-agent/agent-profiling/build.gradle b/dd-java-agent/agent-profiling/build.gradle index a53ac40d8fe..f46d991de55 100644 --- a/dd-java-agent/agent-profiling/build.gradle +++ b/dd-java-agent/agent-profiling/build.gradle @@ -20,6 +20,9 @@ dependencies { api libs.slf4j api project(':internal-api') + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation group: 'com.antithesis', name: 'sdk', version: '1.4.5' + api project(':dd-java-agent:agent-profiling:profiling-ddprof') api project(':dd-java-agent:agent-profiling:profiling-uploader') api project(':dd-java-agent:agent-profiling:profiling-controller') diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/build.gradle b/dd-java-agent/agent-profiling/profiling-controller-openjdk/build.gradle index 61c9e9a57bf..1d797db2069 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/build.gradle +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/build.gradle @@ -26,6 +26,9 @@ dependencies { api project(':dd-java-agent:agent-profiling:profiling-controller') api project(':dd-java-agent:agent-profiling:profiling-controller-jfr') + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation group: 'com.antithesis', name: 'sdk', version: '1.4.5' + testImplementation libs.bundles.junit5 testImplementation libs.bundles.mockito testImplementation files(project(':dd-java-agent:agent-profiling:profiling-controller-jfr').sourceSets.test.output) diff --git a/dd-java-agent/agent-profiling/profiling-controller/build.gradle b/dd-java-agent/agent-profiling/profiling-controller/build.gradle index e255fdf668d..c5820188739 100644 --- a/dd-java-agent/agent-profiling/profiling-controller/build.gradle +++ b/dd-java-agent/agent-profiling/profiling-controller/build.gradle @@ -22,6 +22,9 @@ dependencies { api project(':components:environment') api project(':dd-java-agent:agent-profiling:profiling-utils') + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation group: 'com.antithesis', name: 'sdk', version: '1.4.5' + testImplementation libs.bundles.junit5 testImplementation libs.guava testImplementation libs.bundles.mockito diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/build.gradle b/dd-java-agent/agent-profiling/profiling-ddprof/build.gradle index 2664a8945ac..8350eda97f4 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/build.gradle +++ b/dd-java-agent/agent-profiling/profiling-ddprof/build.gradle @@ -36,6 +36,9 @@ dependencies { implementation libs.slf4j + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation group: 'com.antithesis', name: 'sdk', version: '1.4.5' + testImplementation libs.bundles.jmc testImplementation libs.bundles.junit5 } diff --git a/dd-java-agent/agent-profiling/profiling-uploader/build.gradle b/dd-java-agent/agent-profiling/profiling-uploader/build.gradle index f9a03e3a917..3672a3d5d1c 100644 --- a/dd-java-agent/agent-profiling/profiling-uploader/build.gradle +++ b/dd-java-agent/agent-profiling/profiling-uploader/build.gradle @@ -33,6 +33,9 @@ dependencies { implementation libs.lz4 implementation libs.aircompressor + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation group: 'com.antithesis', name: 'sdk', version: '1.4.5' + testImplementation project(':dd-java-agent:agent-profiling:profiling-testing') 
testImplementation project(':utils:test-utils') testImplementation libs.bundles.junit5 diff --git a/remote-config/remote-config-core/build.gradle.kts b/remote-config/remote-config-core/build.gradle.kts index f3d0200b797..3c90599ee00 100644 --- a/remote-config/remote-config-core/build.gradle.kts +++ b/remote-config/remote-config-core/build.gradle.kts @@ -37,6 +37,9 @@ dependencies { implementation(libs.moshi) implementation(libs.bundles.cafe.crypto) + // Antithesis SDK for assertions and property testing - bundled in tracer JAR + implementation(group = "com.antithesis", name = "sdk", version = "1.4.5") + implementation(project(":internal-api")) testImplementation(project(":utils:test-utils")) From 35bd8be91aafc3fa544082f0e5c84184c96b5881 Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Wed, 26 Nov 2025 18:01:27 +0100 Subject: [PATCH 09/15] fix --- .../datadog/debugger/agent/ConfigurationUpdater.java | 8 ++++---- .../controller/openjdk/OpenJdkController.java | 2 +- .../profiling/controller/ProfilingSystem.java | 10 +++++----- .../datadog/profiling/ddprof/DatadogProfiler.java | 12 ++++++------ .../datadog/profiling/uploader/ProfileUploader.java | 4 ++-- .../com/datadog/profiling/agent/ProfilingAgent.java | 6 +++--- .../remoteconfig/DefaultConfigurationPoller.java | 8 ++++---- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java index 04572ac50dc..ab37c5c187c 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java @@ -97,7 +97,7 @@ public void accept(Source source, Collection definiti } catch (RuntimeException e) { ExceptionHelper.logException(LOGGER, e, "Error during accepting new debugger configuration:"); LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.accept should sometimes throw a runtime exception (sometimes)"); - Assert.sometimes("ConfigurationUpdater.accept should sometimes throw a runtime exception"); + Assert.sometimes(true, "ConfigurationUpdater.accept should sometimes throw a runtime exception", null); throw e; } } @@ -147,14 +147,14 @@ private void applyNewConfiguration(Configuration newConfiguration) { if (changes.hasProbeRelatedChanges()) { LOGGER.debug("Applying new probe configuration, changes: {}", changes); LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.handleProbesChanges should sometimes be called (sometimes)"); - Assert.sometimes("ConfigurationUpdater.handleProbesChanges should sometimes be called"); + Assert.sometimes(true, "ConfigurationUpdater.handleProbesChanges should sometimes be called", null); handleProbesChanges(changes, newConfiguration); } LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.applyNewConfiguration should always be successful (always)"); - Assert.always("ConfigurationUpdater.applyNewConfiguration should always be successful"); + Assert.always(true, "ConfigurationUpdater.applyNewConfiguration should always be successful", null); } finally { LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.applyNewConfiguration should always be reachable (reachable)"); - Assert.reachable("ConfigurationUpdater.applyNewConfiguration should always be reachable"); + Assert.reachable("ConfigurationUpdater.applyNewConfiguration should always be reachable", null); configurationLock.unlock(); } } 
diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java index d33d0e10438..2fe9d6d0567 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java @@ -291,7 +291,7 @@ private static String getJfrRepositoryBase(ConfigProvider configProvider) { } catch (IOException e) { log.error("Failed to create JFR repository directory: {}", repositoryPath, e); log.debug("ANTITHESIS_ASSERT: Failed to create JFR repository directory (unreachable)"); - Assert.unreachable("Failed to create JFR repository directory"); + Assert.unreachable("Failed to create JFR repository directory", null); throw new IllegalStateException( "Failed to create JFR repository directory: " + repositoryPath, e); } diff --git a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java index 46f29c9c9d7..f3af9fede8d 100644 --- a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java +++ b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java @@ -198,11 +198,11 @@ private void startProfilingRecording() { if (t instanceof IllegalStateException && "Shutdown in progress".equals(t.getMessage())) { ProfilerFlareLogger.getInstance().log("Shutdown in progress, cannot start profiling"); log.debug("ANTITHESIS_ASSERT: Shutdown in progress, cannot start profiling (sometimes)"); - Assert.sometimes("Shutdown in progress, cannot start profiling"); + Assert.sometimes(true, "Shutdown in progress, cannot start profiling", null); } else { ProfilerFlareLogger.getInstance().log("Failed to start profiling", t); log.debug("ANTITHESIS_ASSERT: Failed to start profiling (unreachable)", t); - Assert.unreachable("Failed to start profiling"); + Assert.unreachable("Failed to start profiling", null); throw t instanceof RuntimeException ? (RuntimeException) t : new RuntimeException(t); } } @@ -280,7 +280,7 @@ public void snapshot(boolean onShutdown) { // JFR is filtering the stream it will only discard earlier chunks that have an end // time that is before (not before or equal to) the requested start time of the filter. 
log.debug("ANTITHESIS_ASSERT: Snapshot created (always) - lastSnapshot != null: {}", (lastSnapshot != null)); - Assert.always(lastSnapshot != null, "Snapshot created"); + Assert.always(lastSnapshot != null, "Snapshot created", null); lastSnapshot = recordingData.getEnd().plus(ONE_NANO); dataListener.onNewData(recordingType, recordingData, onShutdown); } else { @@ -289,7 +289,7 @@ public void snapshot(boolean onShutdown) { } catch (final Exception e) { log.error(SEND_TELEMETRY, "Exception in profiling thread, continuing", e); log.debug("ANTITHESIS_ASSERT: Exception in profiling thread, continuing (unreachable)", e); - Assert.unreachable("Exception in profiling thread, continuing"); + Assert.unreachable("Exception in profiling thread, continuing", null); } catch (final Throwable t) { /* Try to continue even after fatal exception. It seems to be useful to attempt to store profile when this happens. @@ -303,7 +303,7 @@ public void snapshot(boolean onShutdown) { // This should almost never happen and there is not much we can do here in cases like // OutOfMemoryError, so we will just ignore this. log.debug("ANTITHESIS_ASSERT: Fatal exception in profiling thread, trying to continue (unreachable)"); - Assert.unreachable("Fatal exception in profiling thread, trying to continue"); + Assert.unreachable("Fatal exception in profiling thread, trying to continue", null); } } } diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java index 45d693fee48..8f1820b7393 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java +++ b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java @@ -190,7 +190,7 @@ public OngoingRecording start() { } catch (IOException | IllegalStateException e) { log.debug("Failed to start Datadog profiler recording", e); log.debug("ANTITHESIS_ASSERT: Failed to start Datadog profiler recording (unreachable)"); - Assert.unreachable("Failed to start Datadog profiler recording"); + Assert.unreachable("Failed to start Datadog profiler recording", null); return null; } } @@ -206,7 +206,7 @@ void stopProfiler() { if (recordingFlag.compareAndSet(true, false)) { profiler.stop(); log.debug("ANTITHESIS_ASSERT: Checking if profiling is still active after stop (sometimes) - active: {}", isActive()); - Assert.sometimes(isActive(),"Profiling is still active. Waiting to stop."); + Assert.sometimes(isActive(),"Profiling is still active. Waiting to stop.", null); if (isActive()) { log.debug("Profiling is still active. 
Waiting to stop."); while (isActive()) { @@ -214,7 +214,7 @@ void stopProfiler() { } } log.debug("ANTITHESIS_ASSERT: Profiling should be stopped (always) - active: {}", isActive()); - Assert.always(!isActive(),"Profiling is stopped"); + Assert.always(!isActive(),"Profiling is stopped", null); } } @@ -229,7 +229,7 @@ public boolean isActive() { return !status.contains("not active"); } catch (IOException ignored) { log.debug("ANTITHESIS_ASSERT: Failed to get Datadog profiler status (unreachable)"); - Assert.unreachable("Failed to get Datadog profiler status"); + Assert.unreachable("Failed to get Datadog profiler status", null); } return false; } @@ -253,13 +253,13 @@ Path newRecording() throws IOException, IllegalStateException { } recordingFlag.set(false); log.debug("ANTITHESIS_ASSERT: Unable to start Datadog profiler recording (unreachable)"); - Assert.unreachable("Unable to start Datadog profiler recording"); + Assert.unreachable("Unable to start Datadog profiler recording", null); throw e; } return recFile; } log.debug("ANTITHESIS_ASSERT: Datadog profiler session has already been started (unreachable)"); - Assert.unreachable("Datadog profiler session has already been started"); + Assert.unreachable("Datadog profiler session has already been started", null); throw new IllegalStateException("Datadog profiler session has already been started"); } diff --git a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java index 26dc07e0581..0ca6737d926 100644 --- a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java +++ b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java @@ -303,7 +303,7 @@ public void onFailure(final Call call, final IOException e) { // sync request to a misbehaving server. 
if (handled.compareAndSet(false, true)) { log.debug("ANTITHESIS_ASSERT: Upload timeout (unreachable)"); - Assert.unreachable("Upload timeout"); + Assert.unreachable("Upload timeout", null); handleFailure(call, null, data, onCompletion); } } @@ -355,7 +355,7 @@ private void handleResponse( JfrCliHelper.invokeOn(data, ioLogger); } else { log.debug("ANTITHESIS_ASSERT: Failed to upload profile (unreachable) - response code: {}", response.code()); - Assert.unreachable("Failed to upload profile"); + Assert.unreachable("Failed to upload profile", null); ioLogger.error("Failed to upload profile", getLoggerResponse(response)); } } diff --git a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java index 19a1c26323f..3d27d6c4cd1 100644 --- a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java +++ b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java @@ -83,7 +83,7 @@ public void onNewData(RecordingType type, RecordingData data, boolean handleSync } catch (IOException e) { log.debug("Unable to write debug profile dump", e); log.debug("ANTITHESIS_ASSERT: Unable to write debug profile dump (unreachable)"); - Assert.unreachable("Unable to write debug profile dump"); + Assert.unreachable("Unable to write debug profile dump", null); } } } @@ -173,14 +173,14 @@ public static synchronized boolean run(final boolean earlyStart, Instrumentation need to not forget to add code that removes this shutdown hook from JVM. */ log.debug("ANTITHESIS_ASSERT: Shutdown hook added (always) - uploader != null: {}", (uploader != null)); - Assert.always(uploader!= null, "Shutdown hook added"); + Assert.always(uploader!= null, "Shutdown hook added", null); Runtime.getRuntime().addShutdownHook(new ShutdownHook(profiler, uploader)); } catch (final IllegalStateException ex) { // The JVM is already shutting down. } } catch (final UnsupportedEnvironmentException | ConfigurationException e) { log.debug("ANTITHESIS_ASSERT: Failed to initialize profiling agent (unreachable)", e); - Assert.unreachable("Failed to initialize profiling agent!"); + Assert.unreachable("Failed to initialize profiling agent!", null); ProfilerFlareLogger.getInstance().log("Failed to initialize profiling agent!", e); ProfilerFlareReporter.reportInitializationException(e); } diff --git a/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java b/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java index 6e21bb0c458..332a406865d 100644 --- a/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java +++ b/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java @@ -284,7 +284,7 @@ private boolean initialize() { // We can't recover from this, so we'll not try to initialize again. 
log.error("Remote configuration poller initialization failed", e); log.debug("ANTITHESIS_ASSERT: Remote configuration poller initialization failed (unreachable)", e); - Assert.unreachable("Remote configuration poller initialization failed"); + Assert.unreachable("Remote configuration poller initialization failed", null); fatalOnInitialization = true; } return true; @@ -383,7 +383,7 @@ private void handleAgentResponse(ResponseBody body) { // no error can be reported, as we don't have the data client.state.targets_version avail ratelimitedLogger.warn("Error parsing remote config response", e); log.debug("ANTITHESIS_ASSERT: Error parsing remote config response (unreachable)", e); - Assert.unreachable("Error parsing remote config response"); + Assert.unreachable("Error parsing remote config response", null); return; } @@ -452,7 +452,7 @@ private void runConfigurationEndListener( try { listener.onConfigurationEnd(); log.debug("ANTITHESIS_ASSERT: Configuration end listener should always be reachable (reachable)"); - Assert.reachable("Configuration end listener should always be reachable"); + Assert.reachable("Configuration end listener should always be reachable", null); } catch (ReportableException re) { errors.add(re); } catch (RuntimeException rte) { @@ -462,7 +462,7 @@ private void runConfigurationEndListener( ratelimitedLogger.warn( "Error running configuration listener {}: {}", listener, rte.getMessage(), rte); log.debug("ANTITHESIS_ASSERT: Error running configuration listener (unreachable)", rte); - Assert.unreachable("Error running configuration listener"); + Assert.unreachable("Error running configuration listener", null); } } From 8d81f30a853ff2455c4d7a0148298590a927c97e Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Wed, 17 Dec 2025 16:12:22 +0100 Subject: [PATCH 10/15] Implement simplified Antithesis assertions for trace loss tracking - Add assertions to CoreTracer.write() to track sampling decisions - Add assertions to RemoteWriter.write() to track buffer drops - Add assertions to PayloadDispatcherImpl.accept() to track send success/failure This provides clear tracking of: - trace_accepted_by_sampling: Traces that passed sampling - trace_dropped_by_sampling: Traces dropped by sampling - trace_enqueued_for_send: Traces accepted into buffer - trace_dropped_buffer_overflow: Traces dropped due to full buffer - trace_dropped_by_policy: Traces dropped by policy - trace_dropped_writer_closed: Traces dropped during shutdown - trace_payloads_being_sent: All send attempts - traces_sent_successfully: Traces successfully sent to agent - traces_failed_to_send: Traces that failed HTTP send --- .gitignore | 3 + ANTITHESIS_ASSERTIONS.md | 666 ++++++++++++++++++ EXPORT_VARIABLES_GUIDE.md | 282 ++++++++ FUZZER_SUMMARY.txt | 346 +++++++++ FUZZ_QUICKSTART.md | 236 +++++++ FUZZ_README.md | 237 +++++++ analyze-fuzz-logs.sh | 179 +++++ .../common/writer/PayloadDispatcherImpl.java | 37 + .../trace/common/writer/RemoteWriter.java | 43 ++ .../java/datadog/trace/core/CoreTracer.java | 24 + example-export-only.sh | 65 ++ example-fuzz.sh | 54 ++ example-use-export-vars.sh | 72 ++ fuzz-ci.sh | 133 ++++ fuzz-configs.sh | 374 ++++++++++ fuzz-export-vars.sh | 161 +++++ report-config-types.sh | 174 +++++ 17 files changed, 3086 insertions(+) create mode 100644 ANTITHESIS_ASSERTIONS.md create mode 100644 EXPORT_VARIABLES_GUIDE.md create mode 100644 FUZZER_SUMMARY.txt create mode 100644 FUZZ_QUICKSTART.md create mode 100644 FUZZ_README.md create mode 100755 analyze-fuzz-logs.sh create mode 100755 
example-export-only.sh
 create mode 100755 example-fuzz.sh
 create mode 100755 example-use-export-vars.sh
 create mode 100755 fuzz-ci.sh
 create mode 100755 fuzz-configs.sh
 create mode 100755 fuzz-export-vars.sh
 create mode 100755 report-config-types.sh

diff --git a/.gitignore b/.gitignore
index 68da44a8345..4dbfb0094f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,9 @@ replay_pid*
 !dd-java-agent/benchmark/releases/*.jar
 **/errors/*.log

+# Fuzz testing logs #
+fuzz-logs/
+
 # Magic for local JMC built
 /vendor/jmc-libs

diff --git a/ANTITHESIS_ASSERTIONS.md b/ANTITHESIS_ASSERTIONS.md
new file mode 100644
index 00000000000..f10304b9010
--- /dev/null
+++ b/ANTITHESIS_ASSERTIONS.md
@@ -0,0 +1,666 @@
+# Antithesis Assertions in dd-trace-java
+
+This document describes the Antithesis assertions added to track trace loss, API sending failures, and telemetry data loss.
+
+## Overview
+
+Antithesis assertions have been added to multiple classes in the trace writing pipeline and the telemetry system to detect when traces or telemetry are lost or fail to be sent to the API. These assertions help ensure the reliability of trace collection, telemetry reporting, and transmission at every stage of the process.
+
+## Added Assertions
+
+### Overview by Location
+
+**Telemetry System:**
+- **TelemetryClient** - Monitors telemetry HTTP requests, failures, and network issues
+- **TelemetryRouter** - Tracks routing failures and endpoint failover
+
+**Trace System:**
+- **DDAgentApi** - Monitors agent communication, HTTP responses, and network failures
+- **PayloadDispatcherImpl** - Tracks trace sending to the API and pre-send drops
+- **RemoteWriter** - Tracks buffer overflow and shutdown scenarios
+
+---
+
+## TelemetryClient Assertions (Telemetry Sending Layer)
+
+### T1. Telemetry Activity Tracking (`reachable` assertion)
+
+**Location:** `TelemetryClient.sendHttpRequest()` method (line 102)
+
+**Property:** `"Telemetry sending is exercised"`
+
+**Type:** `Assert.reachable()`
+
+**Purpose:** Verifies that telemetry sending code is being exercised during testing.
+
+---
+
+### T2. Telemetry Success Validation (`always` assertion) 🔴 **CRITICAL**
+
+**Location:** `TelemetryClient.sendHttpRequest()` method, success path (line 153)
+
+**Property:** `"Telemetry requests should always succeed - no telemetry data should be lost"`
+
+**Type:** `Assert.always()`
+
+**Purpose:** Asserts that ALL telemetry requests should succeed. When this fails, it indicates that **telemetry data is being dropped** instead of being retried or buffered.
+
+**Details Captured:**
+- `request_type`: Type of telemetry request (app-started, app-closing, etc.)
+- `http_status`: HTTP response code
+- `http_message`: HTTP status message
+- `url`: Telemetry endpoint URL
+- `success`: Whether the request succeeded
+
+**The Problem This Detects:**
+The warning message `"Got FAILURE sending telemetry request"` indicates telemetry data is being **dropped** without a retry mechanism.
+
+---
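+
+**Pattern Sketch (illustrative):**
+A minimal sketch of the `always`/`unreachable` pattern behind T2 and T3 below, assuming the
+`Assert.always(boolean, String, ObjectNode)` and `Assert.unreachable(String, ObjectNode)`
+signatures used throughout this patch series. The class and method names are illustrative,
+not the actual `TelemetryClient` code:
+
+```java
+import com.antithesis.sdk.Assert;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+class TelemetrySendSketch {
+  void onResponse(String requestType, int httpStatus, boolean successful, String url) {
+    // Structured context that Antithesis attaches to the property in the triage report.
+    ObjectNode details = JsonNodeFactory.instance.objectNode();
+    details.put("request_type", requestType != null ? requestType : "unknown");
+    details.put("http_status", httpStatus);
+    details.put("url", url);
+    details.put("success", successful);
+
+    // T2: the property fails if this condition is ever observed to be false.
+    Assert.always(
+        successful,
+        "Telemetry requests should always succeed - no telemetry data should be lost",
+        details);
+
+    if (!successful) {
+      // T3: the property fails if execution ever reaches this call.
+      Assert.unreachable(
+          "Telemetry HTTP request failed - telemetry data should not be dropped, should retry",
+          details);
+    }
+  }
+}
+```
+
+Unlike the Java `assert` keyword, these calls report the failed property but do not throw,
+so the surrounding request handling continues unchanged.
+
+---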
+
+### T3. Telemetry HTTP Failure Detection (`unreachable` assertion) 🔴
+
+**Location:** `TelemetryClient.sendHttpRequest()` method, non-success response (line 140)
+
+**Property:** `"Telemetry HTTP request failed - telemetry data should not be dropped, should retry"`
+
+**Type:** `Assert.unreachable()`
+
+**Purpose:** Marks the HTTP failure path as unreachable, indicating telemetry data loss. **This is the exact issue you're experiencing** - failures cause data to be dropped instead of retried.
+
+**Details Captured:**
+- `request_type`: Type of telemetry request
+- `http_status`: Error status code
+- `http_message`: Error message
+- `url`: Endpoint URL
+- `reason`: "http_error_response"
+
+---
+
+### T4. Telemetry Network Exception Prevention (`unreachable` assertion) 🔴
+
+**Location:** `TelemetryClient.sendHttpRequest()` method, IOException catch (line 171)
+
+**Property:** `"Telemetry network/IO failure - telemetry data should not be dropped, should retry"`
+
+**Type:** `Assert.unreachable()`
+
+**Purpose:** Marks network failures as unreachable. When triggered, it indicates that telemetry data is being lost due to connectivity issues **without retry**.
+
+**Details Captured:**
+- `request_type`: Type of telemetry request
+- `exception_type`: Exception class name
+- `exception_message`: Exception details
+- `url`: Endpoint URL
+- `reason`: "network_io_exception"
+
+---
+
+### T5. Telemetry 404 Tracking (`sometimes` assertion)
+
+**Location:** `TelemetryClient.sendHttpRequest()` method, 404 response (line 122)
+
+**Property:** `"Telemetry endpoint returns 404 - endpoint may be disabled"`
+
+**Type:** `Assert.sometimes()`
+
+**Purpose:** Tracks when the telemetry endpoint is disabled (404). This may be acceptable in some configurations.
+
+**Details Captured:**
+- `request_type`: Type of telemetry request
+- `url`: Endpoint URL
+- `reason`: "endpoint_disabled_404"
+
+---
+
+## TelemetryRouter Assertions (Telemetry Routing Layer)
+
+### T6. Telemetry Routing Success (`always` assertion) 🔴 **CRITICAL**
+
+**Location:** `TelemetryRouter.sendRequest()` method (line 56)
+
+**Property:** `"Telemetry routing should always succeed - failures indicate data loss without retry mechanism"`
+
+**Type:** `Assert.always()`
+
+**Purpose:** Validates that telemetry routing succeeds. This is the **top-level** assertion that catches all telemetry failures and proves that **current failures result in data loss**.
+
+**Details Captured:**
+- `result`: SUCCESS, FAILURE, NOT_FOUND, or INTERRUPTED
+- `current_client`: "agent" or "intake"
+- `request_failed`: Boolean
+- `has_fallback`: Whether fallback client exists
+- `url`: Current endpoint URL
+
+---
+
+### T7. Agent Telemetry Failover Tracking (`unreachable` assertion) 🔴
+
+**Location:** `TelemetryRouter.sendRequest()` method, agent failure (line 70)
+
+**Property:** `"Agent telemetry endpoint failed - switching to intake but current request data is lost"`
+
+**Type:** `Assert.unreachable()`
+
+**Purpose:** Tracks when agent telemetry fails and the router switches to intake. **Critical:** The current request data is LOST during this failover - only future requests go to intake.
+
+**Details Captured:**
+- `result`: Failure result type
+- `url`: Agent endpoint URL
+- `has_intake_fallback`: Whether intake fallback is available
+- `reason`: "agent_telemetry_failure"
+
+---
+
+### T8. Intake Telemetry Failover Tracking (`unreachable` assertion) 🔴
+
+**Location:** `TelemetryRouter.sendRequest()` method, intake failure (line 90)
+
+**Property:** `"Intake telemetry endpoint failed - switching to agent but current request data is lost"`
+
+**Type:** `Assert.unreachable()`
+
+**Purpose:** Tracks when intake telemetry fails and the router switches back to the agent. **Critical:** The current request data is LOST during this failover.
+
+**Details Captured:**
+- `result`: Failure result type
+- `url`: Intake endpoint URL
+- `will_fallback_to_agent`: Boolean
+- `reason`: "intake_telemetry_failure"
+
+---
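+
+**Pattern Sketch (illustrative):**
+A corresponding sketch for the routing-layer check in T6, assuming the same SDK signatures.
+The `Result` enum mirrors the `TelemetryClient.Result` values listed above; the real
+`TelemetryRouter` additionally handles endpoint failover:
+
+```java
+import com.antithesis.sdk.Assert;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+class TelemetryRoutingSketch {
+  enum Result { SUCCESS, FAILURE, NOT_FOUND, INTERRUPTED }
+
+  void afterSend(Result result, boolean usingAgentClient, boolean hasFallback, String url) {
+    ObjectNode details = JsonNodeFactory.instance.objectNode();
+    details.put("result", result.toString());
+    details.put("current_client", usingAgentClient ? "agent" : "intake");
+    details.put("has_fallback", hasFallback);
+    details.put("url", url);
+
+    // T6: only SUCCESS and INTERRUPTED are acceptable outcomes; any other result is
+    // reported as data loss because the current request is not retried.
+    Assert.always(
+        result == Result.SUCCESS || result == Result.INTERRUPTED,
+        "Telemetry routing should always succeed - failures indicate data loss without retry mechanism",
+        details);
+  }
+}
+```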
+
+## DDAgentApi Assertions (Agent Communication Layer)
+
+### 1. Agent API Activity Tracking (`reachable` + `sometimes` assertions)
+
+**Location:** `DDAgentApi.sendSerializedTraces()` method start (lines 97-100)
+
+**Properties:**
+- `"DDAgentApi trace sending is exercised"` (reachable)
+- `"Traces are being sent through DDAgentApi"` (sometimes)
+
+**Type:** `Assert.reachable()` + `Assert.sometimes()`
+
+**Purpose:** Verifies that the DDAgentApi code path is being exercised and traces are flowing through the agent API layer.
+
+---
+
+### 2. Agent Detection Validation (`unreachable` assertion) 🔴
+
+**Location:** `DDAgentApi.sendSerializedTraces()` method, agent detection failure (line 107)
+
+**Property:** `"Datadog agent should always be detected - agent communication failure"`
+
+**Type:** `Assert.unreachable()`
+
+**Purpose:** Asserts that the Datadog agent should always be discoverable. If the agent cannot be detected, traces will be lost with a 404 error.
+
+**Details Captured:**
+- `trace_count`: Number of traces that cannot be sent
+- `payload_size_bytes`: Size of the payload
+- `agent_url`: The agent URL being contacted
+- `failure_reason`: "agent_not_detected"
+
+**When This Occurs:**
+- Agent is not running
+- Agent is unreachable (network/firewall issues)
+- Incorrect agent URL configuration
+- Agent discovery mechanism failure
+
+---
+
+### 3. HTTP Response Success Validation (`always` assertion) 🔴 **CRITICAL**
+
+**Location:** `DDAgentApi.sendSerializedTraces()` method, after HTTP call (line 149)
+
+**Property:** `"HTTP response from Datadog agent should always be 200 - API communication failure"`
+
+**Type:** `Assert.always()`
+
+**Purpose:** Validates that every HTTP response from the agent is successful (200 OK). This is the primary assertion for detecting API-level failures.
+
+**Details Captured:**
+- `trace_count`: Number of traces being sent
+- `payload_size_bytes`: Size of the payload
+- `http_status`: HTTP status code received
+- `http_message`: HTTP status message
+- `success`: Boolean indicating if status is 200
+- `agent_url`: Full URL of the traces endpoint
+
+**When This Fails:**
+- Agent returns error status codes (400, 413, 500, 503, etc.)
+- Authentication/authorization failures
+- Agent overload or resource exhaustion
+- Malformed requests
+
+---
+
+### 4. HTTP Error Path Unreachability (`unreachable` assertion) 🔴
+
+**Location:** `DDAgentApi.sendSerializedTraces()` method, non-200 response branch (line 163)
+
+**Property:** `"Non-200 HTTP response from agent indicates API failure - traces may be lost"`
+
+**Type:** `Assert.unreachable()`
+
+**Purpose:** Marks the non-200 response code path as unreachable. When reached, it indicates that traces are being rejected by the agent.
+
+**Details Captured:**
+- `trace_count`: Number of traces rejected
+- `payload_size_bytes`: Size of rejected payload
+- `http_status`: Error status code
+- `http_message`: Error message from agent
+- `failure_reason`: "http_error_response"
+
+**Common Status Codes:**
+- 400: Bad Request (malformed payload)
+- 413: Payload Too Large
+- 429: Too Many Requests (rate limiting)
+- 500: Internal Server Error
+- 503: Service Unavailable (agent overloaded)
+
+---
+
+### 5.
Network Exception Prevention (`unreachable` assertion) 🔴 + +**Location:** `DDAgentApi.sendSerializedTraces()` method, IOException catch block (line 199) + +**Property:** `"Network/IO exceptions should not occur when sending to agent - indicates connectivity issues"` + +**Type:** `Assert.unreachable()` + +**Purpose:** Asserts that network/IO exceptions should never occur when communicating with the agent. These indicate infrastructure or connectivity problems. + +**Details Captured:** +- `trace_count`: Number of traces that failed to send +- `payload_size_bytes`: Size of the payload +- `exception_type`: Full class name of the exception +- `exception_message`: Exception message +- `agent_url`: Agent URL being contacted +- `failure_reason`: "network_io_exception" + +**When This Occurs:** +- Network connectivity issues +- Connection timeouts +- DNS resolution failures +- Socket errors +- SSL/TLS handshake failures + +--- + +## PayloadDispatcherImpl Assertions (Trace Serialization Layer) + +### 6. Payload Dispatcher Activity Tracking (`reachable` + `sometimes` assertions) + +**Location:** `PayloadDispatcherImpl.accept()` method (line 110-113) + +**Properties:** +- `"Trace sending code path is exercised"` (reachable) +- `"Traces are being sent to the API"` (sometimes) + +**Type:** `Assert.reachable()` + `Assert.sometimes()` + +**Purpose:** Verifies that the PayloadDispatcher code path is being exercised and traces are flowing through. + +--- + +### 7. Trace Sending Success (`always` assertion) + +**Location:** `PayloadDispatcherImpl.accept()` method (line 136) + +**Property:** `"Trace sending to API should always succeed - no traces should be lost"` + +**Type:** `Assert.always()` + +**Purpose:** Asserts that every trace sending attempt should succeed. If this assertion fails, it indicates that traces are being lost due to API failures. + +**Details Captured:** +- `trace_count`: Number of traces in the payload +- `payload_size_bytes`: Size of the payload in bytes +- `success`: Whether the send was successful +- `exception`: Exception class name (if present) +- `exception_message`: Exception message (if present) +- `http_status`: HTTP response status code (if present) + +### 8. Send Failure Path (`unreachable` assertion) + +**Location:** `PayloadDispatcherImpl.accept()` method, failure branch (line 159) + +**Property:** `"Trace sending failure path should never be reached - indicates traces are being lost"` + +**Type:** `Assert.unreachable()` + +**Purpose:** Marks the failure path as something that should never occur. When this path is reached, it indicates traces are being lost due to send failures. + +**Details Captured:** +- `trace_count`: Number of traces that failed to send +- `payload_size_bytes`: Size of failed payload +- `exception`: Exception class name (if present) +- `exception_message`: Exception message (if present) +- `http_status`: HTTP response status code (if present) + +### 9. Trace Drop Prevention (`unreachable` assertion) + +**Location:** `PayloadDispatcherImpl.onDroppedTrace()` method (line 69) + +**Property:** `"Traces should not be dropped before attempting to send - indicates buffer overflow or backpressure"` + +**Type:** `Assert.unreachable()` + +**Purpose:** Asserts that traces should never be dropped before even attempting to send them. Drops indicate buffer overflow, backpressure, or resource exhaustion. 
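+
+A minimal sketch of this pre-send drop check, assuming the `Assert.unreachable(String, ObjectNode)`
+signature used in this patch series. The class below is illustrative; the counter bookkeeping
+mirrors the details listed next:
+
+```java
+import com.antithesis.sdk.Assert;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.util.concurrent.atomic.LongAdder;
+
+class DropTrackingSketch {
+  private final LongAdder droppedTraceCount = new LongAdder();
+  private final LongAdder droppedSpanCount = new LongAdder();
+
+  void onDroppedTrace(int spanCount) {
+    // Capture the drop context before updating the counters so the details
+    // already include this drop in the totals.
+    ObjectNode details = JsonNodeFactory.instance.objectNode();
+    details.put("span_count", spanCount);
+    details.put("total_dropped_traces", droppedTraceCount.sum() + 1);
+    details.put("total_dropped_spans", droppedSpanCount.sum() + spanCount);
+
+    // Reaching this call fails the property, but execution continues, so the
+    // counters below are still updated.
+    Assert.unreachable(
+        "Traces should not be dropped before attempting to send - indicates buffer overflow or backpressure",
+        details);
+
+    droppedTraceCount.increment();
+    droppedSpanCount.add(spanCount);
+  }
+}
+```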
+ +**Details Captured:** +- `span_count`: Number of spans in the dropped trace +- `total_dropped_traces`: Cumulative count of dropped traces +- `total_dropped_spans`: Cumulative count of dropped spans + +--- + +## RemoteWriter Assertions (Buffer and Lifecycle Layer) + +### 10. Writer State Validation (`always` assertion) 🔴 **CRITICAL** + +**Location:** `RemoteWriter.write()` method, start of method (line 79) + +**Property:** `"Writer should never be closed when attempting to write traces"` + +**Type:** `Assert.always()` + +**Purpose:** Proactively validates that the writer is in a valid state (not closed) whenever traces are being written. This assertion catches improper usage where traces are written after shutdown or during shutdown race conditions. This is a **preventive assertion** that checks every write attempt. + +**Details Captured:** +- `writer_closed`: Boolean indicating if writer is closed +- `trace_size`: Number of traces being written +- `has_traces`: Whether the trace list is non-empty + +**When This Fails:** +- Application attempts to write traces after calling `close()` +- Race condition between shutdown and trace generation +- Improper lifecycle management +- Indicates a bug in the calling code or shutdown sequencing + +**Importance:** This is a critical assertion because writing to a closed writer indicates a fundamental problem with lifecycle management that could lead to: +- Lost traces during shutdown +- Inconsistent application state +- Potential resource leaks + +--- + +### 11. Buffer Overflow Detection (`unreachable` assertion) 🔴 **CRITICAL** + +**Location:** `RemoteWriter.write()` method, DROPPED_BUFFER_OVERFLOW case (line 117) + +**Property:** `"Buffer overflow should never occur - traces are being dropped due to backpressure"` + +**Type:** `Assert.unreachable()` + +**Purpose:** Asserts that buffer overflow should NEVER happen. This indicates that traces are being generated faster than they can be processed and serialized, resulting in dropped traces. This is a critical issue that indicates system overload or insufficient buffer capacity. + +**Details Captured:** +- `trace_size`: Number of traces being dropped +- `span_count`: Total number of spans in the dropped traces +- `sampling_priority`: Sampling priority of the trace +- `buffer_capacity`: Current buffer capacity +- `reason`: "buffer_overflow_backpressure" + +**When This Occurs:** +- Internal processing queue is full (primary, secondary, or span sampling queue) +- Traces are being generated faster than serialization can occur +- System is under heavy load or experiencing backpressure +- Buffer size may be insufficient for the workload + +--- + +### 12. Shutdown Trace Drop Tracking (`sometimes` assertion) + +**Location:** `RemoteWriter.write()` method, closed writer case (line 94) + +**Property:** `"Traces are dropped due to writer shutdown - tracking shutdown behavior"` + +**Type:** `Assert.sometimes()` + +**Purpose:** Tracks when traces are dropped because the writer has been shut down. This helps understand shutdown behavior and whether traces are being lost during application shutdown sequences. 
+ +**Details Captured:** +- `trace_size`: Number of traces being dropped +- `span_count`: Total number of spans in the dropped traces +- `reason`: "writer_closed_during_shutdown" + +**When This Occurs:** +- Application is shutting down +- Writer.close() has been called +- Traces are still being generated after shutdown initiated +- Can indicate timing issues in shutdown sequences + +## How Antithesis Uses These Assertions + +When running under Antithesis testing: + +1. **Property Aggregation:** All assertions with the same `message` are aggregated into a single test property in the triage report. + +2. **Failure Detection:** + - `always()` assertions that evaluate to `false` will flag the property as failing + - `unreachable()` assertions that are reached will flag the property as failing + - `sometimes()` assertions that never evaluate to `true` will flag the property as failing + +3. **Exploration Guidance:** Antithesis uses these assertions as hints to explore states that might trigger failures, making bug detection more efficient. + +4. **Non-Terminating:** Unlike traditional assertions, Antithesis assertions do not terminate the program when they fail. This allows Antithesis to potentially escalate the failure into more severe bugs. + +## Expected Behavior + +### In a Healthy System + +**Telemetry System:** +- ✅ `"Telemetry sending is exercised"` - Should pass (reached at least once) +- ✅ `"Telemetry requests should always succeed"` - Should pass (all succeed) 🔴 **CRITICAL** +- ✅ `"Telemetry HTTP request failed - should retry"` - Should pass (never reached) 🔴 +- ✅ `"Telemetry network/IO failure - should retry"` - Should pass (never reached) 🔴 +- ✅ `"Telemetry routing should always succeed"` - Should pass (all succeed) 🔴 **CRITICAL** +- ✅ `"Agent telemetry endpoint failed"` - Should pass (never reached) 🔴 +- ✅ `"Intake telemetry endpoint failed"` - Should pass (never reached) 🔴 +- ℹ️ `"Telemetry endpoint returns 404"` - May occur if endpoint disabled + +**Trace DDAgentApi Layer:** +- ✅ `"DDAgentApi trace sending is exercised"` - Should pass (reached at least once) +- ✅ `"Traces are being sent through DDAgentApi"` - Should pass (reached at least once) +- ✅ `"Datadog agent should always be detected"` - Should pass (agent always detectable) 🔴 +- ✅ `"HTTP response from Datadog agent should always be 200"` - Should pass (all responses 200) 🔴 **CRITICAL** +- ✅ `"Non-200 HTTP response from agent indicates API failure"` - Should pass (never reached) 🔴 +- ✅ `"Network/IO exceptions should not occur"` - Should pass (never reached) 🔴 + +**PayloadDispatcherImpl Layer:** +- ✅ `"Trace sending code path is exercised"` - Should pass (reached at least once) +- ✅ `"Traces are being sent to the API"` - Should pass (reached at least once) +- ✅ `"Trace sending to API should always succeed"` - Should pass (all sends succeed) +- ✅ `"Trace sending failure path should never be reached"` - Should pass (never reached) +- ✅ `"Traces should not be dropped before attempting to send"` - Should pass (never reached) + +**RemoteWriter Layer:** +- ✅ `"Writer should never be closed when attempting to write traces"` - Should pass (writer always open) 🔴 **CRITICAL** +- ✅ `"Buffer overflow should never occur"` - Should pass (never reached) 🔴 **CRITICAL** +- ℹ️ `"Traces are dropped due to writer shutdown"` - May or may not occur depending on shutdown timing + +### When Telemetry Is Lost ⚠️ **YOUR ISSUE** + +If telemetry is being lost (your current issue with `"Got FAILURE sending telemetry request"`), you'll see these 
failures: + +**Telemetry HTTP/Network Failures:** +- ❌ `"Telemetry requests should always succeed"` - Will fail on any telemetry failure 🔴 **CRITICAL** + - This is the top-level assertion proving telemetry data loss + - Shows request type, HTTP status, and endpoint +- ❌ `"Telemetry HTTP request failed - should retry"` - Will fail when HTTP errors occur 🔴 + - Indicates telemetry dropped due to HTTP errors (5xx, 4xx) + - Shows status code and error message +- ❌ `"Telemetry network/IO failure - should retry"` - Will fail on connectivity issues 🔴 + - Indicates telemetry dropped due to network problems + - Shows exception type and message + +**Telemetry Routing Failures:** +- ❌ `"Telemetry routing should always succeed"` - Will fail when routing fails 🔴 **CRITICAL** + - Proves current implementation drops data instead of retrying + - Shows current client (agent/intake) and failure details +- ❌ `"Agent telemetry endpoint failed - current request data is lost"` - Will fail when agent endpoint fails 🔴 + - Router switches to intake but **current request is dropped** + - Shows whether fallback is available +- ❌ `"Intake telemetry endpoint failed - current request data is lost"` - Will fail when intake endpoint fails 🔴 + - Router switches to agent but **current request is dropped** + - Future requests use new endpoint, but current data is lost + +**Key Finding:** The assertions prove that when telemetry fails, the **current request is DROPPED** - the router only changes the endpoint for **future** requests. This is why you see `"Got FAILURE"` warnings - there's no retry or buffering mechanism. + +--- + +### When Traces Are Lost + +If traces are being lost, you'll see failures in the triage report: + +**Agent Communication Failures (DDAgentApi):** +- ❌ `"Datadog agent should always be detected"` - Will fail if agent is unreachable 🔴 + - Indicates agent not running, network issues, or configuration problems + - Provides agent URL and detection details +- ❌ `"HTTP response from Datadog agent should always be 200"` - Will fail on any error status 🔴 **CRITICAL** + - Shows HTTP status code, message, and agent URL + - Indicates agent overload, rate limiting, or request errors +- ❌ `"Non-200 HTTP response from agent indicates API failure"` - Will fail when agent rejects traces 🔴 + - Provides HTTP status codes (400, 413, 429, 500, 503, etc.) 
+- ❌ `"Network/IO exceptions should not occur"` - Will fail on network errors 🔴 + - Shows exception type and message + - Indicates connectivity, timeout, or DNS issues + +**API Send Failures (PayloadDispatcherImpl):** +- ❌ `"Trace sending to API should always succeed"` - Will fail with details about failed sends +- ❌ `"Trace sending failure path should never be reached"` - Will fail, showing this path was reached + +**Buffer/Queue Issues:** +- ❌ `"Buffer overflow should never occur"` - Will fail if backpressure causes drops 🔴 **CRITICAL** + - Indicates system overload or insufficient buffer capacity + - Provides buffer capacity and trace details +- ❌ `"Traces should not be dropped before attempting to send"` - May fail if drops occur in PayloadDispatcher + +**Lifecycle/Shutdown Issues:** +- ❌ `"Writer should never be closed when attempting to write traces"` - Will fail if traces written to closed writer 🔴 **CRITICAL** + - Indicates race condition in shutdown sequence + - Shows improper lifecycle management + - Provides details about writer state and trace being written +- ⚠️ `"Traces are dropped due to writer shutdown"` - Will show in report if shutdown timing causes trace loss + - Helps identify if shutdown sequence needs improvement + - May be acceptable depending on shutdown strategy + - Works in conjunction with the writer state validation above + +The `details` captured in failed assertions will provide diagnostic information including trace counts, payload sizes, exceptions, HTTP status codes, buffer capacity, and sampling priority. + +## Dependencies + +- **Antithesis SDK:** `com.antithesis:sdk:1.4.5` (bundled in tracer JAR) - [Available on Maven Central](https://repo1.maven.org/maven2/com/antithesis/sdk/) +- **Jackson:** Already available transitively in the project + +### Bundled SDK + +The Antithesis SDK is configured as an `implementation` dependency, which means: + +- ✅ **Bundled in final JAR** - SDK classes included in the dd-trace-java agent +- ✅ **Always available** - No ClassNotFoundException at runtime +- ✅ **Works everywhere** - Assertions compiled and available in all environments + +### Using Antithesis Assertions + +The Antithesis SDK (version 1.4.5) is publicly available on Maven Central and is bundled with the tracer. 
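+
+As a quick reference, here is a minimal, self-contained sketch of the assertion API used throughout this PR (the class and the `trySend()` helper are illustrative stand-ins, not tracer code; the `Assert` calls mirror the signatures used at the call sites documented above):
+
+```java
+import com.antithesis.sdk.Assert;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+public class AssertionSketch {
+  public static void main(String[] args) {
+    // Marks this code path as one that must be exercised at least once.
+    Assert.reachable("Demo code path is exercised", null);
+
+    boolean sent = trySend(); // stand-in for a real send attempt
+
+    // Structured details are attached to the property in the triage report.
+    ObjectNode details = JsonNodeFactory.instance.objectNode();
+    details.put("success", sent);
+
+    // Fails the property if the condition is ever false.
+    Assert.always(sent, "Send should always succeed", details);
+
+    // Fails the property if the condition is never true across all runs.
+    Assert.sometimes(sent, "Send succeeds at least once", details);
+
+    if (!sent) {
+      // Fails the property the first time this branch executes.
+      Assert.unreachable("Send failure path should never be reached", details);
+    }
+  }
+
+  private static boolean trySend() {
+    return true;
+  }
+}
+```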
+ +**In normal runtime (production/development):** +- Assertions are present in the code but have **minimal performance impact** +- According to [Antithesis documentation](https://antithesis.com/docs/properties_assertions/assertions/), the SDK is designed to run safely in production +- Assertions become no-ops when not running in Antithesis environment + +**In Antithesis testing environment:** +- Antithesis runtime automatically detects and evaluates all assertions +- Generates triage reports showing which properties passed/failed +- Provides detailed bug reports with reproducible scenarios +- Contact Antithesis at [antithesis.com](https://antithesis.com) for access to their testing platform + +## Complete Pipeline Coverage Summary + +The assertions provide comprehensive coverage across telemetry and trace pipelines: + +### Telemetry Pipeline + +``` +Application Telemetry Events + ↓ +[TelemetryRouter] ← Assertions T6-T8 + • Routing success validation + • Agent failover tracking + • Intake failover tracking + • ⚠️ PROVES: Current request dropped on failover + ↓ +[TelemetryClient] ← Assertions T1-T5 + • Activity tracking + • HTTP success validation + • Failure detection (HTTP errors) + • Network exception handling + • 404 endpoint tracking + • ⚠️ PROVES: No retry on failure + ↓ +[Telemetry Endpoint] → Datadog Backend +``` + +### Trace Pipeline + +``` +Application Threads + ↓ +[CoreTracer] → Sampling decision + ↓ +[RemoteWriter] ← Assertions 10-12 + • Writer state validation + • Buffer overflow detection + • Shutdown tracking + ↓ +[TraceProcessingWorker] → Serialization queues + ↓ +[PayloadDispatcherImpl] ← Assertions 6-9 + • Activity tracking + • Trace sending validation + • Failure path detection + • Pre-send drop prevention + ↓ +[DDAgentApi] ← Assertions 1-5 + • Agent detection + • HTTP response validation + • Network exception handling + ↓ +[Datadog Agent] → Backend +``` + +### Assertion Count by Category + +| Category | Count | Criticality | Status | +|----------|-------|-------------|--------| +| **Telemetry Communication** | 5 | 🔴 **CRITICAL** | ⚠️ **DROPS DATA** | +| **Telemetry Routing** | 3 | 🔴 **CRITICAL** | ⚠️ **DROPS DATA** | +| **Agent Communication** | 5 | 🔴 **CRITICAL** | ✅ Has retries | +| **Trace Serialization** | 4 | ❌ High | ✅ Good | +| **Buffer Management** | 2 | 🔴 **CRITICAL** | ✅ Good | +| **Lifecycle Management** | 1 | 🔴 **CRITICAL** | ✅ Good | +| **Total** | **20** | - | - | + +### Key Properties Monitored + +**Telemetry System (YOUR ISSUE):** +1. ⚠️ **Telemetry Data Loss**: Telemetry dropped on HTTP/network failures +2. ⚠️ **No Retry Mechanism**: Failed requests are not retried or buffered +3. ⚠️ **Failover Data Loss**: Current request dropped during endpoint switching + +**Trace System:** +4. **Agent Availability**: Agent must be detectable and reachable +5. **HTTP Success**: All agent responses must be 200 OK +6. **Network Stability**: No IO/network exceptions should occur +7. **Buffer Capacity**: No overflow or backpressure drops +8. **Lifecycle Correctness**: No writes to closed writer +9. 
**End-to-End Success**: All traces must be successfully sent
+
+## References
+
+- [Antithesis Assertions Documentation](https://antithesis.com/docs/properties_assertions/assertions/)
+- [Java SDK Reference](https://antithesis.com/docs/generated/sdk/java/com/antithesis/sdk/Assert.html)
+
diff --git a/EXPORT_VARIABLES_GUIDE.md b/EXPORT_VARIABLES_GUIDE.md
new file mode 100644
index 00000000000..ace4b58d3d3
--- /dev/null
+++ b/EXPORT_VARIABLES_GUIDE.md
@@ -0,0 +1,282 @@
+# Exporting DD Configuration Variables
+
+This document explains how to export random DD configuration variables without running a command, allowing you to use them in your own scripts.
+
+## Two Approaches Available
+
+### Approach 1: `fuzz-export-vars.sh` (Recommended)
+
+A standalone script that generates export statements you can eval in your shell.
+
+#### Usage
+
+```bash
+# Export random variables
+eval "$(./fuzz-export-vars.sh)"
+
+# Then run your application
+java -javaagent:dd-java-agent.jar -jar myapp.jar
+```
+
+#### Advantages
+- Simple and straightforward
+- Works in any script
+- No need to source anything
+- Clean output
+
+#### Control Number of Parameters
+
+```bash
+# Export 5 random parameters. Set the variable inside the command
+# substitution so the script actually sees it; a prefix assignment on
+# `eval` itself is not visible to the substituted command.
+eval "$(FUZZ_MAX_PARAMS=5 ./fuzz-export-vars.sh)"
+```
+
+### Approach 2: `fuzz-configs.sh` in Export-Only Mode
+
+Use the main fuzzer script in export-only mode.
+
+#### Usage
+
+```bash
+# Set export-only mode
+export FUZZ_EXPORT_ONLY=true
+
+# Source the fuzzer (doesn't run commands)
+source ./fuzz-configs.sh 1 ""
+```
+
+#### Advantages
+- Uses the same script as full fuzzing
+- Includes logging
+- More detailed output
+
+## Examples
+
+### Example 1: Basic Export and Run
+
+```bash
+#!/bin/bash
+
+# Export random configurations
+eval "$(./fuzz-export-vars.sh)"
+
+# Run your application
+java -javaagent:dd-java-agent.jar -jar myapp.jar
+```
+
+### Example 2: Multiple Test Runs with Different Configs
+
+```bash
+#!/bin/bash
+
+for i in {1..10}; do
+  echo "Test run $i"
+
+  # Clear previous DD variables
+  unset $(env | grep '^DD_' | cut -d'=' -f1)
+
+  # Export new random configuration
+  eval "$(./fuzz-export-vars.sh)"
+
+  # Run your application
+  java -jar myapp.jar
+
+  sleep 2
+done
+```
+
+### Example 3: Export Specific Number of Parameters
+
+```bash
+#!/bin/bash
+
+# Export only 3 random parameters
+eval "$(FUZZ_MAX_PARAMS=3 ./fuzz-export-vars.sh)"
+
+echo "Running with minimal configuration:"
+env | grep '^DD_'
+
+java -jar myapp.jar
+```
+
+### Example 4: Capture Variables for Later Use
+
+```bash
+#!/bin/bash
+
+# Generate and save export statements
+./fuzz-export-vars.sh 2>/dev/null > /tmp/dd-config.sh
+
+# Review the configuration
+cat /tmp/dd-config.sh
+
+# Apply it when ready
+source /tmp/dd-config.sh
+
+# Run your application
+java -jar myapp.jar
+```
+
+### Example 5: Use in CI/CD Pipeline
+
+```yaml
+# .gitlab-ci.yml or similar
+
+test_with_random_config:
+  script:
+    - eval "$(./fuzz-export-vars.sh)"
+    - echo "Testing with configuration:"
+    - env | grep '^DD_'
+    - mvn clean test
+```
+
+## Clearing Variables
+
+To clear all DD_ environment variables:
+
+```bash
+# Unset all DD_ variables
+unset $(env | grep '^DD_' | cut -d'=' -f1)
+
+# Verify
+env | grep '^DD_'  # Should return nothing
+```
+
+## Comparing Approaches
+
+| Feature | fuzz-export-vars.sh | fuzz-configs.sh (export-only) |
+|---------|---------------------|-------------------------------|
+| Simplicity | ⭐⭐⭐ Very simple | ⭐⭐ Moderate |
+| Logging | ⭐ To stderr only | ⭐⭐⭐ Full logging |
+| File size | ⭐⭐⭐ Lightweight | ⭐ Larger |
+| Dependencies | Just jq | jq + full script |
+| Use case | Quick exports | Integrated testing |
+
+## Troubleshooting
+
+### Variables Not Exported
+
+```bash
+# Wrong - runs in subshell, variables don't persist
+$(./fuzz-export-vars.sh)
+
+# Correct - use eval
+eval "$(./fuzz-export-vars.sh)"
+```
+
+### Too Many/Few Variables
+
+```bash
+# Control the number (set the variable inside the command substitution
+# so the script sees it)
+eval "$(FUZZ_MAX_PARAMS=5 ./fuzz-export-vars.sh)"
+```
+
+### Need to See What's Being Exported
+
+```bash
+# The script outputs info to stderr, so you'll see it
+eval "$(./fuzz-export-vars.sh)"
+
+# Or capture the export statements to a file, review, then source it.
+# (Sourcing at the end of a pipeline runs in a subshell, so the
+# variables would not persist in your shell.)
+./fuzz-export-vars.sh 2>/tmp/config.log > /tmp/dd-config.sh
+source /tmp/dd-config.sh
+```
+
+## Integration Patterns
+
+### Pattern 1: Test Suite Integration
+
+```bash
+#!/bin/bash
+# run-test-suite.sh
+
+for test in tests/*.sh; do
+  echo "Running $test with random config..."
+
+  # Fresh configuration for each test
+  unset $(env | grep '^DD_' | cut -d'=' -f1)
+  eval "$(./fuzz-export-vars.sh)"
+
+  bash "$test"
+done
+```
+
+### Pattern 2: Docker Container Testing
+
+```bash
+#!/bin/bash
+
+# Generate configuration
+eval "$(./fuzz-export-vars.sh)"
+
+# Pass to Docker container
+docker run \
+  $(env | grep '^DD_' | sed 's/^/-e /') \
+  my-app:latest
+```
+
+### Pattern 3: Configuration Files
+
+```bash
+#!/bin/bash
+
+# Generate a Java system properties file
+# (dd-trace-java maps DD_FOO_BAR env vars to dd.foo.bar system properties)
+./fuzz-export-vars.sh 2>/dev/null \
+  | sed "s/^export //; s/'//g" \
+  | awk -F= '{ key = tolower($1); gsub(/_/, ".", key); print "-D" key "=" $2 }' \
+  > /tmp/java-opts.txt
+
+# Use with Java (the @argfile syntax requires Java 9+)
+java @/tmp/java-opts.txt -jar myapp.jar
+```
+
+## Best Practices
+
+1. **Clear Between Runs**: Always unset previous DD_ variables before exporting new ones
+2. **Log Configuration**: Save the exported configuration for reproducibility (see the appendix at the end of this guide)
+3. **Reasonable Limits**: Use FUZZ_MAX_PARAMS to avoid overwhelming configurations
+4. **Test Isolation**: Each test should use a fresh set of variables
+5. **Document**: Save configurations that expose bugs for later reproduction
+
+## Environment Variables
+
+### For `fuzz-export-vars.sh`
+
+- `FUZZ_MAX_PARAMS` - Maximum number of parameters to export (default: 10)
+
+### For `fuzz-configs.sh` (export-only mode)
+
+- `FUZZ_EXPORT_ONLY` - Set to "true" to enable export-only mode
+- Other variables from main fuzzer still apply
+
+## See Also
+
+- `FUZZ_README.md` - Full fuzzer documentation
+- `FUZZ_QUICKSTART.md` - Quick start guide
+- `example-use-export-vars.sh` - Working example script
+
+## Summary
+
+**Quick Export:**
+```bash
+eval "$(./fuzz-export-vars.sh)"
+java -jar myapp.jar
+```
+
+**With Options:**
+```bash
+eval "$(FUZZ_MAX_PARAMS=5 ./fuzz-export-vars.sh)"
+```
+
+**In a Loop:**
+```bash
+for i in {1..10}; do
+  unset $(env | grep '^DD_' | cut -d'=' -f1)
+  eval "$(./fuzz-export-vars.sh)"
+  java -jar myapp.jar
+done
+```
+
+That's it! You're ready to use random DD configurations in your scripts.
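+
+## Appendix: Saving a Failing Configuration
+
+Best practices 2 and 5 recommend saving configurations for reproducibility. A minimal sketch of one way to do that (the file path is illustrative):
+
+```bash
+#!/bin/bash
+
+# Save this run's configuration before applying it
+config="/tmp/dd-config-$(date +%Y%m%d_%H%M%S).sh"
+./fuzz-export-vars.sh 2>/dev/null > "$config"
+
+# Apply it and run the application
+source "$config"
+if ! java -jar myapp.jar; then
+  echo "Run failed - reproduce later with: source $config"
+fi
+```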
+ diff --git a/FUZZER_SUMMARY.txt b/FUZZER_SUMMARY.txt new file mode 100644 index 00000000000..4404bc43a73 --- /dev/null +++ b/FUZZER_SUMMARY.txt @@ -0,0 +1,346 @@ +================================================================================ +DD-TRACE-JAVA CONFIGURATION FUZZER +================================================================================ + +PROJECT: dd-trace-java Configuration Fuzzing Tool +CREATED: November 28, 2024 +PURPOSE: Test Java applications with randomized dd-trace-java configurations + +================================================================================ +FILES CREATED +================================================================================ + +1. CORE SCRIPTS + ------------ + ✓ fuzz-configs.sh (13KB) + Main fuzzing script that generates random configurations and runs your app + - Reads 1,384 parameters from metadata/supported-configurations.json + - Intelligent type detection (Boolean, Integer, Float, String) + - Generates sensible random values based on parameter patterns + - Logs all runs with full configuration and output + - Timeout protection (30s default) + - Up to 10 random parameters per run + + ✓ analyze-fuzz-logs.sh (6.3KB) + Log analyzer that examines fuzzing results + - Statistical analysis of runs + - Failure pattern detection + - Parameter frequency analysis + - Recommendations based on results + + ✓ report-config-types.sh (6KB) + Configuration type reporter + - Analyzes all 1,384 parameters + - Reports type distribution + - Shows samples of each type + - JSON export capability + + ✓ fuzz-ci.sh (4.3KB) + CI/CD integration script + - Configurable via environment variables + - Failure threshold checking + - Non-interactive mode support + - Exit codes for pipeline integration + + ✓ example-fuzz.sh (1.8KB) + Quick start demonstration script + - Prerequisite checking + - Simple test run + - Usage examples + +2. DOCUMENTATION + ------------- + ✓ FUZZ_README.md (6.6KB) + Comprehensive documentation covering: + - Features and overview + - Prerequisites and installation + - Detailed usage instructions + - Parameter type detection + - Configuration options + - Troubleshooting guide + - Advanced usage patterns + + ✓ FUZZ_QUICKSTART.md (6.1KB) + Quick start guide with: + - Fast setup instructions + - Usage examples + - Output explanations + - Common use cases + - Tips for effective fuzzing + + ✓ FUZZER_SUMMARY.txt (this file) + Complete summary of the fuzzer system + +3. 
CONFIGURATION
+   -------------
+   ✓ .gitignore (updated)
+     Added: fuzz-logs/ directory exclusion
+
+================================================================================
+CAPABILITIES
+================================================================================
+
+PARAMETER TYPES DETECTED:
+-------------------------
+From 1,384 total configuration parameters:
+  - Boolean parameters: 779 (56%)
+  - Integer parameters: 95 (6%)
+  - Float parameters: 128 (9%)
+  - String parameters: 382 (27%)
+
+VALUE GENERATION:
+----------------
+Boolean: true, false, 1, 0
+Integer: Context-aware ranges:
+  - Ports: 1024-65535
+  - Timeouts: 100-30000ms
+  - Sizes/Limits: 10-10000
+  - Counts: 1-100
+  - Rates/Percents: 0-100
+Float: Context-aware:
+  - Sample rates: 0.0-1.0
+  - Intervals: 1.0-60.0
+String: Intelligent generation:
+  - DD_ENV: production, staging, development, test, qa
+  - DD_SERVICE: Realistic service names
+  - DD_VERSION: Semantic versions (v1.2.3)
+  - Hosts: localhost, IPs, hostnames
+  - URLs: HTTP/HTTPS endpoints
+  - Paths: Realistic file/directory paths
+  - Keys/Tokens: Random hex strings
+  - Tags: Comma-separated key:value pairs
+  - Propagation styles: datadog, b3, tracecontext
+  - Modes: full, service, disabled, safe
+
+FEATURES:
+--------
+✓ Intelligent type detection from parameter names
+✓ Realistic value generation
+✓ Configurable iteration count
+✓ Random parameter selection (1-10 per run)
+✓ Comprehensive logging
+✓ Timeout protection (prevents hangs)
+✓ Color-coded terminal output
+✓ Statistical analysis
+✓ Failure pattern detection
+✓ CI/CD integration support
+✓ JSON export for reports
+✓ Non-interactive mode
+
+================================================================================
+USAGE EXAMPLES
+================================================================================
+
+BASIC USAGE:
+-----------
+# Run 10 fuzz iterations
+./fuzz-configs.sh 10 "java -jar myapp.jar"
+
+# With dd-java-agent
+./fuzz-configs.sh 20 "java -javaagent:dd-java-agent.jar -jar myapp.jar"
+
+# Analyze results
+./analyze-fuzz-logs.sh
+
+# View parameter types
+./report-config-types.sh
+
+QUICK START:
+-----------
+# Run example test
+./example-fuzz.sh
+
+CI/CD INTEGRATION:
+-----------------
+# In your CI pipeline (GitHub Actions, Jenkins, etc.)
+export FUZZ_ITERATIONS=50
+export FUZZ_JAVA_CMD="java -javaagent:dd-java-agent.jar -jar app.jar"
+export FUZZ_FAILURE_THRESHOLD=5  # Max 5% failures allowed
+./fuzz-ci.sh
+
+# Or with direct parameters
+FUZZ_ITERATIONS=100 ./fuzz-ci.sh
+
+ADVANCED:
+--------
+# Export parameter type report
+./report-config-types.sh --export
+
+# Run multiple fuzzers in parallel
+./fuzz-configs.sh 50 "java -jar app.jar" &
+./fuzz-configs.sh 50 "java -jar app.jar" &
+wait
+
+# Custom timeout (edit fuzz-configs.sh, line ~245)
+# Change: timeout 30s bash -c "$JAVA_CMD"
+# To:     timeout 60s bash -c "$JAVA_CMD"
+
+================================================================================
+WORKFLOW
+================================================================================
+
+1. SETUP
+   -----
+   Install jq: brew install jq (macOS) or apt-get install jq (Linux)
+   Ensure all scripts are executable (chmod +x *.sh)
+
+2. TEST
+   ----
+   Run example: ./example-fuzz.sh
+   Verify logs are created in fuzz-logs/
+
+3. FUZZ
+   ----
+   Run with your app: ./fuzz-configs.sh <iterations> "<command>"
+   Start with 10-20 iterations, scale up as needed
+
+4. ANALYZE
+   -------
+   Review results: ./analyze-fuzz-logs.sh
+   Check individual logs in fuzz-logs/ for details
+
+5. ITERATE
+   -------
+   Increase iterations for thorough testing
+   Monitor app metrics during fuzzing
+   Identify and fix configuration issues
+
+6. INTEGRATE
+   ---------
+   Add to CI/CD pipeline using fuzz-ci.sh
+   Set failure thresholds appropriate for your app
+
+================================================================================
+OUTPUT STRUCTURE
+================================================================================
+
+LOGS DIRECTORY: fuzz-logs/
+--------------------------
+Each run creates a log file: fuzz_run_<iteration>_<timestamp>.log
+
+Log file contents:
+  - Iteration number and timestamp
+  - Configuration parameters used
+  - Environment export commands
+  - Command executed
+  - Application output/errors
+  - Exit code (if failed)
+
+Example log:
+  # Fuzz Iteration 5
+  # Timestamp: 20241128_143052
+  # Configuration:
+  DD_TRACE_ENABLED=true
+  DD_SERVICE=my-service
+  DD_ENV=production
+  ...
+
+  # Command: java -jar app.jar
+  ==========================================
+  [Application output here]
+
+================================================================================
+CONFIGURATION OPTIONS
+================================================================================
+
+In fuzz-configs.sh:
+------------------
+MAX_PARAMS_PER_RUN=10    # Max parameters per iteration
+LOG_DIR="./fuzz-logs"    # Log directory location
+timeout 30s              # Timeout per run (line ~245)
+
+In fuzz-ci.sh:
+-------------
+FUZZ_ITERATIONS          # Number of iterations (default: 20)
+FUZZ_JAVA_CMD            # Command to run (default: java -jar app.jar)
+FUZZ_FAILURE_THRESHOLD   # Max % failures allowed (default: 10)
+
+================================================================================
+PREREQUISITES
+================================================================================
+
+Required:
+  - Bash 4.0+
+  - jq (JSON processor)
+  - timeout command (usually pre-installed)
+
+Optional:
+  - Java application with dd-java-agent
+  - CI/CD environment for automated testing
+
+Installation:
+  macOS:          brew install jq
+  Ubuntu/Debian:  sudo apt-get install jq
+  CentOS/RHEL:    sudo yum install jq
+
+================================================================================
+TIPS FOR EFFECTIVE FUZZING
+================================================================================
+
+1. Start small (5-10 iterations) to verify setup
+2. Gradually increase to 50-100+ for thorough testing
+3. Monitor application logs and metrics during fuzzing
+4. Use analyze-fuzz-logs.sh to identify failure patterns
+5. Run overnight with 1000+ iterations for stress testing
+6. Integrate into CI/CD for continuous testing
+7. Document any configuration issues discovered
+8. Share findings with your team
+9. Keep fuzzer updated with new parameters
+10. Run regularly, especially after configuration changes
+
+================================================================================
+STATISTICS
+================================================================================
+
+Configuration Parameters: 1,384
+Scripts Created: 5
+Documentation Files: 3
+Total Lines of Code: ~950
+Total Documentation: ~500 lines
+Parameter Coverage: 100% of metadata/supported-configurations.json
+
+Type Detection Patterns:
+  - Boolean: 15+ patterns (ENABLED, DEBUG, COLLECT, etc.)
+  - Integer: 10+ patterns (PORT, TIMEOUT, SIZE, LIMIT, etc.)
+  - Float: 5+ patterns (RATE, SAMPLE_RATE, INTERVAL, etc.)
+  - String: 20+ specific patterns (ENV, SERVICE, HOST, URL, etc.)
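+
+ILLUSTRATIVE SNIPPET:
+--------------------
+A sketch of how pattern-based detection maps a parameter name to a value
+(the real logic lives in generate_value() and related functions in
+fuzz-configs.sh and may differ in detail):
+
+  case "$param_name" in
+    *ENABLED*|*DEBUG*) value=$([ $((RANDOM % 2)) -eq 0 ] && echo true || echo false) ;;
+    *PORT*)            value=$(( (RANDOM % 64512) + 1024 )) ;;  # 1024-65535
+    *SAMPLE_RATE*)     value="0.$((RANDOM % 100))" ;;           # ~0.0-1.0
+    *)                 value=$(generate_string "$param_name") ;;
+  esac
+  export "$param_name=$value"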
+ +================================================================================ +SUPPORT & TROUBLESHOOTING +================================================================================ + +Common Issues: +------------- +1. "jq: command not found" → Install jq +2. "Permission denied" → chmod +x *.sh +3. All runs timeout → Increase timeout or check app startup +4. High failure rate → Review app logs, reduce parameters per run + +Documentation: +------------- +- FUZZ_QUICKSTART.md: Quick reference guide +- FUZZ_README.md: Comprehensive documentation +- Individual log files: Detailed run information + +Resources: +--------- +- DD-Trace-Java docs: https://docs.datadoghq.com/tracing/trace_collection/library_config/java/ +- Configuration reference: metadata/supported-configurations.json +- Source code: All scripts are well-commented + +================================================================================ +NEXT STEPS +================================================================================ + +1. ✓ Run ./example-fuzz.sh to verify installation +2. ✓ Review FUZZ_QUICKSTART.md for quick start +3. ✓ Test with your actual Java application +4. ✓ Analyze results with ./analyze-fuzz-logs.sh +5. ✓ Integrate into CI/CD with fuzz-ci.sh +6. ✓ Scale up iterations for thorough testing +7. ✓ Document and share findings +8. ✓ Run regularly for continuous validation + +================================================================================ +END OF SUMMARY +================================================================================ diff --git a/FUZZ_QUICKSTART.md b/FUZZ_QUICKSTART.md new file mode 100644 index 00000000000..697915bc3b5 --- /dev/null +++ b/FUZZ_QUICKSTART.md @@ -0,0 +1,236 @@ +# DD-Trace-Java Configuration Fuzzer - Quick Start Guide + +## What Was Created + +This fuzzing toolset helps you test your dd-trace-java application with randomized configurations to identify potential issues. + +### Files Created + +1. **`fuzz-configs.sh`** - Main fuzzer script + - Generates random but sensible configuration values + - Runs your app with different parameter combinations + - Logs all runs with full details + +2. **`analyze-fuzz-logs.sh`** - Log analyzer + - Analyzes fuzzing results + - Identifies failure patterns + - Provides statistics and recommendations + +3. **`example-fuzz.sh`** - Quick start example + - Demonstrates basic usage + - Checks prerequisites + - Runs a simple test + +4. **`FUZZ_README.md`** - Comprehensive documentation + - Detailed usage instructions + - Parameter type detection + - Troubleshooting guide + +## Quick Start + +### 1. Prerequisites + +Install `jq` (JSON processor): + +```bash +# macOS +brew install jq + +# Ubuntu/Debian +sudo apt-get install jq +``` + +### 2. Run Your First Test + +```bash +# Simple test (5 iterations with echo command) +./example-fuzz.sh + +# Or directly with your Java app: +./fuzz-configs.sh 10 "java -javaagent:dd-java-agent.jar -jar myapp.jar" +``` + +### 3. 
Analyze Results + +```bash +./analyze-fuzz-logs.sh +``` + +## Usage Examples + +### Basic Testing +```bash +# Run 10 iterations +./fuzz-configs.sh 10 "java -jar app.jar" +``` + +### With DD Agent +```bash +# Test with the datadog agent jar +./fuzz-configs.sh 20 "java -javaagent:./dd-java-agent/build/libs/dd-java-agent.jar -jar myapp.jar" +``` + +### Spring Boot Application +```bash +# Test Spring Boot app +./fuzz-configs.sh 15 "java -javaagent:dd-java-agent.jar -jar target/spring-app-1.0.0.jar" +``` + +### Custom Script +```bash +# Test with your startup script +./fuzz-configs.sh 30 "./start-my-app.sh" +``` + +## What the Fuzzer Does + +For each iteration, it: + +1. **Selects** 1-10 random configuration parameters (from 1384+ available) +2. **Generates** appropriate values based on parameter type: + - Booleans: `true`, `false`, `1`, `0` + - Ports: `1024-65535` + - Timeouts: `100-30000ms` + - Sample rates: `0.0-1.0` + - Strings: Realistic values (URLs, paths, service names, etc.) +3. **Runs** your application with those settings +4. **Logs** everything (config + output) +5. **Reports** success/failure/timeout + +## Understanding Output + +### During Run +``` +Iteration 5 of 10 +================================================================== +Selected 7 random parameters: + DD_TRACE_ENABLED = true + DD_SERVICE = my-service + DD_ENV = production + DD_AGENT_PORT = 8126 + DD_TRACE_SAMPLE_RATE = 0.75 + DD_PROFILING_ENABLED = true + DD_LOGS_INJECTION = false + +Running application... +✓ Iteration 5 completed successfully +``` + +### Summary +``` +Fuzzing Complete - Summary +================================================================== +Total iterations: 10 +Successful runs: 9 +Failed runs: 1 +Timeout runs: 0 +Logs directory: ./fuzz-logs +``` + +## Analyzing Logs + +### Check Individual Runs +```bash +# View a specific log +cat fuzz-logs/fuzz_run_5_20241128_143052.log + +# Find failed runs +grep -l "exit code" fuzz-logs/*.log +``` + +### Use the Analyzer +```bash +./analyze-fuzz-logs.sh +``` + +This shows: +- Success/failure statistics +- Most frequently used parameters +- Recent runs summary +- Recommendations + +## Configuration Types Detected + +The fuzzer intelligently detects parameter types: + +| Parameter Pattern | Generated Values | Examples | +|------------------|------------------|----------| +| `*_ENABLED`, `*_DEBUG` | Boolean | `true`, `false`, `1`, `0` | +| `*_PORT` | Port number | `1024-65535` | +| `*_TIMEOUT`, `*_DELAY` | Milliseconds | `100-30000` | +| `*_SAMPLE_RATE` | Float | `0.0-1.0` | +| `DD_ENV` | Environment | `production`, `staging`, `development` | +| `DD_SERVICE` | Service name | `my-service`, `web-app`, `api-gateway` | +| `*_HOST*` | Hostname | `localhost`, `127.0.0.1`, IPs | +| `*_URL`, `*_ENDPOINT` | URL | `http://localhost:8080`, etc. | +| `*_PATH`, `*_FILE` | Path | `/tmp/test`, `/var/log/app` | +| `*_KEY`, `*_TOKEN` | Hex string | Random hex | +| `*_TAGS` | Tag list | `key1:value1,key2:value2` | + +## Tips for Effective Fuzzing + +1. **Start Small**: Begin with 5-10 iterations to verify setup +2. **Increase Gradually**: Scale up to 50-100 iterations for thorough testing +3. **Monitor**: Watch app logs and metrics during fuzzing +4. **Analyze Failures**: Use `analyze-fuzz-logs.sh` to identify patterns +5. **CI/CD Integration**: Run fuzzing in your pipeline +6. 
**Long-Running**: Consider overnight fuzz runs with 1000+ iterations + +## Common Issues + +### "jq: command not found" +Install jq using your package manager (see Prerequisites) + +### All runs timeout +- Increase timeout in `fuzz-configs.sh` (search for `timeout 30s`) +- Check if your app is starting correctly +- Verify your command is correct + +### Permission denied +```bash +chmod +x fuzz-configs.sh analyze-fuzz-logs.sh example-fuzz.sh +``` + +### Want to test specific parameters +Edit `fuzz-configs.sh` and modify the parameter selection logic or create a focused test script + +## Next Steps + +1. ✅ Run `./example-fuzz.sh` to verify everything works +2. ✅ Test with your actual Java application +3. ✅ Analyze logs with `./analyze-fuzz-logs.sh` +4. ✅ Adjust parameters/iterations based on findings +5. ✅ Integrate into CI/CD pipeline +6. ✅ Document any configuration issues you discover + +## Advanced Usage + +### Parallel Testing +```bash +# Run multiple fuzzer instances +./fuzz-configs.sh 50 "java -jar app.jar" & +./fuzz-configs.sh 50 "java -jar app.jar" & +wait +``` + +### Custom Parameter Ranges +Edit `generate_integer()` or `generate_string()` functions in `fuzz-configs.sh` + +### Integration Test Mode +```bash +# Run with health check +./fuzz-configs.sh 20 "java -jar app.jar && curl http://localhost:8080/health" +``` + +## Support + +- See `FUZZ_README.md` for comprehensive documentation +- Check logs in `fuzz-logs/` for debugging +- Review dd-trace-java documentation at https://docs.datadoghq.com/tracing/trace_collection/library_config/java/ + +--- + +**Total Configurations Available**: 1384+ parameters from `metadata/supported-configurations.json` + +**Fuzzer Version**: 1.0.0 + diff --git a/FUZZ_README.md b/FUZZ_README.md new file mode 100644 index 00000000000..2eab3379eb8 --- /dev/null +++ b/FUZZ_README.md @@ -0,0 +1,237 @@ +# DD-Trace-Java Configuration Fuzzer + +A bash script that performs fuzz testing on dd-trace-java by generating random but sensible configuration combinations. + +## Overview + +This fuzzer automatically: +- Reads all available configuration parameters from `metadata/supported-configurations.json` +- Generates intelligent random values based on parameter name patterns +- Runs your Java application with different configuration combinations +- Logs all runs with their configurations and outcomes +- Provides detailed statistics at the end + +## Features + +- **Intelligent Value Generation**: The fuzzer analyzes parameter names to generate appropriate values: + - Boolean parameters (`ENABLED`, `DEBUG`, etc.) → `true`, `false`, `1`, `0` + - Port numbers → `1024-65535` + - Timeouts/delays → `100-30000ms` + - Sample rates → `0.0-1.0` + - URLs, paths, service names, tags, etc. 
with realistic values
+
+- **Configurable Parameters Per Run**: Maximum 10 parameters per run (configurable)
+- **Comprehensive Logging**: Each run is logged with full configuration and output
+- **Timeout Protection**: 30-second timeout per run to prevent hangs
+- **Statistics**: Summary of successful/failed/timeout runs
+
+## Prerequisites
+
+- Bash 4.0+
+- `jq` (JSON processor)
+- `timeout` command (usually pre-installed on Linux/macOS)
+
+Install jq if needed:
+```bash
+# macOS
+brew install jq
+
+# Ubuntu/Debian
+sudo apt-get install jq
+
+# CentOS/RHEL
+sudo yum install jq
+```
+
+## Usage
+
+### Basic Usage
+
+```bash
+./fuzz-configs.sh <iterations> "<command>"
+```
+
+### Examples
+
+#### Example 1: Test with a simple Java application
+```bash
+./fuzz-configs.sh 10 "java -javaagent:dd-java-agent/build/libs/dd-java-agent.jar -jar myapp.jar"
+```
+
+#### Example 2: Test with Spring Boot application
+```bash
+./fuzz-configs.sh 20 "java -javaagent:./dd-java-agent.jar -jar target/spring-boot-app.jar"
+```
+
+#### Example 3: Test with a script that starts your app
+```bash
+./fuzz-configs.sh 50 "./start-app.sh"
+```
+
+#### Example 4: Just print configurations (testing mode)
+```bash
+./fuzz-configs.sh 5 "echo 'Testing configuration'"
+```
+
+#### Example 5: Run with custom JVM options
+```bash
+./fuzz-configs.sh 15 "java -Xmx2g -javaagent:dd-java-agent.jar -jar app.jar"
+```
+
+## Output
+
+The fuzzer creates a `fuzz-logs` directory containing:
+- Individual log files for each iteration
+- Configuration used for each run
+- Application output/errors
+- Exit codes
+
+### Sample Log File Content
+
+```
+# Fuzz Iteration 1
+# Timestamp: 20241128_143052
+# Configuration:
+DD_TRACE_ENABLED=true
+DD_SERVICE=my-service
+DD_ENV=production
+DD_AGENT_PORT=8126
+DD_TRACE_SAMPLE_RATE=0.75
+
+# Environment Exports:
+export DD_TRACE_ENABLED='true'
+export DD_SERVICE='my-service'
+export DD_ENV='production'
+export DD_AGENT_PORT='8126'
+export DD_TRACE_SAMPLE_RATE='0.75'
+
+# Command: java -jar myapp.jar
+==========================================
+
+[Application output here...]
+``` + +## Configuration + +You can modify these variables in the script: + +```bash +MAX_PARAMS_PER_RUN=10 # Maximum parameters per iteration +LOG_DIR="./fuzz-logs" # Log directory +``` + +## Parameter Type Detection + +The fuzzer intelligently detects parameter types based on naming patterns: + +| Pattern | Type | Example Values | +|---------|------|----------------| +| `*ENABLED`, `*DEBUG` | Boolean | `true`, `false`, `1`, `0` | +| `*PORT` | Integer | `1024-65535` | +| `*TIMEOUT`, `*DELAY` | Integer (ms) | `100-30000` | +| `*SIZE`, `*LIMIT`, `*MAX*` | Integer | `10`, `100`, `1000`, `5000` | +| `*SAMPLE_RATE`, `*_RATE` | Float | `0.0-1.0` | +| `DD_ENV` | String | `production`, `staging`, `development` | +| `DD_SERVICE` | String | Service names | +| `*HOST*` | String | Hostnames/IPs | +| `*URL`, `*ENDPOINT` | String | URLs | +| `*PATH`, `*FILE` | String | File paths | +| `*KEY`, `*TOKEN` | String | Random hex strings | +| `*TAGS` | String | Comma-separated tags | +| `*PROPAGATION_STYLE` | String | `datadog`, `b3`, `tracecontext` | + +## Statistics Summary + +After all iterations, you'll see a summary like: + +``` +================================================================== + Fuzzing Complete - Summary +================================================================== +Total iterations: 50 +Successful runs: 45 +Failed runs: 3 +Timeout runs: 2 +Logs directory: ./fuzz-logs +``` + +## Exit Codes + +- `0`: All runs completed without failures +- `1`: One or more runs failed (check logs) + +## Tips + +1. **Start Small**: Begin with 5-10 iterations to ensure everything works +2. **Review Logs**: Check `fuzz-logs/` for any issues or unexpected behavior +3. **Adjust Timeout**: Modify the `timeout 30s` in the script if your app needs more time to start +4. **Continuous Testing**: Run this regularly in CI/CD to catch configuration issues early +5. **Combine with Monitoring**: Watch application metrics during fuzzing to catch subtle issues + +## Advanced Usage + +### Custom Value Ranges + +Edit the `generate_integer()` or `generate_string()` functions to customize value ranges for specific parameters. + +### Integration with CI/CD + +```bash +#!/bin/bash +# In your CI pipeline +if ! ./fuzz-configs.sh 100 "java -jar app.jar"; then + echo "Fuzz testing failed!" + exit 1 +fi +``` + +### Parallel Execution + +Run multiple fuzzer instances in parallel: + +```bash +./fuzz-configs.sh 50 "java -jar app.jar" & +./fuzz-configs.sh 50 "java -jar app.jar" & +wait +``` + +## Troubleshooting + +### Issue: "jq: command not found" +**Solution**: Install jq using your package manager (see Prerequisites) + +### Issue: Script hangs +**Solution**: The 30-second timeout should prevent this. If it persists, check your application's shutdown behavior. + +### Issue: All runs timeout +**Solution**: Increase the timeout value in the `run_fuzz_iteration()` function or check if your application is starting correctly. + +### Issue: Permission denied +**Solution**: Make sure the script is executable: `chmod +x fuzz-configs.sh` + +## Known Limitations + +- Some parameter combinations might not be compatible (e.g., conflicting settings) +- Generated values are random but may not cover all edge cases +- File paths and URLs may not point to actual resources +- Some configurations require specific formats not captured by simple pattern matching + +## Contributing + +To add support for new parameter types: + +1. Edit the `generate_value()` function +2. Add pattern matching for your parameter type +3. 
Implement value generation logic in the appropriate `generate_*()` function
+
+## License
+
+This script is part of the dd-trace-java project. Use according to the project's license.
+
+## Support
+
+For issues or questions:
+- Check the logs in `fuzz-logs/`
+- Review the dd-trace-java documentation
+- Open an issue in the dd-trace-java repository
+
diff --git a/analyze-fuzz-logs.sh b/analyze-fuzz-logs.sh
new file mode 100755
index 00000000000..2ac2e720cf5
--- /dev/null
+++ b/analyze-fuzz-logs.sh
@@ -0,0 +1,179 @@
+#!/bin/bash
+
+##############################################################################
+# DD-Trace-Java Fuzzer Log Analyzer
+#
+# Analyzes fuzz test logs to identify patterns in failures and provide
+# insights into which configurations might be causing issues.
+##############################################################################
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LOG_DIR="${SCRIPT_DIR}/fuzz-logs"
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+echo -e "${BLUE}==================================================================${NC}"
+echo -e "${BLUE}           DD-Trace-Java Fuzzer Log Analyzer${NC}"
+echo -e "${BLUE}==================================================================${NC}"
+echo ""
+
+# Check if log directory exists
+if [ ! -d "$LOG_DIR" ]; then
+    echo -e "${RED}Error: Log directory not found: $LOG_DIR${NC}"
+    echo 'Run the fuzzer first: ./fuzz-configs.sh <iterations> "<command>"'
+    exit 1
+fi
+
+# Count log files
+LOG_COUNT=$(find "$LOG_DIR" -name "fuzz_run_*.log" | wc -l)
+
+if [ "$LOG_COUNT" -eq 0 ]; then
+    echo -e "${RED}No log files found in $LOG_DIR${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}Found $LOG_COUNT fuzz run logs${NC}"
+echo ""
+
+##############################################################################
+# Analyze runs
+##############################################################################
+
+echo -e "${BLUE}Analyzing runs...${NC}"
+echo ""
+
+successful_runs=0
+failed_runs=0
+total_params_used=()
+all_params=()
+
+for log_file in "$LOG_DIR"/fuzz_run_*.log; do
+    # Check for success indicators in the log
+    if grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then
+        # Plain assignment instead of ((var++)): the arithmetic command
+        # returns non-zero when the variable is 0, which would abort the
+        # script under set -e
+        successful_runs=$((successful_runs + 1))
+    else
+        failed_runs=$((failed_runs + 1))
+        echo -e "${RED}Failed run: $(basename "$log_file")${NC}"
+
+        # Extract and display the configuration that failed
+        echo -e "${YELLOW}Configuration:${NC}"
+        grep "^DD_" "$log_file" | grep -v "^#" | head -20
+        echo ""
+    fi
+
+    # Extract parameter names
+    params=$(grep "^DD_" "$log_file" | grep -v "^#" | cut -d'=' -f1)
+    for param in $params; do
+        all_params+=("$param")
+    done
+
+    param_count=$(echo "$params" | wc -l)
+    total_params_used+=($param_count)
+done
+
+##############################################################################
+# Statistics
+##############################################################################
+
+echo -e "${BLUE}==================================================================${NC}"
+echo -e "${BLUE}                        Statistics${NC}"
+echo -e "${BLUE}==================================================================${NC}"
+
+echo -e "Total runs:       ${BLUE}$LOG_COUNT${NC}"
+echo -e "Successful runs:  ${GREEN}$successful_runs${NC}"
+echo -e "Failed runs:      ${RED}$failed_runs${NC}"
+
+if [ $LOG_COUNT -gt 0 ]; then
+    success_rate=$((successful_runs * 100 / LOG_COUNT))
+    echo -e "Success rate:     ${GREEN}${success_rate}%${NC}"
+fi
+
+echo ""
+
+##############################################################################
+# 
Parameter frequency analysis +############################################################################## + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} Most Frequently Used Parameters${NC}" +echo -e "${BLUE}==================================================================${NC}" + +if [ ${#all_params[@]} -gt 0 ]; then + # Count parameter occurrences + printf '%s\n' "${all_params[@]}" | sort | uniq -c | sort -rn | head -20 | while read count param; do + echo -e " ${GREEN}$count${NC} times: $param" + done +else + echo "No parameters found in logs" +fi + +echo "" + +############################################################################## +# Recommendations +############################################################################## + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} Recommendations${NC}" +echo -e "${BLUE}==================================================================${NC}" + +if [ $failed_runs -eq 0 ]; then + echo -e "${GREEN}✓ All runs completed successfully!${NC}" + echo "" + echo "Consider:" + echo " - Increasing the number of iterations" + echo " - Testing with more parameters per run" + echo " - Running with your actual application under load" +elif [ $failed_runs -lt $((LOG_COUNT / 10)) ]; then + echo -e "${YELLOW}⚠ Less than 10% of runs failed${NC}" + echo "" + echo "Actions:" + echo " 1. Review the failed run logs above" + echo " 2. Check for common parameters across failures" + echo " 3. Consider if failures are due to incompatible parameter combinations" +else + echo -e "${RED}⚠ More than 10% of runs failed${NC}" + echo "" + echo "Urgent actions:" + echo " 1. Review application logs for errors" + echo " 2. Check if specific parameters are causing issues" + echo " 3. Verify the application can start with basic configurations" + echo " 4. 
Consider running with fewer parameters per iteration" +fi + +echo "" + +############################################################################## +# Recent runs +############################################################################## + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} Recent Runs (last 5)${NC}" +echo -e "${BLUE}==================================================================${NC}" + +find "$LOG_DIR" -name "fuzz_run_*.log" -type f -print0 | \ + xargs -0 ls -t | head -5 | while read log_file; do + echo -e "${YELLOW}$(basename "$log_file")${NC}" + echo " Timestamp: $(grep "^# Timestamp:" "$log_file" | cut -d' ' -f3)" + echo " Parameters: $(grep -c "^DD_" "$log_file" | grep -v "^#")" + + # Check status + if grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then + echo -e " Status: ${GREEN}Success${NC}" + else + echo -e " Status: ${RED}Failed/Timeout${NC}" + fi + echo "" +done + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE}For detailed analysis, review individual logs in: $LOG_DIR${NC}" +echo -e "${BLUE}==================================================================${NC}" + diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java index 542d73f7cf0..52eff743814 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java @@ -1,5 +1,6 @@ package datadog.trace.common.writer; +import com.antithesis.sdk.Assert; import datadog.communication.monitor.Monitoring; import datadog.communication.monitor.Recording; import datadog.communication.serialization.ByteBufferConsumer; @@ -107,15 +108,51 @@ public void accept(int messageCount, ByteBuffer buffer) { Payload payload = newPayload(messageCount, buffer); final int sizeInBytes = payload.sizeInBytes(); healthMetrics.onSerialize(sizeInBytes); + + // Antithesis: Track all send attempts + Assert.sometimes( + true, + "trace_payloads_being_sent", + java.util.Map.of( + "trace_count", messageCount, + "payload_size_bytes", sizeInBytes, + "dropped_traces_in_payload", payload.droppedTraces(), + "dropped_spans_in_payload", payload.droppedSpans() + ) + ); + RemoteApi.Response response = api.sendSerializedTraces(payload); mapper.reset(); if (response.success()) { + // Antithesis: Track successful sends + Assert.sometimes( + true, + "traces_sent_successfully", + java.util.Map.of( + "decision", "sent_success", + "trace_count", messageCount, + "payload_size_bytes", sizeInBytes, + "http_status", response.status() + ) + ); if (log.isDebugEnabled()) { log.debug("Successfully sent {} traces to the API", messageCount); } healthMetrics.onSend(messageCount, sizeInBytes, response); } else { + // Antithesis: Track failed sends + Assert.sometimes( + true, + "traces_failed_to_send", + java.util.Map.of( + "decision", "dropped_send_failed", + "trace_count", messageCount, + "payload_size_bytes", sizeInBytes, + "http_status", response.status(), + "has_exception", response.exception() != null + ) + ); if (log.isDebugEnabled()) { log.debug( "Failed to send {} traces of size {} bytes to the API", messageCount, sizeInBytes); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java 
b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java index 90008cad0a0..d508bf86343 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java @@ -3,6 +3,7 @@ import static datadog.trace.api.sampling.PrioritySampling.UNSET; import static java.util.concurrent.TimeUnit.MINUTES; +import com.antithesis.sdk.Assert; import datadog.trace.core.DDSpan; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.relocate.api.RatelimitedLogger; @@ -68,6 +69,15 @@ protected RemoteWriter( @Override public void write(final List trace) { if (closed) { + // Antithesis: Track traces dropped during shutdown + Assert.sometimes( + true, + "trace_dropped_writer_closed", + java.util.Map.of( + "decision", "dropped_shutdown", + "span_count", trace.size() + ) + ); // We can't add events after shutdown otherwise it will never complete shutting down. log.debug("Dropped due to shutdown: {}", trace); handleDroppedTrace(trace); @@ -80,6 +90,17 @@ public void write(final List trace) { final int samplingPriority = root.samplingPriority(); switch (traceProcessingWorker.publish(root, samplingPriority, trace)) { case ENQUEUED_FOR_SERIALIZATION: + // Antithesis: Track traces enqueued for sending + Assert.sometimes( + true, + "trace_enqueued_for_send", + java.util.Map.of( + "decision", "enqueued", + "trace_id", root.getTraceId().toString(), + "span_count", trace.size(), + "sampling_priority", samplingPriority + ) + ); log.debug("Enqueued for serialization: {}", trace); healthMetrics.onPublish(trace, samplingPriority); break; @@ -87,10 +108,32 @@ public void write(final List trace) { log.debug("Enqueued for single span sampling: {}", trace); break; case DROPPED_BY_POLICY: + // Antithesis: Track traces dropped by policy + Assert.sometimes( + true, + "trace_dropped_by_policy", + java.util.Map.of( + "decision", "dropped_policy", + "trace_id", root.getTraceId().toString(), + "span_count", trace.size(), + "sampling_priority", samplingPriority + ) + ); log.debug("Dropped by the policy: {}", trace); handleDroppedTrace(trace); break; case DROPPED_BUFFER_OVERFLOW: + // Antithesis: Track traces dropped due to buffer overflow + Assert.sometimes( + true, + "trace_dropped_buffer_overflow", + java.util.Map.of( + "decision", "dropped_buffer_overflow", + "trace_id", root.getTraceId().toString(), + "span_count", trace.size(), + "sampling_priority", samplingPriority + ) + ); if (log.isDebugEnabled()) { log.debug("Dropped due to a buffer overflow: {}", trace); } else { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java index 28db0b6c99e..a6b188d1224 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java @@ -19,6 +19,8 @@ import static java.util.concurrent.TimeUnit.NANOSECONDS; import static java.util.concurrent.TimeUnit.SECONDS; +import com.antithesis.sdk.Assert; + import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import datadog.communication.ddagent.ExternalAgentLauncher; import datadog.communication.ddagent.SharedCommunicationObjects; @@ -1246,8 +1248,30 @@ void write(final List trace) { spanToSample.forceKeep(forceKeep); boolean published = forceKeep || traceCollector.sample(spanToSample); if (published) { + // Antithesis: Track traces accepted by sampling + Assert.sometimes( + true, + "trace_accepted_by_sampling", 
+ java.util.Map.of( + "decision", "accepted", + "trace_id", writtenTrace.get(0).getTraceId().toString(), + "span_count", writtenTrace.size(), + "sampling_priority", spanToSample.samplingPriority() + ) + ); writer.write(writtenTrace); } else { + // Antithesis: Track traces dropped by sampling + Assert.sometimes( + true, + "trace_dropped_by_sampling", + java.util.Map.of( + "decision", "dropped_sampling", + "trace_id", writtenTrace.get(0).getTraceId().toString(), + "span_count", writtenTrace.size(), + "sampling_priority", spanToSample.samplingPriority() + ) + ); // with span streaming this won't work - it needs to be changed // to track an effective sampling rate instead, however, tests // checking that a hard reference on a continuation prevents diff --git a/example-export-only.sh b/example-export-only.sh new file mode 100755 index 00000000000..2823d8c5c5c --- /dev/null +++ b/example-export-only.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +############################################################################## +# Example: Using fuzz-configs.sh in export-only mode +# +# This script demonstrates how to call fuzz-configs.sh from another script +# and use the exported environment variables. +############################################################################## + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "======================================================================" +echo "Example: Using Fuzz Configs in Export-Only Mode" +echo "======================================================================" +echo "" + +# Enable export-only mode +export FUZZ_EXPORT_ONLY=true + +# Run the fuzzer once to export variables (no command execution) +echo "Step 1: Exporting random configuration variables..." +echo "----------------------------------------------------------------------" +source "${SCRIPT_DIR}/fuzz-configs.sh" 1 "" + +echo "" +echo "======================================================================" +echo "Step 2: Using the exported variables" +echo "======================================================================" +echo "" + +# Now you can use the exported variables +echo "Example 1: List all DD_ variables that were exported:" +env | grep "^DD_" | head -20 +echo "" + +echo "Example 2: Run your command with the exported variables:" +echo "java -javaagent:dd-java-agent.jar -jar myapp.jar" +echo "" + +echo "Example 3: Or run multiple commands with the same configuration:" +echo "----------------------------------------------------------------------" +echo "Command 1: Check configuration" +env | grep "^DD_" | wc -l +echo "Total DD_ variables exported: $(env | grep "^DD_" | wc -l)" + +echo "" +echo "Command 2: You can now run your Java application" +echo "(Skipping actual execution for this demo)" +# java -javaagent:dd-java-agent.jar -jar myapp.jar + +echo "" +echo "======================================================================" +echo "Complete!" +echo "======================================================================" +echo "" +echo "The exported variables remain in your shell environment until you" +echo "unset them or the shell session ends." +echo "" +echo "To use this pattern in your own script:" +echo " 1. Set: export FUZZ_EXPORT_ONLY=true" +echo " 2. Source: source ./fuzz-configs.sh 1 \"\"" +echo " 3. Use the DD_* environment variables as needed" +echo " 4. 
Run your Java application with the exported configs" +echo "" + diff --git a/example-fuzz.sh b/example-fuzz.sh new file mode 100755 index 00000000000..6770024d6d3 --- /dev/null +++ b/example-fuzz.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +############################################################################## +# Example script showing how to use the dd-trace-java fuzzer +# This creates a minimal test application for demonstration +############################################################################## + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "DD-Trace-Java Fuzzer - Quick Start Example" +echo "==========================================" +echo "" + +# Check if fuzz-configs.sh exists +if [ ! -f "${SCRIPT_DIR}/fuzz-configs.sh" ]; then + echo "Error: fuzz-configs.sh not found in ${SCRIPT_DIR}" + exit 1 +fi + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is not installed. Please install it first:" + echo " macOS: brew install jq" + echo " Ubuntu/Debian: sudo apt-get install jq" + echo " CentOS/RHEL: sudo yum install jq" + exit 1 +fi + +echo "Running fuzzer with 5 test iterations..." +echo "" +echo "This will generate random dd-trace-java configurations and" +echo "run a simple echo command to demonstrate the fuzzer." +echo "" +echo "For real testing, replace the command with your Java application:" +echo " ./fuzz-configs.sh 10 'java -javaagent:dd-java-agent.jar -jar myapp.jar'" +echo "" +echo "Starting in 3 seconds..." +sleep 3 + +# Run the fuzzer with a simple echo command for demonstration +"${SCRIPT_DIR}/fuzz-configs.sh" 5 "echo 'Test run completed with above configuration'" + +echo "" +echo "==========================================" +echo "Example completed!" +echo "" +echo "Check the fuzz-logs/ directory for detailed logs of each run." +echo "" +echo "Next steps:" +echo "1. Review FUZZ_README.md for detailed documentation" +echo "2. Run with your actual Java application" +echo "3. Analyze the logs to identify any configuration issues" +echo "" + diff --git a/example-use-export-vars.sh b/example-use-export-vars.sh new file mode 100755 index 00000000000..461e7ecd00e --- /dev/null +++ b/example-use-export-vars.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +############################################################################## +# Example: Using fuzz-export-vars.sh to export random configurations +# +# This demonstrates how to export DD configuration variables from another +# script and use them to run your Java application. 
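+# The export statements are applied with eval, so the variables take
+# effect in the current shell session.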
+############################################################################## + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "======================================================================" +echo "Example: Export Random DD Configurations" +echo "======================================================================" +echo "" + +echo "Step 1: Export random configuration variables" +echo "----------------------------------------------------------------------" + +# Export variables using eval +eval "$(${SCRIPT_DIR}/fuzz-export-vars.sh)" + +echo "" +echo "======================================================================" +echo "Step 2: View exported variables" +echo "======================================================================" +echo "" + +# Show all exported DD_ variables +echo "Exported DD_ configuration variables:" +env | grep "^DD_" | sort +echo "" +echo "Total: $(env | grep "^DD_" | wc -l | tr -d " ") DD_ variables exported" +echo "" + +echo "======================================================================" +echo "Step 3: Run your Java application" +echo "======================================================================" +echo "" + +echo "Now you can run your Java application with these configurations:" +echo "" +echo " java -javaagent:dd-java-agent.jar -jar myapp.jar" +echo "" +echo "Or any other command that uses DD_ environment variables." +echo "" + +echo "======================================================================" +echo "Additional Examples" +echo "======================================================================" +echo "" + +echo "1. Export and run immediately:" +echo " eval \"\$(./fuzz-export-vars.sh)\" && java -jar myapp.jar" +echo "" + +echo "2. Export specific number of parameters:" +echo " FUZZ_MAX_PARAMS=5 eval \"\$(./fuzz-export-vars.sh)\"" +echo "" + +echo "3. Use in a loop for multiple test runs:" +echo " for i in {1..10}; do" +echo " unset \$(env | grep '^DD_' | cut -d'=' -f1)" +echo " eval \"\$(./fuzz-export-vars.sh)\"" +echo " java -jar myapp.jar" +echo " done" +echo "" + +echo "======================================================================" +echo "Complete!" +echo "======================================================================" + diff --git a/fuzz-ci.sh b/fuzz-ci.sh new file mode 100755 index 00000000000..4f310893c6b --- /dev/null +++ b/fuzz-ci.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +############################################################################## +# DD-Trace-Java Fuzzer - CI/CD Integration Script +# +# This script can be used in CI/CD pipelines to run configuration fuzzing +# as part of automated testing. 
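+# Tunables (environment variables): FUZZ_ITERATIONS, FUZZ_JAVA_CMD, and
+# FUZZ_FAILURE_THRESHOLD; defaults are set below.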
+############################################################################## + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Configuration - adjust these for your CI environment +ITERATIONS="${FUZZ_ITERATIONS:-20}" +JAVA_CMD="${FUZZ_JAVA_CMD:-java -jar app.jar}" +FAILURE_THRESHOLD="${FUZZ_FAILURE_THRESHOLD:-10}" # Maximum % of failures allowed + +# Colors (disabled in non-interactive mode) +if [ -t 1 ]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[1;33m' + BLUE='\033[0;34m' + NC='\033[0m' +else + RED='' + GREEN='' + YELLOW='' + BLUE='' + NC='' +fi + +echo "==========================================" +echo "DD-Trace-Java Fuzzer - CI/CD Mode" +echo "==========================================" +echo "" +echo "Configuration:" +echo " Iterations: $ITERATIONS" +echo " Command: $JAVA_CMD" +echo " Failure threshold: ${FAILURE_THRESHOLD}%" +echo "" + +# Check prerequisites +if ! command -v jq &> /dev/null; then + echo -e "${RED}Error: jq is not installed${NC}" + echo "Install it before running this script:" + echo " Ubuntu/Debian: apt-get install jq" + echo " CentOS/RHEL: yum install jq" + echo " macOS: brew install jq" + exit 1 +fi + +if [ ! -f "${SCRIPT_DIR}/fuzz-configs.sh" ]; then + echo -e "${RED}Error: fuzz-configs.sh not found in ${SCRIPT_DIR}${NC}" + exit 1 +fi + +# Run fuzzer +echo "Starting fuzz testing..." +echo "" + +if "${SCRIPT_DIR}/fuzz-configs.sh" "$ITERATIONS" "$JAVA_CMD"; then + FUZZ_EXIT_CODE=0 +else + FUZZ_EXIT_CODE=$? +fi + +echo "" +echo "==========================================" +echo "Analyzing Results" +echo "==========================================" +echo "" + +# Analyze results +LOG_DIR="${SCRIPT_DIR}/fuzz-logs" +if [ ! -d "$LOG_DIR" ]; then + echo -e "${RED}Error: Log directory not found${NC}" + exit 1 +fi + +# Count successes and failures +TOTAL_LOGS=$(find "$LOG_DIR" -name "fuzz_run_*.log" | wc -l) +SUCCESSFUL_RUNS=0 +FAILED_RUNS=0 + +for log_file in "$LOG_DIR"/fuzz_run_*.log; do + if grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then + ((SUCCESSFUL_RUNS++)) + else + ((FAILED_RUNS++)) + fi +done + +if [ "$TOTAL_LOGS" -gt 0 ]; then + FAILURE_RATE=$((FAILED_RUNS * 100 / TOTAL_LOGS)) +else + FAILURE_RATE=0 +fi + +echo "Results:" +echo " Total runs: $TOTAL_LOGS" +echo " Successful: $SUCCESSFUL_RUNS" +echo " Failed: $FAILED_RUNS" +echo " Failure rate: ${FAILURE_RATE}%" +echo "" + +# Check against threshold +if [ "$FAILURE_RATE" -gt "$FAILURE_THRESHOLD" ]; then + echo -e "${RED}✗ FAILED: Failure rate (${FAILURE_RATE}%) exceeds threshold (${FAILURE_THRESHOLD}%)${NC}" + echo "" + echo "Failed runs:" + for log_file in "$LOG_DIR"/fuzz_run_*.log; do + if ! 
grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then + echo " - $(basename "$log_file")" + echo " Configuration:" + grep "^DD_" "$log_file" | grep -v "^#" | head -5 | sed 's/^/ /' + fi + done + echo "" + echo "For detailed analysis, review logs in: $LOG_DIR" + exit 1 +else + echo -e "${GREEN}✓ PASSED: Failure rate (${FAILURE_RATE}%) is within threshold (${FAILURE_THRESHOLD}%)${NC}" + + if [ "$FAILED_RUNS" -gt 0 ]; then + echo "" + echo -e "${YELLOW}Note: $FAILED_RUNS run(s) failed but within acceptable threshold${NC}" + fi + + exit 0 +fi + diff --git a/fuzz-configs.sh b/fuzz-configs.sh new file mode 100755 index 00000000000..3aa058d682a --- /dev/null +++ b/fuzz-configs.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +############################################################################## +# DD-Trace-Java Configuration Fuzzer +# +# This script generates random but sensible configuration values for +# dd-trace-java and runs your application with them for testing. +# +# Usage: ./fuzz-configs.sh +# +# Example: ./fuzz-configs.sh 10 "java -jar myapp.jar" +# +# Export-Only Mode: +# Set FUZZ_EXPORT_ONLY=true to only export variables without running command. +# This allows you to source the script from another script and use the vars. +# +# Example: FUZZ_EXPORT_ONLY=true source ./fuzz-configs.sh 1 "" +############################################################################## + +set -e + +# Configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CONFIG_FILE="${SCRIPT_DIR}/metadata/supported-configurations.json" +MAX_PARAMS_PER_RUN=10 +LOG_DIR="${SCRIPT_DIR}/fuzz-logs" +ITERATIONS="${1:-5}" +JAVA_CMD="${2:-echo 'No Java command specified. Using echo for testing'}" + +# Export-only mode: if set to "true", only exports variables without running command +EXPORT_ONLY_MODE="${FUZZ_EXPORT_ONLY:-false}" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Create log directory +mkdir -p "$LOG_DIR" + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} DD-Trace-Java Configuration Fuzzer${NC}" +echo -e "${BLUE}==================================================================${NC}" +echo "" + +# Extract all configuration keys from the JSON file +if [ ! 
-f "$CONFIG_FILE" ]; then + echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}" + exit 1 +fi + +echo -e "${YELLOW}Extracting configuration parameters...${NC}" +CONFIGS=($(jq -r '.supportedConfigurations | keys[]' "$CONFIG_FILE")) +TOTAL_CONFIGS=${#CONFIGS[@]} +echo -e "${GREEN}Found $TOTAL_CONFIGS configuration parameters${NC}" +echo "" + +############################################################################## +# Function to generate a random boolean value +############################################################################## +generate_boolean() { + local values=("true" "false" "1" "0") + echo "${values[$((RANDOM % ${#values[@]}))]}" +} + +############################################################################## +# Function to generate a random integer +############################################################################## +generate_integer() { + local param_name="$1" + + # Analyze parameter name for hints about range + if [[ "$param_name" =~ PORT ]]; then + echo $((1024 + RANDOM % 64512)) # Port range: 1024-65535 + elif [[ "$param_name" =~ TIMEOUT|DELAY ]]; then + echo $((100 + RANDOM % 30000)) # Timeout: 100-30000ms + elif [[ "$param_name" =~ SIZE|LIMIT|MAX|DEPTH ]]; then + local max_values=(10 50 100 500 1000 5000 10000) + echo "${max_values[$((RANDOM % ${#max_values[@]}))]}" + elif [[ "$param_name" =~ COUNT|NUM ]]; then + echo $((1 + RANDOM % 100)) # Count: 1-100 + elif [[ "$param_name" =~ RATE|PERCENT ]]; then + echo $((RANDOM % 101)) # Rate: 0-100 + else + echo $((RANDOM % 1000)) # Default: 0-999 + fi +} + +############################################################################## +# Function to generate a random float/rate +############################################################################## +generate_float() { + local param_name="$1" + + if [[ "$param_name" =~ RATE|SAMPLE ]]; then + # Sample rates typically 0.0-1.0 + echo "0.$((RANDOM % 100))" + elif [[ "$param_name" =~ INTERVAL ]]; then + # Intervals can be larger + echo "$((1 + RANDOM % 60)).$((RANDOM % 100))" + else + echo "$((RANDOM % 100)).$((RANDOM % 100))" + fi +} + +############################################################################## +# Function to generate a random string value +############################################################################## +generate_string() { + local param_name="$1" + + # Analyze parameter name for appropriate string type + if [[ "$param_name" =~ ^DD_ENV$ ]]; then + local envs=("production" "staging" "development" "test" "qa") + echo "${envs[$((RANDOM % ${#envs[@]}))]}" + + elif [[ "$param_name" =~ ^DD_SERVICE$ ]]; then + local services=("my-service" "web-app" "api-gateway" "microservice-${RANDOM}") + echo "${services[$((RANDOM % ${#services[@]}))]}" + + elif [[ "$param_name" =~ ^DD_VERSION$ ]]; then + echo "v$((1 + RANDOM % 3)).$((RANDOM % 10)).$((RANDOM % 20))" + + elif [[ "$param_name" =~ HOST|HOSTNAME ]]; then + local hosts=("localhost" "127.0.0.1" "agent.local" "192.168.1.100" "datadog-agent") + echo "${hosts[$((RANDOM % ${#hosts[@]}))]}" + + elif [[ "$param_name" =~ URL|ENDPOINT|URI ]]; then + local urls=("http://localhost:8080" "https://api.example.com" "http://127.0.0.1:9000" "https://agent.datadoghq.com") + echo "${urls[$((RANDOM % ${#urls[@]}))]}" + + elif [[ "$param_name" =~ PATH|FILE|DIR ]]; then + local paths=("/tmp/test" "/var/log/app" "/opt/datadog" "./config" "/etc/datadog") + echo "${paths[$((RANDOM % ${#paths[@]}))]}" + + elif [[ "$param_name" =~ KEY|TOKEN ]]; then + # Generate random hex string + 
echo "$(head -c 16 /dev/urandom | xxd -p -c 32)" + + elif [[ "$param_name" =~ LEVEL ]]; then + local levels=("DEBUG" "INFO" "WARN" "ERROR" "TRACE" "OFF") + echo "${levels[$((RANDOM % ${#levels[@]}))]}" + + elif [[ "$param_name" =~ MODE ]]; then + local modes=("full" "service" "disabled" "safe" "extended") + echo "${modes[$((RANDOM % ${#modes[@]}))]}" + + elif [[ "$param_name" =~ TAGS$ ]]; then + local tag_count=$((1 + RANDOM % 3)) + local tags=() + for ((i=0; i "$log_file" + + # Export all selected config variables using the same values + for param in "${!param_values[@]}"; do + export "${param}=${param_values[$param]}" + done + + # If in export-only mode, skip running the command + local exit_code=0 + if [ "$EXPORT_ONLY_MODE" = "true" ]; then + echo "" + echo -e "${GREEN}✓ Variables exported successfully (export-only mode)${NC}" + echo -e "${YELLOW}Note: Variables are exported in the current shell environment${NC}" + else + # Set environment variables and run command + echo "" + echo -e "${YELLOW}Running application...${NC}" + + # Run the command with timeout + if timeout 30s bash -c "$JAVA_CMD" >> "$log_file" 2>&1; then + echo -e "${GREEN}✓ Iteration $iteration completed successfully${NC}" + else + exit_code=$? + if [ $exit_code -eq 124 ]; then + echo -e "${YELLOW}⚠ Iteration $iteration timed out (30s limit)${NC}" + else + echo -e "${RED}✗ Iteration $iteration failed with exit code: $exit_code${NC}" + fi + fi + + # Clean up environment variables after running + for idx in "${selected_indices[@]}"; do + local param="${CONFIGS[$idx]}" + unset "$param" + done + fi + + echo -e "${BLUE}Log saved to: $log_file${NC}" + echo "" + + return $exit_code +} + +############################################################################## +# Main execution +############################################################################## + +# Validate iterations parameter +if ! [[ "$ITERATIONS" =~ ^[0-9]+$ ]] || [ "$ITERATIONS" -lt 1 ]; then + echo -e "${RED}Error: Invalid iterations count: $ITERATIONS${NC}" + echo "Usage: $0 " + exit 1 +fi + +echo -e "${YELLOW}Starting fuzzer with $ITERATIONS iterations${NC}" +echo -e "${YELLOW}Maximum $MAX_PARAMS_PER_RUN parameters per run${NC}" +if [ "$EXPORT_ONLY_MODE" = "true" ]; then + echo -e "${YELLOW}Mode: Export-only (variables will be exported, command will not run)${NC}" +else + echo -e "${YELLOW}Java command: $JAVA_CMD${NC}" +fi +echo "" + +# Track statistics +successful_runs=0 +failed_runs=0 +timeout_runs=0 + +# Run iterations +for ((i=1; i<=ITERATIONS; i++)); do + run_fuzz_iteration $i + exit_code=$? + + if [ $exit_code -eq 0 ]; then + ((successful_runs++)) + elif [ $exit_code -eq 124 ]; then + ((timeout_runs++)) + else + ((failed_runs++)) + fi + + # Brief pause between iterations + if [ $i -lt $ITERATIONS ]; then + sleep 2 + fi +done + +# Print summary +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} Fuzzing Complete - Summary${NC}" +echo -e "${BLUE}==================================================================${NC}" +echo -e "Total iterations: ${BLUE}$ITERATIONS${NC}" +echo -e "Successful runs: ${GREEN}$successful_runs${NC}" +echo -e "Failed runs: ${RED}$failed_runs${NC}" +echo -e "Timeout runs: ${YELLOW}$timeout_runs${NC}" +echo -e "Logs directory: ${BLUE}$LOG_DIR${NC}" +echo "" + +if [ $failed_runs -gt 0 ]; then + echo -e "${RED}⚠ Some runs failed. 
Check logs for details.${NC}" + exit 1 +else + echo -e "${GREEN}✓ All runs completed without failures!${NC}" + exit 0 +fi + diff --git a/fuzz-export-vars.sh b/fuzz-export-vars.sh new file mode 100755 index 00000000000..429720d5d3c --- /dev/null +++ b/fuzz-export-vars.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +############################################################################## +# DD-Trace-Java Configuration Fuzzer - Export Generator +# +# This script generates export statements for random dd-trace-java +# configuration parameters. Use it with eval to export variables. +# +# Usage: eval "$(./fuzz-export-vars.sh)" +# +# Example: +# eval "$(./fuzz-export-vars.sh)" +# java -javaagent:dd-java-agent.jar -jar myapp.jar +############################################################################## + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CONFIG_FILE="${SCRIPT_DIR}/metadata/supported-configurations.json" +MAX_PARAMS="${FUZZ_MAX_PARAMS:-10}" + +# Extract all configuration keys +if [ ! -f "$CONFIG_FILE" ]; then + echo "# Error: Configuration file not found: $CONFIG_FILE" >&2 + exit 1 +fi + +CONFIGS=($(jq -r '.supportedConfigurations | keys[]' "$CONFIG_FILE" 2>/dev/null)) +TOTAL_CONFIGS=${#CONFIGS[@]} + +if [ "$TOTAL_CONFIGS" -eq 0 ]; then + echo "# Error: No configurations found" >&2 + exit 1 +fi + +############################################################################## +# Value generation functions (same as fuzz-configs.sh) +############################################################################## + +generate_boolean() { + local values=("true" "false" "1" "0") + echo "${values[$((RANDOM % ${#values[@]}))]}" +} + +generate_integer() { + local param_name="$1" + if [[ "$param_name" =~ PORT ]]; then + echo $((1024 + RANDOM % 64512)) + elif [[ "$param_name" =~ TIMEOUT|DELAY ]]; then + echo $((100 + RANDOM % 30000)) + elif [[ "$param_name" =~ SIZE|LIMIT|MAX|DEPTH ]]; then + local max_values=(10 50 100 500 1000 5000 10000) + echo "${max_values[$((RANDOM % ${#max_values[@]}))]}" + elif [[ "$param_name" =~ COUNT|NUM ]]; then + echo $((1 + RANDOM % 100)) + elif [[ "$param_name" =~ RATE|PERCENT ]]; then + echo $((RANDOM % 101)) + else + echo $((RANDOM % 1000)) + fi +} + +generate_float() { + local param_name="$1" + if [[ "$param_name" =~ RATE|SAMPLE ]]; then + echo "0.$((RANDOM % 100))" + elif [[ "$param_name" =~ INTERVAL ]]; then + echo "$((1 + RANDOM % 60)).$((RANDOM % 100))" + else + echo "$((RANDOM % 100)).$((RANDOM % 100))" + fi +} + +generate_string() { + local param_name="$1" + + if [[ "$param_name" =~ ^DD_ENV$ ]]; then + local envs=("production" "staging" "development" "test" "qa") + echo "${envs[$((RANDOM % ${#envs[@]}))]}" + elif [[ "$param_name" =~ ^DD_SERVICE$ ]]; then + local services=("my-service" "web-app" "api-gateway" "microservice-${RANDOM}") + echo "${services[$((RANDOM % ${#services[@]}))]}" + elif [[ "$param_name" =~ ^DD_VERSION$ ]]; then + echo "v$((1 + RANDOM % 3)).$((RANDOM % 10)).$((RANDOM % 20))" + elif [[ "$param_name" =~ HOST|HOSTNAME ]]; then + local hosts=("localhost" "127.0.0.1" "agent.local" "192.168.1.100" "datadog-agent") + echo "${hosts[$((RANDOM % ${#hosts[@]}))]}" + elif [[ "$param_name" =~ URL|ENDPOINT|URI ]]; then + local urls=("http://localhost:8080" "https://api.example.com" "http://127.0.0.1:9000") + echo "${urls[$((RANDOM % ${#urls[@]}))]}" + elif [[ "$param_name" =~ PATH|FILE|DIR ]]; then + local paths=("/tmp/test" "/var/log/app" "/opt/datadog" "./config") + echo "${paths[$((RANDOM % ${#paths[@]}))]}" + elif [[ "$param_name" =~ 
KEY|TOKEN ]]; then + echo "$(head -c 16 /dev/urandom | xxd -p -c 32)" + elif [[ "$param_name" =~ LEVEL ]]; then + local levels=("DEBUG" "INFO" "WARN" "ERROR" "TRACE") + echo "${levels[$((RANDOM % ${#levels[@]}))]}" + elif [[ "$param_name" =~ MODE ]]; then + local modes=("full" "service" "disabled" "safe") + echo "${modes[$((RANDOM % ${#modes[@]}))]}" + elif [[ "$param_name" =~ TAGS$ ]]; then + echo "key1:value${RANDOM},key2:value${RANDOM}" + elif [[ "$param_name" =~ PROPAGATION_STYLE ]]; then + local styles=("datadog" "b3" "tracecontext" "datadog,b3") + echo "${styles[$((RANDOM % ${#styles[@]}))]}" + else + local generic=("test-value" "example" "config-${RANDOM}" "auto" "default") + echo "${generic[$((RANDOM % ${#generic[@]}))]}" + fi +} + +generate_value() { + local param_name="$1" + + if [[ "$param_name" =~ ENABLED$|^DD_TRACE_ENABLED$|DEBUG$|COLLECT|HEADER_COLLECTION|REPORTING|SPLIT_BY ]]; then + generate_boolean + elif [[ "$param_name" =~ PORT$|TIMEOUT$|DELAY$|SIZE$|LIMIT$|MAX_|DEPTH$|COUNT$|QUEUE_SIZE$|BUFFER ]]; then + generate_integer "$param_name" + elif [[ "$param_name" =~ SAMPLE_RATE$|_RATE$ ]] && [[ ! "$param_name" =~ TRACE_RATE_LIMIT ]]; then + generate_float "$param_name" + elif [[ "$param_name" =~ INTERVAL$ ]]; then + if [[ "$param_name" =~ FLUSH_INTERVAL ]]; then + generate_float "$param_name" + else + generate_integer "$param_name" + fi + else + generate_string "$param_name" + fi +} + +############################################################################## +# Generate export statements +############################################################################## + +# Determine number of parameters +num_params=$((1 + RANDOM % MAX_PARAMS)) + +# Select random parameters +selected_indices=() +while [ ${#selected_indices[@]} -lt $num_params ]; do + idx=$((RANDOM % TOTAL_CONFIGS)) + if [[ ! " ${selected_indices[@]} " =~ " ${idx} " ]]; then + selected_indices+=($idx) + fi +done + +# Output comment header (to stderr so it doesn't affect eval) +echo "# Exporting $num_params random DD configuration parameters..." >&2 + +# Generate export statements +for idx in "${selected_indices[@]}"; do + param="${CONFIGS[$idx]}" + value=$(generate_value "$param") + # Output the export statement + echo "export ${param}='${value}'" + # Log to stderr + echo "# ${param}=${value}" >&2 +done + +echo "# Export complete!" >&2 + diff --git a/report-config-types.sh b/report-config-types.sh new file mode 100755 index 00000000000..47acc646087 --- /dev/null +++ b/report-config-types.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +############################################################################## +# DD-Trace-Java Configuration Type Reporter +# +# Analyzes all configuration parameters and reports their detected types +# based on naming patterns used by the fuzzer. +############################################################################## + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CONFIG_FILE="${SCRIPT_DIR}/metadata/supported-configurations.json" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} DD-Trace-Java Configuration Type Report${NC}" +echo -e "${BLUE}==================================================================${NC}" +echo "" + +# Check if config file exists +if [ ! 
-f "$CONFIG_FILE" ]; then + echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}" + exit 1 +fi + +# Extract all configuration keys +CONFIGS=($(jq -r '.supportedConfigurations | keys[]' "$CONFIG_FILE")) +TOTAL=${#CONFIGS[@]} + +echo -e "${GREEN}Analyzing $TOTAL configuration parameters...${NC}" +echo "" + +# Initialize counters +boolean_count=0 +integer_count=0 +float_count=0 +string_count=0 + +boolean_params=() +integer_params=() +float_params=() +string_params=() + +############################################################################## +# Classify each parameter +############################################################################## + +for param in "${CONFIGS[@]}"; do + # Determine type based on parameter name patterns (same logic as fuzzer) + if [[ "$param" =~ ENABLED$|^DD_TRACE_ENABLED$|DEBUG$|COLLECT|HEADER_COLLECTION|REPORTING|SPLIT_BY ]]; then + ((boolean_count++)) + boolean_params+=("$param") + elif [[ "$param" =~ PORT$|TIMEOUT$|DELAY$|SIZE$|LIMIT$|MAX_|DEPTH$|COUNT$|QUEUE_SIZE$|BUFFER ]]; then + ((integer_count++)) + integer_params+=("$param") + elif [[ "$param" =~ SAMPLE_RATE$|_RATE$ ]] && [[ ! "$param" =~ TRACE_RATE_LIMIT ]]; then + ((float_count++)) + float_params+=("$param") + elif [[ "$param" =~ INTERVAL$ ]]; then + if [[ "$param" =~ FLUSH_INTERVAL ]]; then + ((float_count++)) + float_params+=("$param") + else + ((integer_count++)) + integer_params+=("$param") + fi + else + ((string_count++)) + string_params+=("$param") + fi +done + +############################################################################## +# Display Summary +############################################################################## + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} Type Distribution${NC}" +echo -e "${BLUE}==================================================================${NC}" + +echo -e "${GREEN}Boolean parameters:${NC} $boolean_count ($(( boolean_count * 100 / TOTAL ))%)" +echo -e "${GREEN}Integer parameters:${NC} $integer_count ($(( integer_count * 100 / TOTAL ))%)" +echo -e "${GREEN}Float parameters:${NC} $float_count ($(( float_count * 100 / TOTAL ))%)" +echo -e "${GREEN}String parameters:${NC} $string_count ($(( string_count * 100 / TOTAL ))%)" +echo "" + +############################################################################## +# Display samples +############################################################################## + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} Sample Parameters by Type${NC}" +echo -e "${BLUE}==================================================================${NC}" + +echo -e "${YELLOW}Boolean Parameters (sample of ${boolean_count}):${NC}" +printf '%s\n' "${boolean_params[@]}" | head -10 +if [ ${#boolean_params[@]} -gt 10 ]; then + echo " ... and $((boolean_count - 10)) more" +fi +echo "" + +echo -e "${YELLOW}Integer Parameters (sample of ${integer_count}):${NC}" +printf '%s\n' "${integer_params[@]}" | head -10 +if [ ${#integer_params[@]} -gt 10 ]; then + echo " ... and $((integer_count - 10)) more" +fi +echo "" + +echo -e "${YELLOW}Float Parameters (sample of ${float_count}):${NC}" +printf '%s\n' "${float_params[@]}" | head -10 +if [ ${#float_params[@]} -gt 10 ]; then + echo " ... 
and $((float_count - 10)) more" +fi +echo "" + +echo -e "${YELLOW}String Parameters (sample of ${string_count}):${NC}" +printf '%s\n' "${string_params[@]}" | head -20 +if [ ${#string_params[@]} -gt 20 ]; then + echo " ... and $((string_count - 20)) more" +fi +echo "" + +############################################################################## +# Export options +############################################################################## + +if [ "$1" = "--export" ]; then + OUTPUT_FILE="${SCRIPT_DIR}/config-types-report.json" + + echo -e "${BLUE}Exporting to JSON: $OUTPUT_FILE${NC}" + + jq -n \ + --arg total "$TOTAL" \ + --arg boolean_count "$boolean_count" \ + --arg integer_count "$integer_count" \ + --arg float_count "$float_count" \ + --arg string_count "$string_count" \ + --argjson boolean "$(printf '%s\n' "${boolean_params[@]}" | jq -R . | jq -s .)" \ + --argjson integer "$(printf '%s\n' "${integer_params[@]}" | jq -R . | jq -s .)" \ + --argjson float "$(printf '%s\n' "${float_params[@]}" | jq -R . | jq -s .)" \ + --argjson string "$(printf '%s\n' "${string_params[@]}" | jq -R . | jq -s .)" \ + '{ + total: $total, + summary: { + boolean: $boolean_count, + integer: $integer_count, + float: $float_count, + string: $string_count + }, + parameters: { + boolean: $boolean, + integer: $integer, + float: $float, + string: $string + } + }' > "$OUTPUT_FILE" + + echo -e "${GREEN}Report exported successfully!${NC}" +fi + +echo -e "${BLUE}==================================================================${NC}" +echo "" +echo "To export this report as JSON, run:" +echo " $0 --export" +echo "" + From 136a7663f33562c4fb9589f20dbbd0cc65763be4 Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Wed, 17 Dec 2025 16:13:14 +0100 Subject: [PATCH 11/15] Add documentation for trace loss tracking implementation --- TRACE_LOSS_TRACKING.md | 195 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 TRACE_LOSS_TRACKING.md diff --git a/TRACE_LOSS_TRACKING.md b/TRACE_LOSS_TRACKING.md new file mode 100644 index 00000000000..4d4abf38649 --- /dev/null +++ b/TRACE_LOSS_TRACKING.md @@ -0,0 +1,195 @@ +# Trace Loss Tracking with Antithesis Assertions + +## Overview + +This document describes the simplified Antithesis assertion strategy implemented to track trace loss in dd-trace-java. + +## Implementation + +Assertions were added at 3 strategic points in the trace pipeline to provide complete visibility into where and why traces are lost: + +### 1. CoreTracer.write() - Sampling Decision Point + +**Location:** `dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java` + +**Purpose:** Track traces at the sampling decision point + +**Assertions:** +- `trace_accepted_by_sampling` - Traces that passed sampling and will be sent +- `trace_dropped_by_sampling` - Traces dropped due to sampling decision + +**Data Captured:** +- `decision`: "accepted" or "dropped_sampling" +- `trace_id`: Unique trace identifier +- `span_count`: Number of spans in the trace +- `sampling_priority`: Sampling priority value + +### 2. 
RemoteWriter.write() - Buffer Acceptance Point + +**Location:** `dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java` + +**Purpose:** Track traces at buffer acceptance and detect drops due to overflow or policy + +**Assertions:** +- `trace_enqueued_for_send` - Traces successfully enqueued for serialization +- `trace_dropped_buffer_overflow` - Traces dropped due to full buffer +- `trace_dropped_by_policy` - Traces dropped by policy rules +- `trace_dropped_writer_closed` - Traces dropped during shutdown + +**Data Captured:** +- `decision`: "enqueued", "dropped_buffer_overflow", "dropped_policy", or "dropped_shutdown" +- `trace_id`: Unique trace identifier (when available) +- `span_count`: Number of spans in the trace +- `sampling_priority`: Sampling priority value (when available) + +### 3. PayloadDispatcherImpl.accept() - HTTP Send Point + +**Location:** `dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java` + +**Purpose:** Track actual HTTP sends to the agent and detect failures + +**Assertions:** +- `trace_payloads_being_sent` - All send attempts (before HTTP call) +- `traces_sent_successfully` - Traces successfully sent to agent +- `traces_failed_to_send` - Traces that failed to send via HTTP + +**Data Captured:** +- `decision`: "sent_success" or "dropped_send_failed" +- `trace_count`: Number of traces in the payload +- `payload_size_bytes`: Size of the payload in bytes +- `http_status`: HTTP response status code +- `dropped_traces_in_payload`: Count of traces already dropped before this send +- `dropped_spans_in_payload`: Count of spans already dropped before this send +- `has_exception`: Whether an exception occurred (for failures) + +## Complete Trace Flow + +``` +Application → CoreTracer.write() + ↓ + [ASSERTION POINT 1: Sampling] + ↓ ↓ + published=true published=false + ↓ ↓ + ✅ trace_accepted_by_sampling ❌ trace_dropped_by_sampling + ↓ + RemoteWriter.write() + ↓ + [ASSERTION POINT 2: Buffer Acceptance] + ↓ + traceProcessingWorker.publish() + ↓ + ✅ trace_enqueued_for_send + OR + ❌ trace_dropped_buffer_overflow + ❌ trace_dropped_by_policy + ❌ trace_dropped_writer_closed + ↓ + TraceProcessingWorker (batching) + ↓ + PayloadDispatcherImpl.accept() + ↓ + [ASSERTION POINT 3: HTTP Send] + ↓ + 🔵 trace_payloads_being_sent + ↓ + api.sendSerializedTraces() + ↓ ↓ + response.success() !response.success() + ↓ ↓ + ✅ traces_sent_successfully ❌ traces_failed_to_send +``` + +## Metrics Available After Antithesis Testing + +After running Antithesis tests, you will be able to calculate: + +### Total Traces Processed +``` +Total = trace_accepted_by_sampling + trace_dropped_by_sampling +``` + +### Total Traces Lost +``` +Lost = trace_dropped_by_sampling + + trace_dropped_buffer_overflow + + trace_dropped_by_policy + + trace_dropped_writer_closed + + traces_failed_to_send +``` + +### Total Traces Successfully Sent +``` +Success = traces_sent_successfully +``` + +### Loss Rate +``` +Loss Rate = (Total Traces Lost / Total Traces Processed) * 100% +``` + +### Loss Breakdown by Cause +- **Sampling Loss:** `trace_dropped_by_sampling / Total Traces Processed` +- **Buffer Overflow Loss:** `trace_dropped_buffer_overflow / Total Traces Processed` +- **Policy Loss:** `trace_dropped_by_policy / Total Traces Processed` +- **Shutdown Loss:** `trace_dropped_writer_closed / Total Traces Processed` +- **Send Failure Loss:** `traces_failed_to_send / Total Traces Processed` + +## Assertion Properties + +All assertions use `Assert.sometimes()` which means: +- They track 
that the condition occurred at least once during testing +- They provide detailed context about each occurrence +- They don't fail the test (they're for tracking, not validation) + +## Benefits of This Approach + +1. **Clear Tracking:** Each assertion has a unique, descriptive name +2. **Complete Coverage:** Tracks the entire pipeline from sampling to agent +3. **Detailed Context:** Captures relevant metadata at each point +4. **Easy Analysis:** Simple math to calculate loss rates and breakdown +5. **Actionable Data:** Identifies exactly where and why traces are lost + +## Example Analysis + +After an Antithesis test run, you might see: + +``` +trace_accepted_by_sampling: 10,000 occurrences +trace_dropped_by_sampling: 90,000 occurrences +trace_enqueued_for_send: 10,000 occurrences +trace_dropped_buffer_overflow: 50 occurrences +traces_sent_successfully: 9,950 occurrences +traces_failed_to_send: 0 occurrences +``` + +**Analysis:** +- Total traces: 100,000 +- Sampling rate: 10% (10,000 accepted / 100,000 total) +- Buffer overflow: 0.05% (50 / 100,000) +- Send success rate: 99.5% (9,950 / 10,000 accepted) +- Overall success rate: 9.95% (9,950 / 100,000 total) + +**Conclusion:** +- Sampling is working as expected (90% drop rate) +- Very low buffer overflow (0.05%) +- Excellent send success rate (99.5%) +- No HTTP failures + +## Dependencies + +- **Antithesis SDK:** `com.antithesis:sdk:1.4.5` (already configured in `dd-trace-core/build.gradle`) +- The SDK is bundled in the tracer JAR and has minimal performance impact in production + +## Running Antithesis Tests + +Contact the Antithesis team or refer to their documentation for running tests with these assertions enabled. + +## Future Enhancements + +Potential improvements: +1. Add `Assert.always()` for critical paths that should never fail +2. Add `Assert.unreachable()` for error paths that should never occur +3. Track additional metadata (e.g., service names, operation names) +4. Add time-based metrics (latency, throughput) + From db2d08ccda480986f84976488ad6ccf47d447dd3 Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Wed, 17 Dec 2025 16:33:31 +0100 Subject: [PATCH 12/15] Remove all old Antithesis assertions from profiling, debugger, remote-config, and telemetry Removed assertions from: - telemetry/DependencyResolver.java - profiling-uploader/ProfileUploader.java - profiling/ProfilingAgent.java - profiling-ddprof/DatadogProfiler.java - profiling-controller-openjdk/OpenJdkController.java - profiling-controller/ProfilingSystem.java - agent-debugger/ConfigurationUpdater.java - remote-config-core/DefaultConfigurationPoller.java Also removed obsolete documentation and scripts. 
Now only the new trace loss tracking assertions remain in: - CoreTracer.write() - RemoteWriter.write() - PayloadDispatcherImpl.accept() --- ANTITHESIS_ASSERTIONS.md | 666 ------------------ EXPORT_VARIABLES_GUIDE.md | 282 -------- FUZZER_SUMMARY.txt | 346 --------- FUZZ_QUICKSTART.md | 236 ------- FUZZ_README.md | 237 ------- TRACE_LOSS_TRACKING.md | 195 ----- analyze-fuzz-logs.sh | 179 ----- .../debugger/agent/ConfigurationUpdater.java | 9 - .../controller/openjdk/OpenJdkController.java | 3 - .../profiling/controller/ProfilingSystem.java | 11 - .../profiling/ddprof/DatadogProfiler.java | 13 - .../profiling/uploader/ProfileUploader.java | 5 - .../profiling/agent/ProfilingAgent.java | 7 - example-export-only.sh | 65 -- example-fuzz.sh | 54 -- example-use-export-vars.sh | 72 -- fuzz-ci.sh | 133 ---- fuzz-configs.sh | 374 ---------- fuzz-export-vars.sh | 161 ----- .../DefaultConfigurationPoller.java | 9 - report-config-types.sh | 174 ----- .../dependency/DependencyResolver.java | 15 - 22 files changed, 3246 deletions(-) delete mode 100644 ANTITHESIS_ASSERTIONS.md delete mode 100644 EXPORT_VARIABLES_GUIDE.md delete mode 100644 FUZZER_SUMMARY.txt delete mode 100644 FUZZ_QUICKSTART.md delete mode 100644 FUZZ_README.md delete mode 100644 TRACE_LOSS_TRACKING.md delete mode 100755 analyze-fuzz-logs.sh delete mode 100755 example-export-only.sh delete mode 100755 example-fuzz.sh delete mode 100755 example-use-export-vars.sh delete mode 100755 fuzz-ci.sh delete mode 100755 fuzz-configs.sh delete mode 100755 fuzz-export-vars.sh delete mode 100755 report-config-types.sh diff --git a/ANTITHESIS_ASSERTIONS.md b/ANTITHESIS_ASSERTIONS.md deleted file mode 100644 index f10304b9010..00000000000 --- a/ANTITHESIS_ASSERTIONS.md +++ /dev/null @@ -1,666 +0,0 @@ -# Antithesis Assertions in dd-trace-java - -This document describes the Antithesis assertions added to track trace loss, API sending failures, and telemetry data loss. - -## Overview - -Antithesis assertions have been added to multiple classes in the trace writing pipeline and telemetry system to detect when traces/telemetry are lost or fail to send to the API. These assertions help ensure the reliability of trace collection, telemetry reporting, and transmission at every stage of the process. - -## Added Assertions - -### Overview by Location - -**Telemetry System:** -- **TelemetryClient** - Monitors telemetry HTTP requests, failures, and network issues -- **TelemetryRouter** - Tracks routing failures and endpoint failover - -**Trace System:** -- **DDAgentApi** - Monitors agent communication, HTTP responses, and network failures -- **PayloadDispatcherImpl** - Tracks trace sending to the API and pre-send drops -- **RemoteWriter** - Tracks buffer overflow and shutdown scenarios - ---- - -## TelemetryClient Assertions (Telemetry Sending Layer) - -### T1. Telemetry Activity Tracking (`reachable` assertion) - -**Location:** `TelemetryClient.sendHttpRequest()` method (line 102) - -**Property:** `"Telemetry sending is exercised"` - -**Type:** `Assert.reachable()` - -**Purpose:** Verifies that telemetry sending code is being exercised during testing. - ---- - -### T2. Telemetry Success Validation (`always` assertion) 🔴 **CRITICAL** - -**Location:** `TelemetryClient.sendHttpRequest()` method, success path (line 153) - -**Property:** `"Telemetry requests should always succeed - no telemetry data should be lost"` - -**Type:** `Assert.always()` - -**Purpose:** Asserts that ALL telemetry requests should succeed. 
When this fails, it indicates that **telemetry data is being dropped** instead of being retried or buffered. - -**Details Captured:** -- `request_type`: Type of telemetry request (app-started, app-closing, etc.) -- `http_status`: HTTP response code -- `http_message`: HTTP status message -- `url`: Telemetry endpoint URL -- `success`: Whether request succeeded - -**The Problem This Detects:** -Your warning message: `"Got FAILURE sending telemetry request"` - indicates telemetry data is being **dropped** without retry mechanism. - ---- - -### T3. Telemetry HTTP Failure Detection (`unreachable` assertion) 🔴 - -**Location:** `TelemetryClient.sendHttpRequest()` method, non-success response (line 140) - -**Property:** `"Telemetry HTTP request failed - telemetry data should not be dropped, should retry"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Marks the HTTP failure path as unreachable, indicating telemetry data loss. **This is the exact issue you're experiencing** - failures cause data to be dropped instead of retried. - -**Details Captured:** -- `request_type`: Type of telemetry request -- `http_status`: Error status code -- `http_message`: Error message -- `url`: Endpoint URL -- `reason`: "http_error_response" - ---- - -### T4. Telemetry Network Exception Prevention (`unreachable` assertion) 🔴 - -**Location:** `TelemetryClient.sendHttpRequest()` method, IOException catch (line 171) - -**Property:** `"Telemetry network/IO failure - telemetry data should not be dropped, should retry"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Marks network failures as unreachable. When triggered, indicates telemetry data is being lost due to connectivity issues **without retry**. - -**Details Captured:** -- `request_type`: Type of telemetry request -- `exception_type`: Exception class name -- `exception_message`: Exception details -- `url`: Endpoint URL -- `reason`: "network_io_exception" - ---- - -### T5. Telemetry 404 Tracking (`sometimes` assertion) - -**Location:** `TelemetryClient.sendHttpRequest()` method, 404 response (line 122) - -**Property:** `"Telemetry endpoint returns 404 - endpoint may be disabled"` - -**Type:** `Assert.sometimes()` - -**Purpose:** Tracks when telemetry endpoint is disabled (404). This may be acceptable in some configurations. - -**Details Captured:** -- `request_type`: Type of telemetry request -- `url`: Endpoint URL -- `reason`: "endpoint_disabled_404" - ---- - -## TelemetryRouter Assertions (Telemetry Routing Layer) - -### T6. Telemetry Routing Success (`always` assertion) 🔴 **CRITICAL** - -**Location:** `TelemetryRouter.sendRequest()` method (line 56) - -**Property:** `"Telemetry routing should always succeed - failures indicate data loss without retry mechanism"` - -**Type:** `Assert.always()` - -**Purpose:** Validates that telemetry routing succeeds. This is the **top-level** assertion that catches all telemetry failures and proves that **current failures result in data loss**. - -**Details Captured:** -- `result`: SUCCESS, FAILURE, NOT_FOUND, or INTERRUPTED -- `current_client`: "agent" or "intake" -- `request_failed`: Boolean -- `has_fallback`: Whether fallback client exists -- `url`: Current endpoint URL - ---- - -### T7. 
Agent Telemetry Failover Tracking (`unreachable` assertion) 🔴 - -**Location:** `TelemetryRouter.sendRequest()` method, agent failure (line 70) - -**Property:** `"Agent telemetry endpoint failed - switching to intake but current request data is lost"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Tracks when agent telemetry fails and router switches to intake. **Critical:** The current request data is LOST during this failover - only future requests go to intake. - -**Details Captured:** -- `result`: Failure result type -- `url`: Agent endpoint URL -- `has_intake_fallback`: Whether intake fallback is available -- `reason`: "agent_telemetry_failure" - ---- - -### T8. Intake Telemetry Failover Tracking (`unreachable` assertion) 🔴 - -**Location:** `TelemetryRouter.sendRequest()` method, intake failure (line 90) - -**Property:** `"Intake telemetry endpoint failed - switching to agent but current request data is lost"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Tracks when intake telemetry fails and router switches back to agent. **Critical:** The current request data is LOST during this failover. - -**Details Captured:** -- `result`: Failure result type -- `url`: Intake endpoint URL -- `will_fallback_to_agent`: Boolean -- `reason`: "intake_telemetry_failure" - ---- - -## DDAgentApi Assertions (Agent Communication Layer) - -### 1. Agent API Activity Tracking (`reachable` + `sometimes` assertions) - -**Location:** `DDAgentApi.sendSerializedTraces()` method start (line 97-100) - -**Properties:** -- `"DDAgentApi trace sending is exercised"` (reachable) -- `"Traces are being sent through DDAgentApi"` (sometimes) - -**Type:** `Assert.reachable()` + `Assert.sometimes()` - -**Purpose:** Verifies that the DDAgentApi code path is being exercised and traces are flowing through the agent API layer. - ---- - -### 2. Agent Detection Validation (`unreachable` assertion) 🔴 - -**Location:** `DDAgentApi.sendSerializedTraces()` method, agent detection failure (line 107) - -**Property:** `"Datadog agent should always be detected - agent communication failure"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Asserts that the Datadog agent should always be discoverable. If the agent cannot be detected, traces will be lost with a 404 error. - -**Details Captured:** -- `trace_count`: Number of traces that cannot be sent -- `payload_size_bytes`: Size of the payload -- `agent_url`: The agent URL being contacted -- `failure_reason`: "agent_not_detected" - -**When This Occurs:** -- Agent is not running -- Agent is unreachable (network/firewall issues) -- Incorrect agent URL configuration -- Agent discovery mechanism failure - ---- - -### 3. HTTP Response Success Validation (`always` assertion) 🔴 **CRITICAL** - -**Location:** `DDAgentApi.sendSerializedTraces()` method, after HTTP call (line 149) - -**Property:** `"HTTP response from Datadog agent should always be 200 - API communication failure"` - -**Type:** `Assert.always()` - -**Purpose:** Validates that every HTTP response from the agent is successful (200 OK). This is the primary assertion for detecting API-level failures. - -**Details Captured:** -- `trace_count`: Number of traces being sent -- `payload_size_bytes`: Size of the payload -- `http_status`: HTTP status code received -- `http_message`: HTTP status message -- `success`: Boolean indicating if status is 200 -- `agent_url`: Full URL of the traces endpoint - -**When This Fails:** -- Agent returns error status codes (400, 413, 500, 503, etc.) 
-- Authentication/authorization failures -- Agent overload or resource exhaustion -- Malformed requests - ---- - -### 4. HTTP Error Path Unreachability (`unreachable` assertion) 🔴 - -**Location:** `DDAgentApi.sendSerializedTraces()` method, non-200 response branch (line 163) - -**Property:** `"Non-200 HTTP response from agent indicates API failure - traces may be lost"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Marks the non-200 response code path as unreachable. When reached, indicates traces are being rejected by the agent. - -**Details Captured:** -- `trace_count`: Number of traces rejected -- `payload_size_bytes`: Size of rejected payload -- `http_status`: Error status code -- `http_message`: Error message from agent -- `failure_reason`: "http_error_response" - -**Common Status Codes:** -- 400: Bad Request (malformed payload) -- 413: Payload Too Large -- 429: Too Many Requests (rate limiting) -- 500: Internal Server Error -- 503: Service Unavailable (agent overloaded) - ---- - -### 5. Network Exception Prevention (`unreachable` assertion) 🔴 - -**Location:** `DDAgentApi.sendSerializedTraces()` method, IOException catch block (line 199) - -**Property:** `"Network/IO exceptions should not occur when sending to agent - indicates connectivity issues"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Asserts that network/IO exceptions should never occur when communicating with the agent. These indicate infrastructure or connectivity problems. - -**Details Captured:** -- `trace_count`: Number of traces that failed to send -- `payload_size_bytes`: Size of the payload -- `exception_type`: Full class name of the exception -- `exception_message`: Exception message -- `agent_url`: Agent URL being contacted -- `failure_reason`: "network_io_exception" - -**When This Occurs:** -- Network connectivity issues -- Connection timeouts -- DNS resolution failures -- Socket errors -- SSL/TLS handshake failures - ---- - -## PayloadDispatcherImpl Assertions (Trace Serialization Layer) - -### 6. Payload Dispatcher Activity Tracking (`reachable` + `sometimes` assertions) - -**Location:** `PayloadDispatcherImpl.accept()` method (line 110-113) - -**Properties:** -- `"Trace sending code path is exercised"` (reachable) -- `"Traces are being sent to the API"` (sometimes) - -**Type:** `Assert.reachable()` + `Assert.sometimes()` - -**Purpose:** Verifies that the PayloadDispatcher code path is being exercised and traces are flowing through. - ---- - -### 7. Trace Sending Success (`always` assertion) - -**Location:** `PayloadDispatcherImpl.accept()` method (line 136) - -**Property:** `"Trace sending to API should always succeed - no traces should be lost"` - -**Type:** `Assert.always()` - -**Purpose:** Asserts that every trace sending attempt should succeed. If this assertion fails, it indicates that traces are being lost due to API failures. - -**Details Captured:** -- `trace_count`: Number of traces in the payload -- `payload_size_bytes`: Size of the payload in bytes -- `success`: Whether the send was successful -- `exception`: Exception class name (if present) -- `exception_message`: Exception message (if present) -- `http_status`: HTTP response status code (if present) - -### 8. 
Send Failure Path (`unreachable` assertion) - -**Location:** `PayloadDispatcherImpl.accept()` method, failure branch (line 159) - -**Property:** `"Trace sending failure path should never be reached - indicates traces are being lost"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Marks the failure path as something that should never occur. When this path is reached, it indicates traces are being lost due to send failures. - -**Details Captured:** -- `trace_count`: Number of traces that failed to send -- `payload_size_bytes`: Size of failed payload -- `exception`: Exception class name (if present) -- `exception_message`: Exception message (if present) -- `http_status`: HTTP response status code (if present) - -### 9. Trace Drop Prevention (`unreachable` assertion) - -**Location:** `PayloadDispatcherImpl.onDroppedTrace()` method (line 69) - -**Property:** `"Traces should not be dropped before attempting to send - indicates buffer overflow or backpressure"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Asserts that traces should never be dropped before even attempting to send them. Drops indicate buffer overflow, backpressure, or resource exhaustion. - -**Details Captured:** -- `span_count`: Number of spans in the dropped trace -- `total_dropped_traces`: Cumulative count of dropped traces -- `total_dropped_spans`: Cumulative count of dropped spans - ---- - -## RemoteWriter Assertions (Buffer and Lifecycle Layer) - -### 10. Writer State Validation (`always` assertion) 🔴 **CRITICAL** - -**Location:** `RemoteWriter.write()` method, start of method (line 79) - -**Property:** `"Writer should never be closed when attempting to write traces"` - -**Type:** `Assert.always()` - -**Purpose:** Proactively validates that the writer is in a valid state (not closed) whenever traces are being written. This assertion catches improper usage where traces are written after shutdown or during shutdown race conditions. This is a **preventive assertion** that checks every write attempt. - -**Details Captured:** -- `writer_closed`: Boolean indicating if writer is closed -- `trace_size`: Number of traces being written -- `has_traces`: Whether the trace list is non-empty - -**When This Fails:** -- Application attempts to write traces after calling `close()` -- Race condition between shutdown and trace generation -- Improper lifecycle management -- Indicates a bug in the calling code or shutdown sequencing - -**Importance:** This is a critical assertion because writing to a closed writer indicates a fundamental problem with lifecycle management that could lead to: -- Lost traces during shutdown -- Inconsistent application state -- Potential resource leaks - ---- - -### 11. Buffer Overflow Detection (`unreachable` assertion) 🔴 **CRITICAL** - -**Location:** `RemoteWriter.write()` method, DROPPED_BUFFER_OVERFLOW case (line 117) - -**Property:** `"Buffer overflow should never occur - traces are being dropped due to backpressure"` - -**Type:** `Assert.unreachable()` - -**Purpose:** Asserts that buffer overflow should NEVER happen. This indicates that traces are being generated faster than they can be processed and serialized, resulting in dropped traces. This is a critical issue that indicates system overload or insufficient buffer capacity. 
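As a concrete sketch of this pattern (hedged: the `BufferOverflowAssertion` helper and its parameters are illustrative rather than actual tracer code, and it assumes the SDK's `Assert.unreachable(String, ObjectNode)` overload with a Jackson `ObjectNode` details payload):

```java
import com.antithesis.sdk.Assert;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;

final class BufferOverflowAssertion {
  /** Invoked from the DROPPED_BUFFER_OVERFLOW branch before the trace is discarded. */
  static void onBufferOverflow(
      int traceSize, int spanCount, int samplingPriority, int bufferCapacity) {
    // Field names mirror the "Details Captured" list below.
    ObjectNode details = JsonNodeFactory.instance.objectNode();
    details.put("trace_size", traceSize);
    details.put("span_count", spanCount);
    details.put("sampling_priority", samplingPriority);
    details.put("buffer_capacity", bufferCapacity);
    details.put("reason", "buffer_overflow_backpressure");
    // If this call ever executes, the property is flagged as failing in the
    // triage report; the program itself keeps running.
    Assert.unreachable(
        "Buffer overflow should never occur - traces are being dropped due to backpressure",
        details);
  }
}
```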
-
-**Details Captured:**
-- `trace_size`: Number of traces being dropped
-- `span_count`: Total number of spans in the dropped traces
-- `sampling_priority`: Sampling priority of the trace
-- `buffer_capacity`: Current buffer capacity
-- `reason`: "buffer_overflow_backpressure"
-
-**When This Occurs:**
-- Internal processing queue is full (primary, secondary, or span sampling queue)
-- Traces are being generated faster than serialization can occur
-- System is under heavy load or experiencing backpressure
-- Buffer size may be insufficient for the workload
-
----
-
-### 12. Shutdown Trace Drop Tracking (`sometimes` assertion)
-
-**Location:** `RemoteWriter.write()` method, closed writer case (line 94)
-
-**Property:** `"Traces are dropped due to writer shutdown - tracking shutdown behavior"`
-
-**Type:** `Assert.sometimes()`
-
-**Purpose:** Tracks when traces are dropped because the writer has been shut down. This helps understand shutdown behavior and whether traces are being lost during application shutdown sequences.
-
-**Details Captured:**
-- `trace_size`: Number of traces being dropped
-- `span_count`: Total number of spans in the dropped traces
-- `reason`: "writer_closed_during_shutdown"
-
-**When This Occurs:**
-- Application is shutting down
-- Writer.close() has been called
-- Traces are still being generated after shutdown has been initiated
-- Can indicate timing issues in shutdown sequences
-
-## How Antithesis Uses These Assertions
-
-When running under Antithesis testing:
-
-1. **Property Aggregation:** All assertions with the same `message` are aggregated into a single test property in the triage report.
-
-2. **Failure Detection:**
-   - `always()` assertions that evaluate to `false` will flag the property as failing
-   - `unreachable()` assertions that are reached will flag the property as failing
-   - `sometimes()` assertions that never evaluate to `true` will flag the property as failing
-
-3. **Exploration Guidance:** Antithesis uses these assertions as hints to explore states that might trigger failures, making bug detection more efficient.
-
-4. **Non-Terminating:** Unlike traditional assertions, Antithesis assertions do not terminate the program when they fail. Execution continues, which lets Antithesis keep exploring and observe whether the failure escalates into a more severe bug.
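-
-To make these semantics concrete, here is a minimal sketch of the three assertion flavors at a single call site (hypothetical class, method, and condition names; the `Assert` signatures match the calls used in this patch):
-
-```java
-import com.antithesis.sdk.Assert;
-
-final class AssertionSemanticsSketch {
-  void onSendResult(boolean success, boolean retried) {
-    // Every call sharing this exact message contributes to one aggregated
-    // property in the triage report; any false evaluation fails it.
-    Assert.always(success, "Trace sending to API should always succeed", null);
-
-    if (!success) {
-      // Reaching this line even once fails the property.
-      Assert.unreachable("Trace sending failure path should never be reached", null);
-    }
-
-    // A sometimes() property fails only if the condition is never observed
-    // true across the whole test run - useful for proving a path is exercised.
-    // (This message is a hypothetical property, not one of the assertions above.)
-    Assert.sometimes(retried, "Failed sends are retried at least once", null);
-  }
-}
-```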
- -## Expected Behavior - -### In a Healthy System - -**Telemetry System:** -- ✅ `"Telemetry sending is exercised"` - Should pass (reached at least once) -- ✅ `"Telemetry requests should always succeed"` - Should pass (all succeed) 🔴 **CRITICAL** -- ✅ `"Telemetry HTTP request failed - should retry"` - Should pass (never reached) 🔴 -- ✅ `"Telemetry network/IO failure - should retry"` - Should pass (never reached) 🔴 -- ✅ `"Telemetry routing should always succeed"` - Should pass (all succeed) 🔴 **CRITICAL** -- ✅ `"Agent telemetry endpoint failed"` - Should pass (never reached) 🔴 -- ✅ `"Intake telemetry endpoint failed"` - Should pass (never reached) 🔴 -- ℹ️ `"Telemetry endpoint returns 404"` - May occur if endpoint disabled - -**Trace DDAgentApi Layer:** -- ✅ `"DDAgentApi trace sending is exercised"` - Should pass (reached at least once) -- ✅ `"Traces are being sent through DDAgentApi"` - Should pass (reached at least once) -- ✅ `"Datadog agent should always be detected"` - Should pass (agent always detectable) 🔴 -- ✅ `"HTTP response from Datadog agent should always be 200"` - Should pass (all responses 200) 🔴 **CRITICAL** -- ✅ `"Non-200 HTTP response from agent indicates API failure"` - Should pass (never reached) 🔴 -- ✅ `"Network/IO exceptions should not occur"` - Should pass (never reached) 🔴 - -**PayloadDispatcherImpl Layer:** -- ✅ `"Trace sending code path is exercised"` - Should pass (reached at least once) -- ✅ `"Traces are being sent to the API"` - Should pass (reached at least once) -- ✅ `"Trace sending to API should always succeed"` - Should pass (all sends succeed) -- ✅ `"Trace sending failure path should never be reached"` - Should pass (never reached) -- ✅ `"Traces should not be dropped before attempting to send"` - Should pass (never reached) - -**RemoteWriter Layer:** -- ✅ `"Writer should never be closed when attempting to write traces"` - Should pass (writer always open) 🔴 **CRITICAL** -- ✅ `"Buffer overflow should never occur"` - Should pass (never reached) 🔴 **CRITICAL** -- ℹ️ `"Traces are dropped due to writer shutdown"` - May or may not occur depending on shutdown timing - -### When Telemetry Is Lost ⚠️ **YOUR ISSUE** - -If telemetry is being lost (your current issue with `"Got FAILURE sending telemetry request"`), you'll see these failures: - -**Telemetry HTTP/Network Failures:** -- ❌ `"Telemetry requests should always succeed"` - Will fail on any telemetry failure 🔴 **CRITICAL** - - This is the top-level assertion proving telemetry data loss - - Shows request type, HTTP status, and endpoint -- ❌ `"Telemetry HTTP request failed - should retry"` - Will fail when HTTP errors occur 🔴 - - Indicates telemetry dropped due to HTTP errors (5xx, 4xx) - - Shows status code and error message -- ❌ `"Telemetry network/IO failure - should retry"` - Will fail on connectivity issues 🔴 - - Indicates telemetry dropped due to network problems - - Shows exception type and message - -**Telemetry Routing Failures:** -- ❌ `"Telemetry routing should always succeed"` - Will fail when routing fails 🔴 **CRITICAL** - - Proves current implementation drops data instead of retrying - - Shows current client (agent/intake) and failure details -- ❌ `"Agent telemetry endpoint failed - current request data is lost"` - Will fail when agent endpoint fails 🔴 - - Router switches to intake but **current request is dropped** - - Shows whether fallback is available -- ❌ `"Intake telemetry endpoint failed - current request data is lost"` - Will fail when intake endpoint fails 🔴 - - Router switches to agent but 
**current request is dropped** - - Future requests use new endpoint, but current data is lost - -**Key Finding:** The assertions prove that when telemetry fails, the **current request is DROPPED** - the router only changes the endpoint for **future** requests. This is why you see `"Got FAILURE"` warnings - there's no retry or buffering mechanism. - ---- - -### When Traces Are Lost - -If traces are being lost, you'll see failures in the triage report: - -**Agent Communication Failures (DDAgentApi):** -- ❌ `"Datadog agent should always be detected"` - Will fail if agent is unreachable 🔴 - - Indicates agent not running, network issues, or configuration problems - - Provides agent URL and detection details -- ❌ `"HTTP response from Datadog agent should always be 200"` - Will fail on any error status 🔴 **CRITICAL** - - Shows HTTP status code, message, and agent URL - - Indicates agent overload, rate limiting, or request errors -- ❌ `"Non-200 HTTP response from agent indicates API failure"` - Will fail when agent rejects traces 🔴 - - Provides HTTP status codes (400, 413, 429, 500, 503, etc.) -- ❌ `"Network/IO exceptions should not occur"` - Will fail on network errors 🔴 - - Shows exception type and message - - Indicates connectivity, timeout, or DNS issues - -**API Send Failures (PayloadDispatcherImpl):** -- ❌ `"Trace sending to API should always succeed"` - Will fail with details about failed sends -- ❌ `"Trace sending failure path should never be reached"` - Will fail, showing this path was reached - -**Buffer/Queue Issues:** -- ❌ `"Buffer overflow should never occur"` - Will fail if backpressure causes drops 🔴 **CRITICAL** - - Indicates system overload or insufficient buffer capacity - - Provides buffer capacity and trace details -- ❌ `"Traces should not be dropped before attempting to send"` - May fail if drops occur in PayloadDispatcher - -**Lifecycle/Shutdown Issues:** -- ❌ `"Writer should never be closed when attempting to write traces"` - Will fail if traces written to closed writer 🔴 **CRITICAL** - - Indicates race condition in shutdown sequence - - Shows improper lifecycle management - - Provides details about writer state and trace being written -- ⚠️ `"Traces are dropped due to writer shutdown"` - Will show in report if shutdown timing causes trace loss - - Helps identify if shutdown sequence needs improvement - - May be acceptable depending on shutdown strategy - - Works in conjunction with the writer state validation above - -The `details` captured in failed assertions will provide diagnostic information including trace counts, payload sizes, exceptions, HTTP status codes, buffer capacity, and sampling priority. - -## Dependencies - -- **Antithesis SDK:** `com.antithesis:sdk:1.4.5` (bundled in tracer JAR) - [Available on Maven Central](https://repo1.maven.org/maven2/com/antithesis/sdk/) -- **Jackson:** Already available transitively in the project - -### Bundled SDK - -The Antithesis SDK is configured as an `implementation` dependency, which means: - -- ✅ **Bundled in final JAR** - SDK classes included in the dd-trace-java agent -- ✅ **Always available** - No ClassNotFoundException at runtime -- ✅ **Works everywhere** - Assertions compiled and available in all environments - -### Using Antithesis Assertions - -The Antithesis SDK (version 1.4.5) is publicly available on Maven Central and is bundled with the tracer. 
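-
-As a quick reference, a minimal usage sketch of declaring a property with a details payload (illustrative class and field names; the Jackson `ObjectNode` pattern mirrors the assertions documented above):
-
-```java
-import com.antithesis.sdk.Assert;
-import com.fasterxml.jackson.databind.node.JsonNodeFactory;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-
-final class TelemetrySendSketch {
-  void afterSend(int httpStatus, String requestType) {
-    // The details node is attached to each evaluation of the property and
-    // surfaces in the triage report when the property fails.
-    ObjectNode details = JsonNodeFactory.instance.objectNode();
-    details.put("http_status", httpStatus);
-    details.put("request_type", requestType);
-
-    Assert.always(
-        httpStatus >= 200 && httpStatus < 300,
-        "Telemetry requests should always succeed",
-        details);
-  }
-}
-```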
- -**In normal runtime (production/development):** -- Assertions are present in the code but have **minimal performance impact** -- According to [Antithesis documentation](https://antithesis.com/docs/properties_assertions/assertions/), the SDK is designed to run safely in production -- Assertions become no-ops when not running in Antithesis environment - -**In Antithesis testing environment:** -- Antithesis runtime automatically detects and evaluates all assertions -- Generates triage reports showing which properties passed/failed -- Provides detailed bug reports with reproducible scenarios -- Contact Antithesis at [antithesis.com](https://antithesis.com) for access to their testing platform - -## Complete Pipeline Coverage Summary - -The assertions provide comprehensive coverage across telemetry and trace pipelines: - -### Telemetry Pipeline - -``` -Application Telemetry Events - ↓ -[TelemetryRouter] ← Assertions T6-T8 - • Routing success validation - • Agent failover tracking - • Intake failover tracking - • ⚠️ PROVES: Current request dropped on failover - ↓ -[TelemetryClient] ← Assertions T1-T5 - • Activity tracking - • HTTP success validation - • Failure detection (HTTP errors) - • Network exception handling - • 404 endpoint tracking - • ⚠️ PROVES: No retry on failure - ↓ -[Telemetry Endpoint] → Datadog Backend -``` - -### Trace Pipeline - -``` -Application Threads - ↓ -[CoreTracer] → Sampling decision - ↓ -[RemoteWriter] ← Assertions 10-12 - • Writer state validation - • Buffer overflow detection - • Shutdown tracking - ↓ -[TraceProcessingWorker] → Serialization queues - ↓ -[PayloadDispatcherImpl] ← Assertions 6-9 - • Activity tracking - • Trace sending validation - • Failure path detection - • Pre-send drop prevention - ↓ -[DDAgentApi] ← Assertions 1-5 - • Agent detection - • HTTP response validation - • Network exception handling - ↓ -[Datadog Agent] → Backend -``` - -### Assertion Count by Category - -| Category | Count | Criticality | Status | -|----------|-------|-------------|--------| -| **Telemetry Communication** | 5 | 🔴 **CRITICAL** | ⚠️ **DROPS DATA** | -| **Telemetry Routing** | 3 | 🔴 **CRITICAL** | ⚠️ **DROPS DATA** | -| **Agent Communication** | 5 | 🔴 **CRITICAL** | ✅ Has retries | -| **Trace Serialization** | 4 | ❌ High | ✅ Good | -| **Buffer Management** | 2 | 🔴 **CRITICAL** | ✅ Good | -| **Lifecycle Management** | 1 | 🔴 **CRITICAL** | ✅ Good | -| **Total** | **20** | - | - | - -### Key Properties Monitored - -**Telemetry System (YOUR ISSUE):** -1. ⚠️ **Telemetry Data Loss**: Telemetry dropped on HTTP/network failures -2. ⚠️ **No Retry Mechanism**: Failed requests are not retried or buffered -3. ⚠️ **Failover Data Loss**: Current request dropped during endpoint switching - -**Trace System:** -4. **Agent Availability**: Agent must be detectable and reachable -5. **HTTP Success**: All agent responses must be 200 OK -6. **Network Stability**: No IO/network exceptions should occur -7. **Buffer Capacity**: No overflow or backpressure drops -8. **Lifecycle Correctness**: No writes to closed writer -9. 
**End-to-End Success**: All traces must be successfully sent - -## References - -- [Antithesis Assertions Documentation](https://antithesis.com/docs/properties_assertions/assertions/) -- [Java SDK Reference](https://antithesis.com/docs/generated/sdk/java/com/antithesis/sdk/Assert.html) - diff --git a/EXPORT_VARIABLES_GUIDE.md b/EXPORT_VARIABLES_GUIDE.md deleted file mode 100644 index ace4b58d3d3..00000000000 --- a/EXPORT_VARIABLES_GUIDE.md +++ /dev/null @@ -1,282 +0,0 @@ -# Exporting DD Configuration Variables - -This document explains how to export random DD configuration variables without running a command, allowing you to use them in your own scripts. - -## Two Approaches Available - -### Approach 1: `fuzz-export-vars.sh` (Recommended) - -A standalone script that generates export statements you can eval in your shell. - -#### Usage - -```bash -# Export random variables -eval "$(./fuzz-export-vars.sh)" - -# Then run your application -java -javaagent:dd-java-agent.jar -jar myapp.jar -``` - -#### Advantages -- Simple and straightforward -- Works in any script -- No need to source anything -- Clean output - -#### Control Number of Parameters - -```bash -# Export 5 random parameters -FUZZ_MAX_PARAMS=5 eval "$(./fuzz-export-vars.sh)" -``` - -### Approach 2: `fuzz-configs.sh` in Export-Only Mode - -Use the main fuzzer script in export-only mode. - -#### Usage - -```bash -# Set export-only mode -export FUZZ_EXPORT_ONLY=true - -# Source the fuzzer (doesn't run commands) -source ./fuzz-configs.sh 1 "" -``` - -#### Advantages -- Uses the same script as full fuzzing -- Includes logging -- More detailed output - -## Examples - -### Example 1: Basic Export and Run - -```bash -#!/bin/bash - -# Export random configurations -eval "$(./fuzz-export-vars.sh)" - -# Run your application -java -javaagent:dd-java-agent.jar -jar myapp.jar -``` - -### Example 2: Multiple Test Runs with Different Configs - -```bash -#!/bin/bash - -for i in {1..10}; do - echo "Test run $i" - - # Clear previous DD variables - unset $(env | grep '^DD_' | cut -d'=' -f1) - - # Export new random configuration - eval "$(./fuzz-export-vars.sh)" - - # Run your application - java -jar myapp.jar - - sleep 2 -done -``` - -### Example 3: Export Specific Number of Parameters - -```bash -#!/bin/bash - -# Export only 3 random parameters -FUZZ_MAX_PARAMS=3 eval "$(./fuzz-export-vars.sh)" - -echo "Running with minimal configuration:" -env | grep '^DD_' - -java -jar myapp.jar -``` - -### Example 4: Capture Variables for Later Use - -```bash -#!/bin/bash - -# Generate and save export statements -./fuzz-export-vars.sh 2>/dev/null > /tmp/dd-config.sh - -# Review the configuration -cat /tmp/dd-config.sh - -# Apply it when ready -source /tmp/dd-config.sh - -# Run your application -java -jar myapp.jar -``` - -### Example 5: Use in CI/CD Pipeline - -```bash -#!/bin/bash -# .gitlab-ci.yml or similar - -test_with_random_config: - script: - - eval "$(./fuzz-export-vars.sh)" - - echo "Testing with configuration:" - - env | grep '^DD_' - - mvn clean test -``` - -## Clearing Variables - -To clear all DD_ environment variables: - -```bash -# Unset all DD_ variables -unset $(env | grep '^DD_' | cut -d'=' -f1) - -# Verify -env | grep '^DD_' # Should return nothing -``` - -## Comparing Approaches - -| Feature | fuzz-export-vars.sh | fuzz-configs.sh (export-only) | -|---------|---------------------|-------------------------------| -| Simplicity | ⭐⭐⭐ Very simple | ⭐⭐ Moderate | -| Logging | ⭐ To stderr only | ⭐⭐⭐ Full logging | -| File size | ⭐⭐⭐ Lightweight | ⭐ 
Larger |
-| Dependencies | Just jq | jq + full script |
-| Use case | Quick exports | Integrated testing |
-
-## Troubleshooting
-
-### Variables Not Exported
-
-```bash
-# Wrong - runs in subshell, variables don't persist
-$(./fuzz-export-vars.sh)
-
-# Correct - use eval
-eval "$(./fuzz-export-vars.sh)"
-```
-
-### Too Many/Few Variables
-
-```bash
-# Control the number
-FUZZ_MAX_PARAMS=5 eval "$(./fuzz-export-vars.sh)"
-```
-
-### Need to See What's Being Exported
-
-```bash
-# The script outputs info to stderr, so you'll see it
-eval "$(./fuzz-export-vars.sh)"
-
-# Or capture the export statements first, review them, then source them.
-# (Don't pipe into `source` - every pipeline stage runs in a subshell,
-# so the exports would not persist in your shell.)
-./fuzz-export-vars.sh 2>/tmp/config.log > /tmp/exports.sh
-source /tmp/exports.sh
-```
-
-## Integration Patterns
-
-### Pattern 1: Test Suite Integration
-
-```bash
-#!/bin/bash
-# run-test-suite.sh
-
-for test in tests/*.sh; do
-  echo "Running $test with random config..."
-
-  # Fresh configuration for each test
-  unset $(env | grep '^DD_' | cut -d'=' -f1)
-  eval "$(./fuzz-export-vars.sh)"
-
-  bash "$test"
-done
-```
-
-### Pattern 2: Docker Container Testing
-
-```bash
-#!/bin/bash
-
-# Generate configuration
-eval "$(./fuzz-export-vars.sh)"
-
-# Pass to Docker container
-docker run \
-  $(env | grep '^DD_' | sed 's/^/-e /') \
-  my-app:latest
-```
-
-### Pattern 3: Configuration Files
-
-```bash
-#!/bin/bash
-
-# Generate a Java system properties file.
-# DD_FOO_BAR environment variables map to dd.foo.bar system properties
-# (lowercased, underscores become dots).
-./fuzz-export-vars.sh 2>/dev/null | \
-  sed 's/^export //' | \
-  sed "s/'//g" | \
-  awk -F'=' '{ key = tolower($1); gsub(/_/, ".", key); print "-D" key "=" $2 }' \
-  > /tmp/java-opts.txt
-
-# Use with Java (argument files require Java 9+)
-java @/tmp/java-opts.txt -jar myapp.jar
-```
-
-## Best Practices
-
-1. **Clear Between Runs**: Always unset previous DD_ variables before exporting new ones
-2. **Log Configuration**: Save the exported configuration for reproducibility
-3. **Reasonable Limits**: Use FUZZ_MAX_PARAMS to avoid overwhelming configurations
-4. **Test Isolation**: Each test should use a fresh set of variables
-5. **Document**: Save configurations that expose bugs for later reproduction
-
-## Environment Variables
-
-### For `fuzz-export-vars.sh`
-
-- `FUZZ_MAX_PARAMS` - Maximum number of parameters to export (default: 10)
-
-### For `fuzz-configs.sh` (export-only mode)
-
-- `FUZZ_EXPORT_ONLY` - Set to "true" to enable export-only mode
-- Other variables from main fuzzer still apply
-
-## See Also
-
-- `FUZZ_README.md` - Full fuzzer documentation
-- `FUZZ_QUICKSTART.md` - Quick start guide
-- `example-use-export-vars.sh` - Working example script
-
-## Summary
-
-**Quick Export:**
-```bash
-eval "$(./fuzz-export-vars.sh)"
-java -jar myapp.jar
-```
-
-**With Options:**
-```bash
-FUZZ_MAX_PARAMS=5 eval "$(./fuzz-export-vars.sh)"
-```
-
-**In a Loop:**
-```bash
-for i in {1..10}; do
-  unset $(env | grep '^DD_' | cut -d'=' -f1)
-  eval "$(./fuzz-export-vars.sh)"
-  java -jar myapp.jar
-done
-```
-
-That's it! You're ready to use random DD configurations in your scripts.
- diff --git a/FUZZER_SUMMARY.txt b/FUZZER_SUMMARY.txt deleted file mode 100644 index 4404bc43a73..00000000000 --- a/FUZZER_SUMMARY.txt +++ /dev/null @@ -1,346 +0,0 @@ -================================================================================ -DD-TRACE-JAVA CONFIGURATION FUZZER -================================================================================ - -PROJECT: dd-trace-java Configuration Fuzzing Tool -CREATED: November 28, 2024 -PURPOSE: Test Java applications with randomized dd-trace-java configurations - -================================================================================ -FILES CREATED -================================================================================ - -1. CORE SCRIPTS - ------------ - ✓ fuzz-configs.sh (13KB) - Main fuzzing script that generates random configurations and runs your app - - Reads 1,384 parameters from metadata/supported-configurations.json - - Intelligent type detection (Boolean, Integer, Float, String) - - Generates sensible random values based on parameter patterns - - Logs all runs with full configuration and output - - Timeout protection (30s default) - - Up to 10 random parameters per run - - ✓ analyze-fuzz-logs.sh (6.3KB) - Log analyzer that examines fuzzing results - - Statistical analysis of runs - - Failure pattern detection - - Parameter frequency analysis - - Recommendations based on results - - ✓ report-config-types.sh (6KB) - Configuration type reporter - - Analyzes all 1,384 parameters - - Reports type distribution - - Shows samples of each type - - JSON export capability - - ✓ fuzz-ci.sh (4.3KB) - CI/CD integration script - - Configurable via environment variables - - Failure threshold checking - - Non-interactive mode support - - Exit codes for pipeline integration - - ✓ example-fuzz.sh (1.8KB) - Quick start demonstration script - - Prerequisite checking - - Simple test run - - Usage examples - -2. DOCUMENTATION - ------------- - ✓ FUZZ_README.md (6.6KB) - Comprehensive documentation covering: - - Features and overview - - Prerequisites and installation - - Detailed usage instructions - - Parameter type detection - - Configuration options - - Troubleshooting guide - - Advanced usage patterns - - ✓ FUZZ_QUICKSTART.md (6.1KB) - Quick start guide with: - - Fast setup instructions - - Usage examples - - Output explanations - - Common use cases - - Tips for effective fuzzing - - ✓ FUZZER_SUMMARY.txt (this file) - Complete summary of the fuzzer system - -3. 
CONFIGURATION
-   -------------
-   ✓ .gitignore (updated)
-     Added: fuzz-logs/ directory exclusion
-
-================================================================================
-CAPABILITIES
-================================================================================
-
-PARAMETER TYPES DETECTED:
--------------------------
-From 1,384 total configuration parameters:
-  - Boolean parameters: 779 (56%)
-  - Integer parameters: 95 (6%)
-  - Float parameters: 128 (9%)
-  - String parameters: 382 (27%)
-
-VALUE GENERATION:
-----------------
-Boolean: true, false, 1, 0
-Integer: Context-aware ranges:
-  - Ports: 1024-65535
-  - Timeouts: 100-30000ms
-  - Sizes/Limits: 10-10000
-  - Counts: 1-100
-  - Rates/Percents: 0-100
-Float: Context-aware:
-  - Sample rates: 0.0-1.0
-  - Intervals: 1.0-60.0
-String: Intelligent generation:
-  - DD_ENV: production, staging, development, test, qa
-  - DD_SERVICE: Realistic service names
-  - DD_VERSION: Semantic versions (v1.2.3)
-  - Hosts: localhost, IPs, hostnames
-  - URLs: HTTP/HTTPS endpoints
-  - Paths: Realistic file/directory paths
-  - Keys/Tokens: Random hex strings
-  - Tags: Comma-separated key:value pairs
-  - Propagation styles: datadog, b3, tracecontext
-  - Modes: full, service, disabled, safe
-
-FEATURES:
---------
-✓ Intelligent type detection from parameter names
-✓ Realistic value generation
-✓ Configurable iteration count
-✓ Random parameter selection (1-10 per run)
-✓ Comprehensive logging
-✓ Timeout protection (prevents hangs)
-✓ Color-coded terminal output
-✓ Statistical analysis
-✓ Failure pattern detection
-✓ CI/CD integration support
-✓ JSON export for reports
-✓ Non-interactive mode
-
-================================================================================
-USAGE EXAMPLES
-================================================================================
-
-BASIC USAGE:
------------
-# Run 10 fuzz iterations
-./fuzz-configs.sh 10 "java -jar myapp.jar"
-
-# With dd-java-agent
-./fuzz-configs.sh 20 "java -javaagent:dd-java-agent.jar -jar myapp.jar"
-
-# Analyze results
-./analyze-fuzz-logs.sh
-
-# View parameter types
-./report-config-types.sh
-
-QUICK START:
------------
-# Run example test
-./example-fuzz.sh
-
-CI/CD INTEGRATION:
------------------
-# In your CI pipeline (GitHub Actions, Jenkins, etc.)
-export FUZZ_ITERATIONS=50
-export FUZZ_JAVA_CMD="java -javaagent:dd-java-agent.jar -jar app.jar"
-export FUZZ_FAILURE_THRESHOLD=5  # Max 5% failures allowed
-./fuzz-ci.sh
-
-# Or with direct parameters
-FUZZ_ITERATIONS=100 ./fuzz-ci.sh
-
-ADVANCED:
---------
-# Export parameter type report
-./report-config-types.sh --export
-
-# Run multiple fuzzers in parallel
-./fuzz-configs.sh 50 "java -jar app.jar" &
-./fuzz-configs.sh 50 "java -jar app.jar" &
-wait
-
-# Custom timeout (edit fuzz-configs.sh, line ~245)
-# Change: timeout 30s bash -c "$JAVA_CMD"
-# To:     timeout 60s bash -c "$JAVA_CMD"
-
-================================================================================
-WORKFLOW
-================================================================================
-
-1. SETUP
-   -----
-   Install jq: brew install jq (macOS) or apt-get install jq (Linux)
-   Ensure all scripts are executable (chmod +x *.sh)
-
-2. TEST
-   ----
-   Run example: ./example-fuzz.sh
-   Verify logs are created in fuzz-logs/
-
-3. FUZZ
-   ----
-   Run with your app: ./fuzz-configs.sh <iterations> "<command>"
-   Start with 10-20 iterations, scale up as needed
-
-4. ANALYZE
-   -------
-   Review results: ./analyze-fuzz-logs.sh
-   Check individual logs in fuzz-logs/ for details
-
-5. 
ITERATE - ------- - Increase iterations for thorough testing - Monitor app metrics during fuzzing - Identify and fix configuration issues - -6. INTEGRATE - --------- - Add to CI/CD pipeline using fuzz-ci.sh - Set failure thresholds appropriate for your app - -================================================================================ -OUTPUT STRUCTURE -================================================================================ - -LOGS DIRECTORY: fuzz-logs/ --------------------------- -Each run creates a log file: fuzz_run__.log - -Log file contents: - - Iteration number and timestamp - - Configuration parameters used - - Environment export commands - - Command executed - - Application output/errors - - Exit code (if failed) - -Example log: - # Fuzz Iteration 5 - # Timestamp: 20241128_143052 - # Configuration: - DD_TRACE_ENABLED=true - DD_SERVICE=my-service - DD_ENV=production - ... - - # Command: java -jar app.jar - ========================================== - [Application output here] - -================================================================================ -CONFIGURATION OPTIONS -================================================================================ - -In fuzz-configs.sh: ------------------- -MAX_PARAMS_PER_RUN=10 # Max parameters per iteration -LOG_DIR="./fuzz-logs" # Log directory location -timeout 30s # Timeout per run (line ~245) - -In fuzz-ci.sh: -------------- -FUZZ_ITERATIONS # Number of iterations (default: 20) -FUZZ_JAVA_CMD # Command to run (default: java -jar app.jar) -FUZZ_FAILURE_THRESHOLD # Max % failures allowed (default: 10) - -================================================================================ -PREREQUISITES -================================================================================ - -Required: - - Bash 4.0+ - - jq (JSON processor) - - timeout command (usually pre-installed) - -Optional: - - Java application with dd-java-agent - - CI/CD environment for automated testing - -Installation: - macOS: brew install jq - Ubuntu/Debian: sudo apt-get install jq - CentOS/RHEL: sudo yum install jq - -================================================================================ -TIPS FOR EFFECTIVE FUZZING -================================================================================ - -1. Start small (5-10 iterations) to verify setup -2. Gradually increase to 50-100+ for thorough testing -3. Monitor application logs and metrics during fuzzing -4. Use analyze-fuzz-logs.sh to identify failure patterns -5. Run overnight with 1000+ iterations for stress testing -6. Integrate into CI/CD for continuous testing -7. Document any configuration issues discovered -8. Share findings with your team -9. Keep fuzzer updated with new parameters -10. Run regularly, especially after configuration changes - -================================================================================ -STATISTICS -================================================================================ - -Configuration Parameters: 1,384 -Scripts Created: 5 -Documentation Files: 3 -Total Lines of Code: ~950 -Total Documentation: ~500 lines -Parameter Coverage: 100% of metadata/supported-configurations.json - -Type Detection Patterns: - - Boolean: 15+ patterns (ENABLED, DEBUG, COLLECT, etc.) - - Integer: 10+ patterns (PORT, TIMEOUT, SIZE, LIMIT, etc.) - - Float: 5+ patterns (RATE, SAMPLE_RATE, INTERVAL, etc.) - - String: 20+ specific patterns (ENV, SERVICE, HOST, URL, etc.) 
- -================================================================================ -SUPPORT & TROUBLESHOOTING -================================================================================ - -Common Issues: -------------- -1. "jq: command not found" → Install jq -2. "Permission denied" → chmod +x *.sh -3. All runs timeout → Increase timeout or check app startup -4. High failure rate → Review app logs, reduce parameters per run - -Documentation: -------------- -- FUZZ_QUICKSTART.md: Quick reference guide -- FUZZ_README.md: Comprehensive documentation -- Individual log files: Detailed run information - -Resources: ---------- -- DD-Trace-Java docs: https://docs.datadoghq.com/tracing/trace_collection/library_config/java/ -- Configuration reference: metadata/supported-configurations.json -- Source code: All scripts are well-commented - -================================================================================ -NEXT STEPS -================================================================================ - -1. ✓ Run ./example-fuzz.sh to verify installation -2. ✓ Review FUZZ_QUICKSTART.md for quick start -3. ✓ Test with your actual Java application -4. ✓ Analyze results with ./analyze-fuzz-logs.sh -5. ✓ Integrate into CI/CD with fuzz-ci.sh -6. ✓ Scale up iterations for thorough testing -7. ✓ Document and share findings -8. ✓ Run regularly for continuous validation - -================================================================================ -END OF SUMMARY -================================================================================ diff --git a/FUZZ_QUICKSTART.md b/FUZZ_QUICKSTART.md deleted file mode 100644 index 697915bc3b5..00000000000 --- a/FUZZ_QUICKSTART.md +++ /dev/null @@ -1,236 +0,0 @@ -# DD-Trace-Java Configuration Fuzzer - Quick Start Guide - -## What Was Created - -This fuzzing toolset helps you test your dd-trace-java application with randomized configurations to identify potential issues. - -### Files Created - -1. **`fuzz-configs.sh`** - Main fuzzer script - - Generates random but sensible configuration values - - Runs your app with different parameter combinations - - Logs all runs with full details - -2. **`analyze-fuzz-logs.sh`** - Log analyzer - - Analyzes fuzzing results - - Identifies failure patterns - - Provides statistics and recommendations - -3. **`example-fuzz.sh`** - Quick start example - - Demonstrates basic usage - - Checks prerequisites - - Runs a simple test - -4. **`FUZZ_README.md`** - Comprehensive documentation - - Detailed usage instructions - - Parameter type detection - - Troubleshooting guide - -## Quick Start - -### 1. Prerequisites - -Install `jq` (JSON processor): - -```bash -# macOS -brew install jq - -# Ubuntu/Debian -sudo apt-get install jq -``` - -### 2. Run Your First Test - -```bash -# Simple test (5 iterations with echo command) -./example-fuzz.sh - -# Or directly with your Java app: -./fuzz-configs.sh 10 "java -javaagent:dd-java-agent.jar -jar myapp.jar" -``` - -### 3. 
Analyze Results - -```bash -./analyze-fuzz-logs.sh -``` - -## Usage Examples - -### Basic Testing -```bash -# Run 10 iterations -./fuzz-configs.sh 10 "java -jar app.jar" -``` - -### With DD Agent -```bash -# Test with the datadog agent jar -./fuzz-configs.sh 20 "java -javaagent:./dd-java-agent/build/libs/dd-java-agent.jar -jar myapp.jar" -``` - -### Spring Boot Application -```bash -# Test Spring Boot app -./fuzz-configs.sh 15 "java -javaagent:dd-java-agent.jar -jar target/spring-app-1.0.0.jar" -``` - -### Custom Script -```bash -# Test with your startup script -./fuzz-configs.sh 30 "./start-my-app.sh" -``` - -## What the Fuzzer Does - -For each iteration, it: - -1. **Selects** 1-10 random configuration parameters (from 1384+ available) -2. **Generates** appropriate values based on parameter type: - - Booleans: `true`, `false`, `1`, `0` - - Ports: `1024-65535` - - Timeouts: `100-30000ms` - - Sample rates: `0.0-1.0` - - Strings: Realistic values (URLs, paths, service names, etc.) -3. **Runs** your application with those settings -4. **Logs** everything (config + output) -5. **Reports** success/failure/timeout - -## Understanding Output - -### During Run -``` -Iteration 5 of 10 -================================================================== -Selected 7 random parameters: - DD_TRACE_ENABLED = true - DD_SERVICE = my-service - DD_ENV = production - DD_AGENT_PORT = 8126 - DD_TRACE_SAMPLE_RATE = 0.75 - DD_PROFILING_ENABLED = true - DD_LOGS_INJECTION = false - -Running application... -✓ Iteration 5 completed successfully -``` - -### Summary -``` -Fuzzing Complete - Summary -================================================================== -Total iterations: 10 -Successful runs: 9 -Failed runs: 1 -Timeout runs: 0 -Logs directory: ./fuzz-logs -``` - -## Analyzing Logs - -### Check Individual Runs -```bash -# View a specific log -cat fuzz-logs/fuzz_run_5_20241128_143052.log - -# Find failed runs -grep -l "exit code" fuzz-logs/*.log -``` - -### Use the Analyzer -```bash -./analyze-fuzz-logs.sh -``` - -This shows: -- Success/failure statistics -- Most frequently used parameters -- Recent runs summary -- Recommendations - -## Configuration Types Detected - -The fuzzer intelligently detects parameter types: - -| Parameter Pattern | Generated Values | Examples | -|------------------|------------------|----------| -| `*_ENABLED`, `*_DEBUG` | Boolean | `true`, `false`, `1`, `0` | -| `*_PORT` | Port number | `1024-65535` | -| `*_TIMEOUT`, `*_DELAY` | Milliseconds | `100-30000` | -| `*_SAMPLE_RATE` | Float | `0.0-1.0` | -| `DD_ENV` | Environment | `production`, `staging`, `development` | -| `DD_SERVICE` | Service name | `my-service`, `web-app`, `api-gateway` | -| `*_HOST*` | Hostname | `localhost`, `127.0.0.1`, IPs | -| `*_URL`, `*_ENDPOINT` | URL | `http://localhost:8080`, etc. | -| `*_PATH`, `*_FILE` | Path | `/tmp/test`, `/var/log/app` | -| `*_KEY`, `*_TOKEN` | Hex string | Random hex | -| `*_TAGS` | Tag list | `key1:value1,key2:value2` | - -## Tips for Effective Fuzzing - -1. **Start Small**: Begin with 5-10 iterations to verify setup -2. **Increase Gradually**: Scale up to 50-100 iterations for thorough testing -3. **Monitor**: Watch app logs and metrics during fuzzing -4. **Analyze Failures**: Use `analyze-fuzz-logs.sh` to identify patterns -5. **CI/CD Integration**: Run fuzzing in your pipeline -6. 
**Long-Running**: Consider overnight fuzz runs with 1000+ iterations - -## Common Issues - -### "jq: command not found" -Install jq using your package manager (see Prerequisites) - -### All runs timeout -- Increase timeout in `fuzz-configs.sh` (search for `timeout 30s`) -- Check if your app is starting correctly -- Verify your command is correct - -### Permission denied -```bash -chmod +x fuzz-configs.sh analyze-fuzz-logs.sh example-fuzz.sh -``` - -### Want to test specific parameters -Edit `fuzz-configs.sh` and modify the parameter selection logic or create a focused test script - -## Next Steps - -1. ✅ Run `./example-fuzz.sh` to verify everything works -2. ✅ Test with your actual Java application -3. ✅ Analyze logs with `./analyze-fuzz-logs.sh` -4. ✅ Adjust parameters/iterations based on findings -5. ✅ Integrate into CI/CD pipeline -6. ✅ Document any configuration issues you discover - -## Advanced Usage - -### Parallel Testing -```bash -# Run multiple fuzzer instances -./fuzz-configs.sh 50 "java -jar app.jar" & -./fuzz-configs.sh 50 "java -jar app.jar" & -wait -``` - -### Custom Parameter Ranges -Edit `generate_integer()` or `generate_string()` functions in `fuzz-configs.sh` - -### Integration Test Mode -```bash -# Run with health check -./fuzz-configs.sh 20 "java -jar app.jar && curl http://localhost:8080/health" -``` - -## Support - -- See `FUZZ_README.md` for comprehensive documentation -- Check logs in `fuzz-logs/` for debugging -- Review dd-trace-java documentation at https://docs.datadoghq.com/tracing/trace_collection/library_config/java/ - ---- - -**Total Configurations Available**: 1384+ parameters from `metadata/supported-configurations.json` - -**Fuzzer Version**: 1.0.0 - diff --git a/FUZZ_README.md b/FUZZ_README.md deleted file mode 100644 index 2eab3379eb8..00000000000 --- a/FUZZ_README.md +++ /dev/null @@ -1,237 +0,0 @@ -# DD-Trace-Java Configuration Fuzzer - -A bash script that performs fuzz testing on dd-trace-java by generating random but sensible configuration combinations. - -## Overview - -This fuzzer automatically: -- Reads all available configuration parameters from `metadata/supported-configurations.json` -- Generates intelligent random values based on parameter name patterns -- Runs your Java application with different configuration combinations -- Logs all runs with their configurations and outcomes -- Provides detailed statistics at the end - -## Features - -- **Intelligent Value Generation**: The fuzzer analyzes parameter names to generate appropriate values: - - Boolean parameters (`ENABLED`, `DEBUG`, etc.) → `true`, `false`, `1`, `0` - - Port numbers → `1024-65535` - - Timeouts/delays → `100-30000ms` - - Sample rates → `0.0-1.0` - - URLs, paths, service names, tags, etc. 
with realistic values
-
-- **Configurable Parameters Per Run**: Maximum 10 parameters per run (configurable)
-- **Comprehensive Logging**: Each run is logged with full configuration and output
-- **Timeout Protection**: 30-second timeout per run to prevent hangs
-- **Statistics**: Summary of successful/failed/timeout runs
-
-## Prerequisites
-
-- Bash 4.0+
-- `jq` (JSON processor)
-- `timeout` command (usually pre-installed on Linux/macOS)
-
-Install jq if needed:
-```bash
-# macOS
-brew install jq
-
-# Ubuntu/Debian
-sudo apt-get install jq
-
-# CentOS/RHEL
-sudo yum install jq
-```
-
-## Usage
-
-### Basic Usage
-
-```bash
-./fuzz-configs.sh <iterations> "<command>"
-```
-
-### Examples
-
-#### Example 1: Test with a simple Java application
-```bash
-./fuzz-configs.sh 10 "java -javaagent:dd-java-agent/build/libs/dd-java-agent.jar -jar myapp.jar"
-```
-
-#### Example 2: Test with a Spring Boot application
-```bash
-./fuzz-configs.sh 20 "java -javaagent:./dd-java-agent.jar -jar target/spring-boot-app.jar"
-```
-
-#### Example 3: Test with a script that starts your app
-```bash
-./fuzz-configs.sh 50 "./start-app.sh"
-```
-
-#### Example 4: Just print configurations (testing mode)
-```bash
-./fuzz-configs.sh 5 "echo 'Testing configuration'"
-```
-
-#### Example 5: Run with custom JVM options
-```bash
-./fuzz-configs.sh 15 "java -Xmx2g -javaagent:dd-java-agent.jar -jar app.jar"
-```
-
-## Output
-
-The fuzzer creates a `fuzz-logs` directory containing:
-- Individual log files for each iteration
-- Configuration used for each run
-- Application output/errors
-- Exit codes
-
-### Sample Log File Content
-
-```
-# Fuzz Iteration 1
-# Timestamp: 20241128_143052
-# Configuration:
-DD_TRACE_ENABLED=true
-DD_SERVICE=my-service
-DD_ENV=production
-DD_AGENT_PORT=8126
-DD_TRACE_SAMPLE_RATE=0.75
-
-# Environment Exports:
-export DD_TRACE_ENABLED='true'
-export DD_SERVICE='my-service'
-export DD_ENV='production'
-export DD_AGENT_PORT='8126'
-export DD_TRACE_SAMPLE_RATE='0.75'
-
-# Command: java -jar myapp.jar
-==========================================
-
-[Application output here...]
-``` - -## Configuration - -You can modify these variables in the script: - -```bash -MAX_PARAMS_PER_RUN=10 # Maximum parameters per iteration -LOG_DIR="./fuzz-logs" # Log directory -``` - -## Parameter Type Detection - -The fuzzer intelligently detects parameter types based on naming patterns: - -| Pattern | Type | Example Values | -|---------|------|----------------| -| `*ENABLED`, `*DEBUG` | Boolean | `true`, `false`, `1`, `0` | -| `*PORT` | Integer | `1024-65535` | -| `*TIMEOUT`, `*DELAY` | Integer (ms) | `100-30000` | -| `*SIZE`, `*LIMIT`, `*MAX*` | Integer | `10`, `100`, `1000`, `5000` | -| `*SAMPLE_RATE`, `*_RATE` | Float | `0.0-1.0` | -| `DD_ENV` | String | `production`, `staging`, `development` | -| `DD_SERVICE` | String | Service names | -| `*HOST*` | String | Hostnames/IPs | -| `*URL`, `*ENDPOINT` | String | URLs | -| `*PATH`, `*FILE` | String | File paths | -| `*KEY`, `*TOKEN` | String | Random hex strings | -| `*TAGS` | String | Comma-separated tags | -| `*PROPAGATION_STYLE` | String | `datadog`, `b3`, `tracecontext` | - -## Statistics Summary - -After all iterations, you'll see a summary like: - -``` -================================================================== - Fuzzing Complete - Summary -================================================================== -Total iterations: 50 -Successful runs: 45 -Failed runs: 3 -Timeout runs: 2 -Logs directory: ./fuzz-logs -``` - -## Exit Codes - -- `0`: All runs completed without failures -- `1`: One or more runs failed (check logs) - -## Tips - -1. **Start Small**: Begin with 5-10 iterations to ensure everything works -2. **Review Logs**: Check `fuzz-logs/` for any issues or unexpected behavior -3. **Adjust Timeout**: Modify the `timeout 30s` in the script if your app needs more time to start -4. **Continuous Testing**: Run this regularly in CI/CD to catch configuration issues early -5. **Combine with Monitoring**: Watch application metrics during fuzzing to catch subtle issues - -## Advanced Usage - -### Custom Value Ranges - -Edit the `generate_integer()` or `generate_string()` functions to customize value ranges for specific parameters. - -### Integration with CI/CD - -```bash -#!/bin/bash -# In your CI pipeline -if ! ./fuzz-configs.sh 100 "java -jar app.jar"; then - echo "Fuzz testing failed!" - exit 1 -fi -``` - -### Parallel Execution - -Run multiple fuzzer instances in parallel: - -```bash -./fuzz-configs.sh 50 "java -jar app.jar" & -./fuzz-configs.sh 50 "java -jar app.jar" & -wait -``` - -## Troubleshooting - -### Issue: "jq: command not found" -**Solution**: Install jq using your package manager (see Prerequisites) - -### Issue: Script hangs -**Solution**: The 30-second timeout should prevent this. If it persists, check your application's shutdown behavior. - -### Issue: All runs timeout -**Solution**: Increase the timeout value in the `run_fuzz_iteration()` function or check if your application is starting correctly. - -### Issue: Permission denied -**Solution**: Make sure the script is executable: `chmod +x fuzz-configs.sh` - -## Known Limitations - -- Some parameter combinations might not be compatible (e.g., conflicting settings) -- Generated values are random but may not cover all edge cases -- File paths and URLs may not point to actual resources -- Some configurations require specific formats not captured by simple pattern matching - -## Contributing - -To add support for new parameter types: - -1. Edit the `generate_value()` function -2. Add pattern matching for your parameter type -3. 
Implement value generation logic in the appropriate `generate_*()` function - -## License - -This script is part of the dd-trace-java project. Use according to the project's license. - -## Support - -For issues or questions: -- Check the logs in `fuzz-logs/` -- Review the dd-trace-java documentation -- Open an issue in the dd-trace-java repository - diff --git a/TRACE_LOSS_TRACKING.md b/TRACE_LOSS_TRACKING.md deleted file mode 100644 index 4d4abf38649..00000000000 --- a/TRACE_LOSS_TRACKING.md +++ /dev/null @@ -1,195 +0,0 @@ -# Trace Loss Tracking with Antithesis Assertions - -## Overview - -This document describes the simplified Antithesis assertion strategy implemented to track trace loss in dd-trace-java. - -## Implementation - -Assertions were added at 3 strategic points in the trace pipeline to provide complete visibility into where and why traces are lost: - -### 1. CoreTracer.write() - Sampling Decision Point - -**Location:** `dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java` - -**Purpose:** Track traces at the sampling decision point - -**Assertions:** -- `trace_accepted_by_sampling` - Traces that passed sampling and will be sent -- `trace_dropped_by_sampling` - Traces dropped due to sampling decision - -**Data Captured:** -- `decision`: "accepted" or "dropped_sampling" -- `trace_id`: Unique trace identifier -- `span_count`: Number of spans in the trace -- `sampling_priority`: Sampling priority value - -### 2. RemoteWriter.write() - Buffer Acceptance Point - -**Location:** `dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java` - -**Purpose:** Track traces at buffer acceptance and detect drops due to overflow or policy - -**Assertions:** -- `trace_enqueued_for_send` - Traces successfully enqueued for serialization -- `trace_dropped_buffer_overflow` - Traces dropped due to full buffer -- `trace_dropped_by_policy` - Traces dropped by policy rules -- `trace_dropped_writer_closed` - Traces dropped during shutdown - -**Data Captured:** -- `decision`: "enqueued", "dropped_buffer_overflow", "dropped_policy", or "dropped_shutdown" -- `trace_id`: Unique trace identifier (when available) -- `span_count`: Number of spans in the trace -- `sampling_priority`: Sampling priority value (when available) - -### 3. 
PayloadDispatcherImpl.accept() - HTTP Send Point - -**Location:** `dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java` - -**Purpose:** Track actual HTTP sends to the agent and detect failures - -**Assertions:** -- `trace_payloads_being_sent` - All send attempts (before HTTP call) -- `traces_sent_successfully` - Traces successfully sent to agent -- `traces_failed_to_send` - Traces that failed to send via HTTP - -**Data Captured:** -- `decision`: "sent_success" or "dropped_send_failed" -- `trace_count`: Number of traces in the payload -- `payload_size_bytes`: Size of the payload in bytes -- `http_status`: HTTP response status code -- `dropped_traces_in_payload`: Count of traces already dropped before this send -- `dropped_spans_in_payload`: Count of spans already dropped before this send -- `has_exception`: Whether an exception occurred (for failures) - -## Complete Trace Flow - -``` -Application → CoreTracer.write() - ↓ - [ASSERTION POINT 1: Sampling] - ↓ ↓ - published=true published=false - ↓ ↓ - ✅ trace_accepted_by_sampling ❌ trace_dropped_by_sampling - ↓ - RemoteWriter.write() - ↓ - [ASSERTION POINT 2: Buffer Acceptance] - ↓ - traceProcessingWorker.publish() - ↓ - ✅ trace_enqueued_for_send - OR - ❌ trace_dropped_buffer_overflow - ❌ trace_dropped_by_policy - ❌ trace_dropped_writer_closed - ↓ - TraceProcessingWorker (batching) - ↓ - PayloadDispatcherImpl.accept() - ↓ - [ASSERTION POINT 3: HTTP Send] - ↓ - 🔵 trace_payloads_being_sent - ↓ - api.sendSerializedTraces() - ↓ ↓ - response.success() !response.success() - ↓ ↓ - ✅ traces_sent_successfully ❌ traces_failed_to_send -``` - -## Metrics Available After Antithesis Testing - -After running Antithesis tests, you will be able to calculate: - -### Total Traces Processed -``` -Total = trace_accepted_by_sampling + trace_dropped_by_sampling -``` - -### Total Traces Lost -``` -Lost = trace_dropped_by_sampling - + trace_dropped_buffer_overflow - + trace_dropped_by_policy - + trace_dropped_writer_closed - + traces_failed_to_send -``` - -### Total Traces Successfully Sent -``` -Success = traces_sent_successfully -``` - -### Loss Rate -``` -Loss Rate = (Total Traces Lost / Total Traces Processed) * 100% -``` - -### Loss Breakdown by Cause -- **Sampling Loss:** `trace_dropped_by_sampling / Total Traces Processed` -- **Buffer Overflow Loss:** `trace_dropped_buffer_overflow / Total Traces Processed` -- **Policy Loss:** `trace_dropped_by_policy / Total Traces Processed` -- **Shutdown Loss:** `trace_dropped_writer_closed / Total Traces Processed` -- **Send Failure Loss:** `traces_failed_to_send / Total Traces Processed` - -## Assertion Properties - -All assertions use `Assert.sometimes()` which means: -- They track that the condition occurred at least once during testing -- They provide detailed context about each occurrence -- They don't fail the test (they're for tracking, not validation) - -## Benefits of This Approach - -1. **Clear Tracking:** Each assertion has a unique, descriptive name -2. **Complete Coverage:** Tracks the entire pipeline from sampling to agent -3. **Detailed Context:** Captures relevant metadata at each point -4. **Easy Analysis:** Simple math to calculate loss rates and breakdown -5. 
**Actionable Data:** Identifies exactly where and why traces are lost - -## Example Analysis - -After an Antithesis test run, you might see: - -``` -trace_accepted_by_sampling: 10,000 occurrences -trace_dropped_by_sampling: 90,000 occurrences -trace_enqueued_for_send: 10,000 occurrences -trace_dropped_buffer_overflow: 50 occurrences -traces_sent_successfully: 9,950 occurrences -traces_failed_to_send: 0 occurrences -``` - -**Analysis:** -- Total traces: 100,000 -- Sampling rate: 10% (10,000 accepted / 100,000 total) -- Buffer overflow: 0.05% (50 / 100,000) -- Send success rate: 99.5% (9,950 / 10,000 accepted) -- Overall success rate: 9.95% (9,950 / 100,000 total) - -**Conclusion:** -- Sampling is working as expected (90% drop rate) -- Very low buffer overflow (0.05%) -- Excellent send success rate (99.5%) -- No HTTP failures - -## Dependencies - -- **Antithesis SDK:** `com.antithesis:sdk:1.4.5` (already configured in `dd-trace-core/build.gradle`) -- The SDK is bundled in the tracer JAR and has minimal performance impact in production - -## Running Antithesis Tests - -Contact the Antithesis team or refer to their documentation for running tests with these assertions enabled. - -## Future Enhancements - -Potential improvements: -1. Add `Assert.always()` for critical paths that should never fail -2. Add `Assert.unreachable()` for error paths that should never occur -3. Track additional metadata (e.g., service names, operation names) -4. Add time-based metrics (latency, throughput) - diff --git a/analyze-fuzz-logs.sh b/analyze-fuzz-logs.sh deleted file mode 100755 index 2ac2e720cf5..00000000000 --- a/analyze-fuzz-logs.sh +++ /dev/null @@ -1,179 +0,0 @@ -#!/bin/bash - -############################################################################## -# DD-Trace-Java Fuzzer Log Analyzer -# -# Analyzes fuzz test logs to identify patterns in failures and provide -# insights into which configurations might be causing issues. -############################################################################## - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -LOG_DIR="${SCRIPT_DIR}/fuzz-logs" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} DD-Trace-Java Fuzzer Log Analyzer${NC}" -echo -e "${BLUE}==================================================================${NC}" -echo "" - -# Check if log directory exists -if [ ! 
-d "$LOG_DIR" ]; then - echo -e "${RED}Error: Log directory not found: $LOG_DIR${NC}" - echo "Run the fuzzer first: ./fuzz-configs.sh " - exit 1 -fi - -# Count log files -LOG_COUNT=$(find "$LOG_DIR" -name "fuzz_run_*.log" | wc -l) - -if [ "$LOG_COUNT" -eq 0 ]; then - echo -e "${RED}No log files found in $LOG_DIR${NC}" - exit 1 -fi - -echo -e "${GREEN}Found $LOG_COUNT fuzz run logs${NC}" -echo "" - -############################################################################## -# Analyze runs -############################################################################## - -echo -e "${BLUE}Analyzing runs...${NC}" -echo "" - -successful_runs=0 -failed_runs=0 -total_params_used=() -all_params=() - -for log_file in "$LOG_DIR"/fuzz_run_*.log; do - # Check for success indicators in the log - if grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then - ((successful_runs++)) - else - ((failed_runs++)) - echo -e "${RED}Failed run: $(basename "$log_file")${NC}" - - # Extract and display the configuration that failed - echo -e "${YELLOW}Configuration:${NC}" - grep "^DD_" "$log_file" | grep -v "^#" | head -20 - echo "" - fi - - # Extract parameter names - params=$(grep "^DD_" "$log_file" | grep -v "^#" | cut -d'=' -f1) - for param in $params; do - all_params+=("$param") - done - - param_count=$(echo "$params" | wc -l) - total_params_used+=($param_count) -done - -############################################################################## -# Statistics -############################################################################## - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} Statistics${NC}" -echo -e "${BLUE}==================================================================${NC}" - -echo -e "Total runs: ${BLUE}$LOG_COUNT${NC}" -echo -e "Successful runs: ${GREEN}$successful_runs${NC}" -echo -e "Failed runs: ${RED}$failed_runs${NC}" - -if [ $LOG_COUNT -gt 0 ]; then - success_rate=$((successful_runs * 100 / LOG_COUNT)) - echo -e "Success rate: ${GREEN}${success_rate}%${NC}" -fi - -echo "" - -############################################################################## -# Parameter frequency analysis -############################################################################## - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} Most Frequently Used Parameters${NC}" -echo -e "${BLUE}==================================================================${NC}" - -if [ ${#all_params[@]} -gt 0 ]; then - # Count parameter occurrences - printf '%s\n' "${all_params[@]}" | sort | uniq -c | sort -rn | head -20 | while read count param; do - echo -e " ${GREEN}$count${NC} times: $param" - done -else - echo "No parameters found in logs" -fi - -echo "" - -############################################################################## -# Recommendations -############################################################################## - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} Recommendations${NC}" -echo -e "${BLUE}==================================================================${NC}" - -if [ $failed_runs -eq 0 ]; then - echo -e "${GREEN}✓ All runs completed successfully!${NC}" - echo "" - echo "Consider:" - echo " - Increasing the number of iterations" - echo " - Testing with more parameters per run" - echo " - Running with your actual application under load" -elif [ $failed_runs -lt $((LOG_COUNT / 10)) ]; then - 
echo -e "${YELLOW}⚠ Less than 10% of runs failed${NC}" - echo "" - echo "Actions:" - echo " 1. Review the failed run logs above" - echo " 2. Check for common parameters across failures" - echo " 3. Consider if failures are due to incompatible parameter combinations" -else - echo -e "${RED}⚠ More than 10% of runs failed${NC}" - echo "" - echo "Urgent actions:" - echo " 1. Review application logs for errors" - echo " 2. Check if specific parameters are causing issues" - echo " 3. Verify the application can start with basic configurations" - echo " 4. Consider running with fewer parameters per iteration" -fi - -echo "" - -############################################################################## -# Recent runs -############################################################################## - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} Recent Runs (last 5)${NC}" -echo -e "${BLUE}==================================================================${NC}" - -find "$LOG_DIR" -name "fuzz_run_*.log" -type f -print0 | \ - xargs -0 ls -t | head -5 | while read log_file; do - echo -e "${YELLOW}$(basename "$log_file")${NC}" - echo " Timestamp: $(grep "^# Timestamp:" "$log_file" | cut -d' ' -f3)" - echo " Parameters: $(grep -c "^DD_" "$log_file" | grep -v "^#")" - - # Check status - if grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then - echo -e " Status: ${GREEN}Success${NC}" - else - echo -e " Status: ${RED}Failed/Timeout${NC}" - fi - echo "" -done - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE}For detailed analysis, review individual logs in: $LOG_DIR${NC}" -echo -e "${BLUE}==================================================================${NC}" - diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java index ab37c5c187c..755c15ea667 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java @@ -34,7 +34,6 @@ import java.util.stream.Collectors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.antithesis.sdk.Assert; /** * Handles configuration updates if required by installing a new ClassFileTransformer and triggering @@ -96,8 +95,6 @@ public void accept(Source source, Collection definiti applyNewConfiguration(newConfiguration); } catch (RuntimeException e) { ExceptionHelper.logException(LOGGER, e, "Error during accepting new debugger configuration:"); - LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.accept should sometimes throw a runtime exception (sometimes)"); - Assert.sometimes(true, "ConfigurationUpdater.accept should sometimes throw a runtime exception", null); throw e; } } @@ -146,15 +143,9 @@ private void applyNewConfiguration(Configuration newConfiguration) { currentConfiguration = newConfiguration; if (changes.hasProbeRelatedChanges()) { LOGGER.debug("Applying new probe configuration, changes: {}", changes); - LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.handleProbesChanges should sometimes be called (sometimes)"); - Assert.sometimes(true, "ConfigurationUpdater.handleProbesChanges should sometimes be called", null); handleProbesChanges(changes, newConfiguration); } - LOGGER.debug("ANTITHESIS_ASSERT: 
ConfigurationUpdater.applyNewConfiguration should always be successful (always)"); - Assert.always(true, "ConfigurationUpdater.applyNewConfiguration should always be successful", null); } finally { - LOGGER.debug("ANTITHESIS_ASSERT: ConfigurationUpdater.applyNewConfiguration should always be reachable (reachable)"); - Assert.reachable("ConfigurationUpdater.applyNewConfiguration should always be reachable", null); configurationLock.unlock(); } } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java index 2fe9d6d0567..b8e775d4fb1 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java @@ -50,7 +50,6 @@ import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.antithesis.sdk.Assert; /** * This is the implementation of the controller for OpenJDK. It should work for JDK 11+ today, and @@ -290,8 +289,6 @@ private static String getJfrRepositoryBase(ConfigProvider configProvider) { Files.createDirectories(repositoryPath); } catch (IOException e) { log.error("Failed to create JFR repository directory: {}", repositoryPath, e); - log.debug("ANTITHESIS_ASSERT: Failed to create JFR repository directory (unreachable)"); - Assert.unreachable("Failed to create JFR repository directory", null); throw new IllegalStateException( "Failed to create JFR repository directory: " + repositoryPath, e); } diff --git a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java index f3af9fede8d..3d306d26c40 100644 --- a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java +++ b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java @@ -35,7 +35,6 @@ import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.antithesis.sdk.Assert; /** Sets up the profiling strategy and schedules the profiling recordings. */ public final class ProfilingSystem { @@ -197,12 +196,8 @@ private void startProfilingRecording() { if (t != null) { if (t instanceof IllegalStateException && "Shutdown in progress".equals(t.getMessage())) { ProfilerFlareLogger.getInstance().log("Shutdown in progress, cannot start profiling"); - log.debug("ANTITHESIS_ASSERT: Shutdown in progress, cannot start profiling (sometimes)"); - Assert.sometimes(true, "Shutdown in progress, cannot start profiling", null); } else { ProfilerFlareLogger.getInstance().log("Failed to start profiling", t); - log.debug("ANTITHESIS_ASSERT: Failed to start profiling (unreachable)", t); - Assert.unreachable("Failed to start profiling", null); throw t instanceof RuntimeException ? (RuntimeException) t : new RuntimeException(t); } } @@ -279,8 +274,6 @@ public void snapshot(boolean onShutdown) { // the last recording end time plus one nano second. 
The reason for this is that when // JFR is filtering the stream it will only discard earlier chunks that have an end // time that is before (not before or equal to) the requested start time of the filter. - log.debug("ANTITHESIS_ASSERT: Snapshot created (always) - lastSnapshot != null: {}", (lastSnapshot != null)); - Assert.always(lastSnapshot != null, "Snapshot created", null); lastSnapshot = recordingData.getEnd().plus(ONE_NANO); dataListener.onNewData(recordingType, recordingData, onShutdown); } else { @@ -288,8 +281,6 @@ public void snapshot(boolean onShutdown) { } } catch (final Exception e) { log.error(SEND_TELEMETRY, "Exception in profiling thread, continuing", e); - log.debug("ANTITHESIS_ASSERT: Exception in profiling thread, continuing (unreachable)", e); - Assert.unreachable("Exception in profiling thread, continuing", null); } catch (final Throwable t) { /* Try to continue even after fatal exception. It seems to be useful to attempt to store profile when this happens. @@ -302,8 +293,6 @@ public void snapshot(boolean onShutdown) { } catch (final Throwable t2) { // This should almost never happen and there is not much we can do here in cases like // OutOfMemoryError, so we will just ignore this. - log.debug("ANTITHESIS_ASSERT: Fatal exception in profiling thread, trying to continue (unreachable)"); - Assert.unreachable("Fatal exception in profiling thread, trying to continue", null); } } } diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java index 8f1820b7393..098b1b54bf3 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java +++ b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java @@ -56,7 +56,6 @@ import javax.annotation.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.antithesis.sdk.Assert; /** * It is currently assumed that this class can be initialised early so that Datadog profiler's * thread filter captures all tracing activity, which means it must not be modified to depend on @@ -189,8 +188,6 @@ public OngoingRecording start() { return new DatadogProfilerRecording(this); } catch (IOException | IllegalStateException e) { log.debug("Failed to start Datadog profiler recording", e); - log.debug("ANTITHESIS_ASSERT: Failed to start Datadog profiler recording (unreachable)"); - Assert.unreachable("Failed to start Datadog profiler recording", null); return null; } } @@ -205,16 +202,12 @@ public RecordingData stop(OngoingRecording recording) { void stopProfiler() { if (recordingFlag.compareAndSet(true, false)) { profiler.stop(); - log.debug("ANTITHESIS_ASSERT: Checking if profiling is still active after stop (sometimes) - active: {}", isActive()); - Assert.sometimes(isActive(),"Profiling is still active. Waiting to stop.", null); if (isActive()) { log.debug("Profiling is still active. 
Waiting to stop."); while (isActive()) { LockSupport.parkNanos(10_000_000L); } } - log.debug("ANTITHESIS_ASSERT: Profiling should be stopped (always) - active: {}", isActive()); - Assert.always(!isActive(),"Profiling is stopped", null); } } @@ -228,8 +221,6 @@ public boolean isActive() { log.debug("Datadog Profiler Status = {}", status); return !status.contains("not active"); } catch (IOException ignored) { - log.debug("ANTITHESIS_ASSERT: Failed to get Datadog profiler status (unreachable)"); - Assert.unreachable("Failed to get Datadog profiler status", null); } return false; } @@ -252,14 +243,10 @@ Path newRecording() throws IOException, IllegalStateException { log.warn("Unable to start Datadog profiler recording: {}", e.getMessage()); } recordingFlag.set(false); - log.debug("ANTITHESIS_ASSERT: Unable to start Datadog profiler recording (unreachable)"); - Assert.unreachable("Unable to start Datadog profiler recording", null); throw e; } return recFile; } - log.debug("ANTITHESIS_ASSERT: Datadog profiler session has already been started (unreachable)"); - Assert.unreachable("Datadog profiler session has already been started", null); throw new IllegalStateException("Datadog profiler session has already been started"); } diff --git a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java index 0ca6737d926..ab588da6e1a 100644 --- a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java +++ b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java @@ -69,7 +69,6 @@ import okhttp3.ResponseBody; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.antithesis.sdk.Assert; /** The class for uploading profiles to the backend. */ public final class ProfileUploader { @@ -302,8 +301,6 @@ public void onFailure(final Call call, final IOException e) { // But, in any case, we have this safety-break in place to prevent blocking finishing the // sync request to a misbehaving server. if (handled.compareAndSet(false, true)) { - log.debug("ANTITHESIS_ASSERT: Upload timeout (unreachable)"); - Assert.unreachable("Upload timeout", null); handleFailure(call, null, data, onCompletion); } } @@ -354,8 +351,6 @@ private void handleResponse( "Failed to upload profile, it's too big. 
Dumping information about the profile"); JfrCliHelper.invokeOn(data, ioLogger); } else { - log.debug("ANTITHESIS_ASSERT: Failed to upload profile (unreachable) - response code: {}", response.code()); - Assert.unreachable("Failed to upload profile", null); ioLogger.error("Failed to upload profile", getLoggerResponse(response)); } } diff --git a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java index 3d27d6c4cd1..c73b618edb8 100644 --- a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java +++ b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ProfilingAgent.java @@ -37,7 +37,6 @@ import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.antithesis.sdk.Assert; /** Profiling agent implementation */ public class ProfilingAgent { @@ -82,8 +81,6 @@ public void onNewData(RecordingType type, RecordingData data, boolean handleSync log.debug("Debug profile stored as {}", tmp); } catch (IOException e) { log.debug("Unable to write debug profile dump", e); - log.debug("ANTITHESIS_ASSERT: Unable to write debug profile dump (unreachable)"); - Assert.unreachable("Unable to write debug profile dump", null); } } } @@ -172,15 +169,11 @@ public static synchronized boolean run(final boolean earlyStart, Instrumentation This means that if/when we implement functionality to manually shutdown profiler we would need to not forget to add code that removes this shutdown hook from JVM. */ - log.debug("ANTITHESIS_ASSERT: Shutdown hook added (always) - uploader != null: {}", (uploader != null)); - Assert.always(uploader!= null, "Shutdown hook added", null); Runtime.getRuntime().addShutdownHook(new ShutdownHook(profiler, uploader)); } catch (final IllegalStateException ex) { // The JVM is already shutting down. } } catch (final UnsupportedEnvironmentException | ConfigurationException e) { - log.debug("ANTITHESIS_ASSERT: Failed to initialize profiling agent (unreachable)", e); - Assert.unreachable("Failed to initialize profiling agent!", null); ProfilerFlareLogger.getInstance().log("Failed to initialize profiling agent!", e); ProfilerFlareReporter.reportInitializationException(e); } diff --git a/example-export-only.sh b/example-export-only.sh deleted file mode 100755 index 2823d8c5c5c..00000000000 --- a/example-export-only.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -############################################################################## -# Example: Using fuzz-configs.sh in export-only mode -# -# This script demonstrates how to call fuzz-configs.sh from another script -# and use the exported environment variables. -############################################################################## - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -echo "======================================================================" -echo "Example: Using Fuzz Configs in Export-Only Mode" -echo "======================================================================" -echo "" - -# Enable export-only mode -export FUZZ_EXPORT_ONLY=true - -# Run the fuzzer once to export variables (no command execution) -echo "Step 1: Exporting random configuration variables..." 
-echo "----------------------------------------------------------------------" -source "${SCRIPT_DIR}/fuzz-configs.sh" 1 "" - -echo "" -echo "======================================================================" -echo "Step 2: Using the exported variables" -echo "======================================================================" -echo "" - -# Now you can use the exported variables -echo "Example 1: List all DD_ variables that were exported:" -env | grep "^DD_" | head -20 -echo "" - -echo "Example 2: Run your command with the exported variables:" -echo "java -javaagent:dd-java-agent.jar -jar myapp.jar" -echo "" - -echo "Example 3: Or run multiple commands with the same configuration:" -echo "----------------------------------------------------------------------" -echo "Command 1: Check configuration" -env | grep "^DD_" | wc -l -echo "Total DD_ variables exported: $(env | grep "^DD_" | wc -l)" - -echo "" -echo "Command 2: You can now run your Java application" -echo "(Skipping actual execution for this demo)" -# java -javaagent:dd-java-agent.jar -jar myapp.jar - -echo "" -echo "======================================================================" -echo "Complete!" -echo "======================================================================" -echo "" -echo "The exported variables remain in your shell environment until you" -echo "unset them or the shell session ends." -echo "" -echo "To use this pattern in your own script:" -echo " 1. Set: export FUZZ_EXPORT_ONLY=true" -echo " 2. Source: source ./fuzz-configs.sh 1 \"\"" -echo " 3. Use the DD_* environment variables as needed" -echo " 4. Run your Java application with the exported configs" -echo "" - diff --git a/example-fuzz.sh b/example-fuzz.sh deleted file mode 100755 index 6770024d6d3..00000000000 --- a/example-fuzz.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -############################################################################## -# Example script showing how to use the dd-trace-java fuzzer -# This creates a minimal test application for demonstration -############################################################################## - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -echo "DD-Trace-Java Fuzzer - Quick Start Example" -echo "==========================================" -echo "" - -# Check if fuzz-configs.sh exists -if [ ! -f "${SCRIPT_DIR}/fuzz-configs.sh" ]; then - echo "Error: fuzz-configs.sh not found in ${SCRIPT_DIR}" - exit 1 -fi - -# Check if jq is installed -if ! command -v jq &> /dev/null; then - echo "Error: jq is not installed. Please install it first:" - echo " macOS: brew install jq" - echo " Ubuntu/Debian: sudo apt-get install jq" - echo " CentOS/RHEL: sudo yum install jq" - exit 1 -fi - -echo "Running fuzzer with 5 test iterations..." -echo "" -echo "This will generate random dd-trace-java configurations and" -echo "run a simple echo command to demonstrate the fuzzer." -echo "" -echo "For real testing, replace the command with your Java application:" -echo " ./fuzz-configs.sh 10 'java -javaagent:dd-java-agent.jar -jar myapp.jar'" -echo "" -echo "Starting in 3 seconds..." -sleep 3 - -# Run the fuzzer with a simple echo command for demonstration -"${SCRIPT_DIR}/fuzz-configs.sh" 5 "echo 'Test run completed with above configuration'" - -echo "" -echo "==========================================" -echo "Example completed!" -echo "" -echo "Check the fuzz-logs/ directory for detailed logs of each run." -echo "" -echo "Next steps:" -echo "1. 
Review FUZZ_README.md for detailed documentation" -echo "2. Run with your actual Java application" -echo "3. Analyze the logs to identify any configuration issues" -echo "" - diff --git a/example-use-export-vars.sh b/example-use-export-vars.sh deleted file mode 100755 index 461e7ecd00e..00000000000 --- a/example-use-export-vars.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -############################################################################## -# Example: Using fuzz-export-vars.sh to export random configurations -# -# This demonstrates how to export DD configuration variables from another -# script and use them to run your Java application. -############################################################################## - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -echo "======================================================================" -echo "Example: Export Random DD Configurations" -echo "======================================================================" -echo "" - -echo "Step 1: Export random configuration variables" -echo "----------------------------------------------------------------------" - -# Export variables using eval -eval "$(${SCRIPT_DIR}/fuzz-export-vars.sh)" - -echo "" -echo "======================================================================" -echo "Step 2: View exported variables" -echo "======================================================================" -echo "" - -# Show all exported DD_ variables -echo "Exported DD_ configuration variables:" -env | grep "^DD_" | sort -echo "" -echo "Total: $(env | grep "^DD_" | wc -l | tr -d " ") DD_ variables exported" -echo "" - -echo "======================================================================" -echo "Step 3: Run your Java application" -echo "======================================================================" -echo "" - -echo "Now you can run your Java application with these configurations:" -echo "" -echo " java -javaagent:dd-java-agent.jar -jar myapp.jar" -echo "" -echo "Or any other command that uses DD_ environment variables." -echo "" - -echo "======================================================================" -echo "Additional Examples" -echo "======================================================================" -echo "" - -echo "1. Export and run immediately:" -echo " eval \"\$(./fuzz-export-vars.sh)\" && java -jar myapp.jar" -echo "" - -echo "2. Export specific number of parameters:" -echo " FUZZ_MAX_PARAMS=5 eval \"\$(./fuzz-export-vars.sh)\"" -echo "" - -echo "3. Use in a loop for multiple test runs:" -echo " for i in {1..10}; do" -echo " unset \$(env | grep '^DD_' | cut -d'=' -f1)" -echo " eval \"\$(./fuzz-export-vars.sh)\"" -echo " java -jar myapp.jar" -echo " done" -echo "" - -echo "======================================================================" -echo "Complete!" -echo "======================================================================" - diff --git a/fuzz-ci.sh b/fuzz-ci.sh deleted file mode 100755 index 4f310893c6b..00000000000 --- a/fuzz-ci.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/bash - -############################################################################## -# DD-Trace-Java Fuzzer - CI/CD Integration Script -# -# This script can be used in CI/CD pipelines to run configuration fuzzing -# as part of automated testing. 
-############################################################################## - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Configuration - adjust these for your CI environment -ITERATIONS="${FUZZ_ITERATIONS:-20}" -JAVA_CMD="${FUZZ_JAVA_CMD:-java -jar app.jar}" -FAILURE_THRESHOLD="${FUZZ_FAILURE_THRESHOLD:-10}" # Maximum % of failures allowed - -# Colors (disabled in non-interactive mode) -if [ -t 1 ]; then - RED='\033[0;31m' - GREEN='\033[0;32m' - YELLOW='\033[1;33m' - BLUE='\033[0;34m' - NC='\033[0m' -else - RED='' - GREEN='' - YELLOW='' - BLUE='' - NC='' -fi - -echo "==========================================" -echo "DD-Trace-Java Fuzzer - CI/CD Mode" -echo "==========================================" -echo "" -echo "Configuration:" -echo " Iterations: $ITERATIONS" -echo " Command: $JAVA_CMD" -echo " Failure threshold: ${FAILURE_THRESHOLD}%" -echo "" - -# Check prerequisites -if ! command -v jq &> /dev/null; then - echo -e "${RED}Error: jq is not installed${NC}" - echo "Install it before running this script:" - echo " Ubuntu/Debian: apt-get install jq" - echo " CentOS/RHEL: yum install jq" - echo " macOS: brew install jq" - exit 1 -fi - -if [ ! -f "${SCRIPT_DIR}/fuzz-configs.sh" ]; then - echo -e "${RED}Error: fuzz-configs.sh not found in ${SCRIPT_DIR}${NC}" - exit 1 -fi - -# Run fuzzer -echo "Starting fuzz testing..." -echo "" - -if "${SCRIPT_DIR}/fuzz-configs.sh" "$ITERATIONS" "$JAVA_CMD"; then - FUZZ_EXIT_CODE=0 -else - FUZZ_EXIT_CODE=$? -fi - -echo "" -echo "==========================================" -echo "Analyzing Results" -echo "==========================================" -echo "" - -# Analyze results -LOG_DIR="${SCRIPT_DIR}/fuzz-logs" -if [ ! -d "$LOG_DIR" ]; then - echo -e "${RED}Error: Log directory not found${NC}" - exit 1 -fi - -# Count successes and failures -TOTAL_LOGS=$(find "$LOG_DIR" -name "fuzz_run_*.log" | wc -l) -SUCCESSFUL_RUNS=0 -FAILED_RUNS=0 - -for log_file in "$LOG_DIR"/fuzz_run_*.log; do - if grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then - ((SUCCESSFUL_RUNS++)) - else - ((FAILED_RUNS++)) - fi -done - -if [ "$TOTAL_LOGS" -gt 0 ]; then - FAILURE_RATE=$((FAILED_RUNS * 100 / TOTAL_LOGS)) -else - FAILURE_RATE=0 -fi - -echo "Results:" -echo " Total runs: $TOTAL_LOGS" -echo " Successful: $SUCCESSFUL_RUNS" -echo " Failed: $FAILED_RUNS" -echo " Failure rate: ${FAILURE_RATE}%" -echo "" - -# Check against threshold -if [ "$FAILURE_RATE" -gt "$FAILURE_THRESHOLD" ]; then - echo -e "${RED}✗ FAILED: Failure rate (${FAILURE_RATE}%) exceeds threshold (${FAILURE_THRESHOLD}%)${NC}" - echo "" - echo "Failed runs:" - for log_file in "$LOG_DIR"/fuzz_run_*.log; do - if ! 
grep -q "Test completed\|✓\|SUCCESS\|Started" "$log_file" 2>/dev/null; then - echo " - $(basename "$log_file")" - echo " Configuration:" - grep "^DD_" "$log_file" | grep -v "^#" | head -5 | sed 's/^/ /' - fi - done - echo "" - echo "For detailed analysis, review logs in: $LOG_DIR" - exit 1 -else - echo -e "${GREEN}✓ PASSED: Failure rate (${FAILURE_RATE}%) is within threshold (${FAILURE_THRESHOLD}%)${NC}" - - if [ "$FAILED_RUNS" -gt 0 ]; then - echo "" - echo -e "${YELLOW}Note: $FAILED_RUNS run(s) failed but within acceptable threshold${NC}" - fi - - exit 0 -fi - diff --git a/fuzz-configs.sh b/fuzz-configs.sh deleted file mode 100755 index 3aa058d682a..00000000000 --- a/fuzz-configs.sh +++ /dev/null @@ -1,374 +0,0 @@ -#!/bin/bash - -############################################################################## -# DD-Trace-Java Configuration Fuzzer -# -# This script generates random but sensible configuration values for -# dd-trace-java and runs your application with them for testing. -# -# Usage: ./fuzz-configs.sh -# -# Example: ./fuzz-configs.sh 10 "java -jar myapp.jar" -# -# Export-Only Mode: -# Set FUZZ_EXPORT_ONLY=true to only export variables without running command. -# This allows you to source the script from another script and use the vars. -# -# Example: FUZZ_EXPORT_ONLY=true source ./fuzz-configs.sh 1 "" -############################################################################## - -set -e - -# Configuration -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CONFIG_FILE="${SCRIPT_DIR}/metadata/supported-configurations.json" -MAX_PARAMS_PER_RUN=10 -LOG_DIR="${SCRIPT_DIR}/fuzz-logs" -ITERATIONS="${1:-5}" -JAVA_CMD="${2:-echo 'No Java command specified. Using echo for testing'}" - -# Export-only mode: if set to "true", only exports variables without running command -EXPORT_ONLY_MODE="${FUZZ_EXPORT_ONLY:-false}" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# Create log directory -mkdir -p "$LOG_DIR" - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} DD-Trace-Java Configuration Fuzzer${NC}" -echo -e "${BLUE}==================================================================${NC}" -echo "" - -# Extract all configuration keys from the JSON file -if [ ! 
-f "$CONFIG_FILE" ]; then - echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}" - exit 1 -fi - -echo -e "${YELLOW}Extracting configuration parameters...${NC}" -CONFIGS=($(jq -r '.supportedConfigurations | keys[]' "$CONFIG_FILE")) -TOTAL_CONFIGS=${#CONFIGS[@]} -echo -e "${GREEN}Found $TOTAL_CONFIGS configuration parameters${NC}" -echo "" - -############################################################################## -# Function to generate a random boolean value -############################################################################## -generate_boolean() { - local values=("true" "false" "1" "0") - echo "${values[$((RANDOM % ${#values[@]}))]}" -} - -############################################################################## -# Function to generate a random integer -############################################################################## -generate_integer() { - local param_name="$1" - - # Analyze parameter name for hints about range - if [[ "$param_name" =~ PORT ]]; then - echo $((1024 + RANDOM % 64512)) # Port range: 1024-65535 - elif [[ "$param_name" =~ TIMEOUT|DELAY ]]; then - echo $((100 + RANDOM % 30000)) # Timeout: 100-30000ms - elif [[ "$param_name" =~ SIZE|LIMIT|MAX|DEPTH ]]; then - local max_values=(10 50 100 500 1000 5000 10000) - echo "${max_values[$((RANDOM % ${#max_values[@]}))]}" - elif [[ "$param_name" =~ COUNT|NUM ]]; then - echo $((1 + RANDOM % 100)) # Count: 1-100 - elif [[ "$param_name" =~ RATE|PERCENT ]]; then - echo $((RANDOM % 101)) # Rate: 0-100 - else - echo $((RANDOM % 1000)) # Default: 0-999 - fi -} - -############################################################################## -# Function to generate a random float/rate -############################################################################## -generate_float() { - local param_name="$1" - - if [[ "$param_name" =~ RATE|SAMPLE ]]; then - # Sample rates typically 0.0-1.0 - echo "0.$((RANDOM % 100))" - elif [[ "$param_name" =~ INTERVAL ]]; then - # Intervals can be larger - echo "$((1 + RANDOM % 60)).$((RANDOM % 100))" - else - echo "$((RANDOM % 100)).$((RANDOM % 100))" - fi -} - -############################################################################## -# Function to generate a random string value -############################################################################## -generate_string() { - local param_name="$1" - - # Analyze parameter name for appropriate string type - if [[ "$param_name" =~ ^DD_ENV$ ]]; then - local envs=("production" "staging" "development" "test" "qa") - echo "${envs[$((RANDOM % ${#envs[@]}))]}" - - elif [[ "$param_name" =~ ^DD_SERVICE$ ]]; then - local services=("my-service" "web-app" "api-gateway" "microservice-${RANDOM}") - echo "${services[$((RANDOM % ${#services[@]}))]}" - - elif [[ "$param_name" =~ ^DD_VERSION$ ]]; then - echo "v$((1 + RANDOM % 3)).$((RANDOM % 10)).$((RANDOM % 20))" - - elif [[ "$param_name" =~ HOST|HOSTNAME ]]; then - local hosts=("localhost" "127.0.0.1" "agent.local" "192.168.1.100" "datadog-agent") - echo "${hosts[$((RANDOM % ${#hosts[@]}))]}" - - elif [[ "$param_name" =~ URL|ENDPOINT|URI ]]; then - local urls=("http://localhost:8080" "https://api.example.com" "http://127.0.0.1:9000" "https://agent.datadoghq.com") - echo "${urls[$((RANDOM % ${#urls[@]}))]}" - - elif [[ "$param_name" =~ PATH|FILE|DIR ]]; then - local paths=("/tmp/test" "/var/log/app" "/opt/datadog" "./config" "/etc/datadog") - echo "${paths[$((RANDOM % ${#paths[@]}))]}" - - elif [[ "$param_name" =~ KEY|TOKEN ]]; then - # Generate random hex string - 
echo "$(head -c 16 /dev/urandom | xxd -p -c 32)" - - elif [[ "$param_name" =~ LEVEL ]]; then - local levels=("DEBUG" "INFO" "WARN" "ERROR" "TRACE" "OFF") - echo "${levels[$((RANDOM % ${#levels[@]}))]}" - - elif [[ "$param_name" =~ MODE ]]; then - local modes=("full" "service" "disabled" "safe" "extended") - echo "${modes[$((RANDOM % ${#modes[@]}))]}" - - elif [[ "$param_name" =~ TAGS$ ]]; then - local tag_count=$((1 + RANDOM % 3)) - local tags=() - for ((i=0; i "$log_file" - - # Export all selected config variables using the same values - for param in "${!param_values[@]}"; do - export "${param}=${param_values[$param]}" - done - - # If in export-only mode, skip running the command - local exit_code=0 - if [ "$EXPORT_ONLY_MODE" = "true" ]; then - echo "" - echo -e "${GREEN}✓ Variables exported successfully (export-only mode)${NC}" - echo -e "${YELLOW}Note: Variables are exported in the current shell environment${NC}" - else - # Set environment variables and run command - echo "" - echo -e "${YELLOW}Running application...${NC}" - - # Run the command with timeout - if timeout 30s bash -c "$JAVA_CMD" >> "$log_file" 2>&1; then - echo -e "${GREEN}✓ Iteration $iteration completed successfully${NC}" - else - exit_code=$? - if [ $exit_code -eq 124 ]; then - echo -e "${YELLOW}⚠ Iteration $iteration timed out (30s limit)${NC}" - else - echo -e "${RED}✗ Iteration $iteration failed with exit code: $exit_code${NC}" - fi - fi - - # Clean up environment variables after running - for idx in "${selected_indices[@]}"; do - local param="${CONFIGS[$idx]}" - unset "$param" - done - fi - - echo -e "${BLUE}Log saved to: $log_file${NC}" - echo "" - - return $exit_code -} - -############################################################################## -# Main execution -############################################################################## - -# Validate iterations parameter -if ! [[ "$ITERATIONS" =~ ^[0-9]+$ ]] || [ "$ITERATIONS" -lt 1 ]; then - echo -e "${RED}Error: Invalid iterations count: $ITERATIONS${NC}" - echo "Usage: $0 " - exit 1 -fi - -echo -e "${YELLOW}Starting fuzzer with $ITERATIONS iterations${NC}" -echo -e "${YELLOW}Maximum $MAX_PARAMS_PER_RUN parameters per run${NC}" -if [ "$EXPORT_ONLY_MODE" = "true" ]; then - echo -e "${YELLOW}Mode: Export-only (variables will be exported, command will not run)${NC}" -else - echo -e "${YELLOW}Java command: $JAVA_CMD${NC}" -fi -echo "" - -# Track statistics -successful_runs=0 -failed_runs=0 -timeout_runs=0 - -# Run iterations -for ((i=1; i<=ITERATIONS; i++)); do - run_fuzz_iteration $i - exit_code=$? - - if [ $exit_code -eq 0 ]; then - ((successful_runs++)) - elif [ $exit_code -eq 124 ]; then - ((timeout_runs++)) - else - ((failed_runs++)) - fi - - # Brief pause between iterations - if [ $i -lt $ITERATIONS ]; then - sleep 2 - fi -done - -# Print summary -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} Fuzzing Complete - Summary${NC}" -echo -e "${BLUE}==================================================================${NC}" -echo -e "Total iterations: ${BLUE}$ITERATIONS${NC}" -echo -e "Successful runs: ${GREEN}$successful_runs${NC}" -echo -e "Failed runs: ${RED}$failed_runs${NC}" -echo -e "Timeout runs: ${YELLOW}$timeout_runs${NC}" -echo -e "Logs directory: ${BLUE}$LOG_DIR${NC}" -echo "" - -if [ $failed_runs -gt 0 ]; then - echo -e "${RED}⚠ Some runs failed. 
Check logs for details.${NC}" - exit 1 -else - echo -e "${GREEN}✓ All runs completed without failures!${NC}" - exit 0 -fi - diff --git a/fuzz-export-vars.sh b/fuzz-export-vars.sh deleted file mode 100755 index 429720d5d3c..00000000000 --- a/fuzz-export-vars.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/bin/bash - -############################################################################## -# DD-Trace-Java Configuration Fuzzer - Export Generator -# -# This script generates export statements for random dd-trace-java -# configuration parameters. Use it with eval to export variables. -# -# Usage: eval "$(./fuzz-export-vars.sh)" -# -# Example: -# eval "$(./fuzz-export-vars.sh)" -# java -javaagent:dd-java-agent.jar -jar myapp.jar -############################################################################## - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CONFIG_FILE="${SCRIPT_DIR}/metadata/supported-configurations.json" -MAX_PARAMS="${FUZZ_MAX_PARAMS:-10}" - -# Extract all configuration keys -if [ ! -f "$CONFIG_FILE" ]; then - echo "# Error: Configuration file not found: $CONFIG_FILE" >&2 - exit 1 -fi - -CONFIGS=($(jq -r '.supportedConfigurations | keys[]' "$CONFIG_FILE" 2>/dev/null)) -TOTAL_CONFIGS=${#CONFIGS[@]} - -if [ "$TOTAL_CONFIGS" -eq 0 ]; then - echo "# Error: No configurations found" >&2 - exit 1 -fi - -############################################################################## -# Value generation functions (same as fuzz-configs.sh) -############################################################################## - -generate_boolean() { - local values=("true" "false" "1" "0") - echo "${values[$((RANDOM % ${#values[@]}))]}" -} - -generate_integer() { - local param_name="$1" - if [[ "$param_name" =~ PORT ]]; then - echo $((1024 + RANDOM % 64512)) - elif [[ "$param_name" =~ TIMEOUT|DELAY ]]; then - echo $((100 + RANDOM % 30000)) - elif [[ "$param_name" =~ SIZE|LIMIT|MAX|DEPTH ]]; then - local max_values=(10 50 100 500 1000 5000 10000) - echo "${max_values[$((RANDOM % ${#max_values[@]}))]}" - elif [[ "$param_name" =~ COUNT|NUM ]]; then - echo $((1 + RANDOM % 100)) - elif [[ "$param_name" =~ RATE|PERCENT ]]; then - echo $((RANDOM % 101)) - else - echo $((RANDOM % 1000)) - fi -} - -generate_float() { - local param_name="$1" - if [[ "$param_name" =~ RATE|SAMPLE ]]; then - echo "0.$((RANDOM % 100))" - elif [[ "$param_name" =~ INTERVAL ]]; then - echo "$((1 + RANDOM % 60)).$((RANDOM % 100))" - else - echo "$((RANDOM % 100)).$((RANDOM % 100))" - fi -} - -generate_string() { - local param_name="$1" - - if [[ "$param_name" =~ ^DD_ENV$ ]]; then - local envs=("production" "staging" "development" "test" "qa") - echo "${envs[$((RANDOM % ${#envs[@]}))]}" - elif [[ "$param_name" =~ ^DD_SERVICE$ ]]; then - local services=("my-service" "web-app" "api-gateway" "microservice-${RANDOM}") - echo "${services[$((RANDOM % ${#services[@]}))]}" - elif [[ "$param_name" =~ ^DD_VERSION$ ]]; then - echo "v$((1 + RANDOM % 3)).$((RANDOM % 10)).$((RANDOM % 20))" - elif [[ "$param_name" =~ HOST|HOSTNAME ]]; then - local hosts=("localhost" "127.0.0.1" "agent.local" "192.168.1.100" "datadog-agent") - echo "${hosts[$((RANDOM % ${#hosts[@]}))]}" - elif [[ "$param_name" =~ URL|ENDPOINT|URI ]]; then - local urls=("http://localhost:8080" "https://api.example.com" "http://127.0.0.1:9000") - echo "${urls[$((RANDOM % ${#urls[@]}))]}" - elif [[ "$param_name" =~ PATH|FILE|DIR ]]; then - local paths=("/tmp/test" "/var/log/app" "/opt/datadog" "./config") - echo "${paths[$((RANDOM % ${#paths[@]}))]}" - elif [[ "$param_name" 
=~ KEY|TOKEN ]]; then - echo "$(head -c 16 /dev/urandom | xxd -p -c 32)" - elif [[ "$param_name" =~ LEVEL ]]; then - local levels=("DEBUG" "INFO" "WARN" "ERROR" "TRACE") - echo "${levels[$((RANDOM % ${#levels[@]}))]}" - elif [[ "$param_name" =~ MODE ]]; then - local modes=("full" "service" "disabled" "safe") - echo "${modes[$((RANDOM % ${#modes[@]}))]}" - elif [[ "$param_name" =~ TAGS$ ]]; then - echo "key1:value${RANDOM},key2:value${RANDOM}" - elif [[ "$param_name" =~ PROPAGATION_STYLE ]]; then - local styles=("datadog" "b3" "tracecontext" "datadog,b3") - echo "${styles[$((RANDOM % ${#styles[@]}))]}" - else - local generic=("test-value" "example" "config-${RANDOM}" "auto" "default") - echo "${generic[$((RANDOM % ${#generic[@]}))]}" - fi -} - -generate_value() { - local param_name="$1" - - if [[ "$param_name" =~ ENABLED$|^DD_TRACE_ENABLED$|DEBUG$|COLLECT|HEADER_COLLECTION|REPORTING|SPLIT_BY ]]; then - generate_boolean - elif [[ "$param_name" =~ PORT$|TIMEOUT$|DELAY$|SIZE$|LIMIT$|MAX_|DEPTH$|COUNT$|QUEUE_SIZE$|BUFFER ]]; then - generate_integer "$param_name" - elif [[ "$param_name" =~ SAMPLE_RATE$|_RATE$ ]] && [[ ! "$param_name" =~ TRACE_RATE_LIMIT ]]; then - generate_float "$param_name" - elif [[ "$param_name" =~ INTERVAL$ ]]; then - if [[ "$param_name" =~ FLUSH_INTERVAL ]]; then - generate_float "$param_name" - else - generate_integer "$param_name" - fi - else - generate_string "$param_name" - fi -} - -############################################################################## -# Generate export statements -############################################################################## - -# Determine number of parameters -num_params=$((1 + RANDOM % MAX_PARAMS)) - -# Select random parameters -selected_indices=() -while [ ${#selected_indices[@]} -lt $num_params ]; do - idx=$((RANDOM % TOTAL_CONFIGS)) - if [[ ! " ${selected_indices[@]} " =~ " ${idx} " ]]; then - selected_indices+=($idx) - fi -done - -# Output comment header (to stderr so it doesn't affect eval) -echo "# Exporting $num_params random DD configuration parameters..." >&2 - -# Generate export statements -for idx in "${selected_indices[@]}"; do - param="${CONFIGS[$idx]}" - value=$(generate_value "$param") - # Output the export statement - echo "export ${param}='${value}'" - # Log to stderr - echo "# ${param}=${value}" >&2 -done - -echo "# Export complete!" >&2 - diff --git a/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java b/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java index 332a406865d..205dec18af3 100644 --- a/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java +++ b/remote-config/remote-config-core/src/main/java/datadog/remoteconfig/DefaultConfigurationPoller.java @@ -51,7 +51,6 @@ import okio.ByteString; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.antithesis.sdk.Assert; /** Handles polling debugger configuration from datadog agent/Remote Configuration */ public class DefaultConfigurationPoller @@ -283,8 +282,6 @@ private boolean initialize() { } catch (Exception e) { // We can't recover from this, so we'll not try to initialize again. 
log.error("Remote configuration poller initialization failed", e); - log.debug("ANTITHESIS_ASSERT: Remote configuration poller initialization failed (unreachable)", e); - Assert.unreachable("Remote configuration poller initialization failed", null); fatalOnInitialization = true; } return true; @@ -382,8 +379,6 @@ private void handleAgentResponse(ResponseBody body) { } catch (Exception e) { // no error can be reported, as we don't have the data client.state.targets_version avail ratelimitedLogger.warn("Error parsing remote config response", e); - log.debug("ANTITHESIS_ASSERT: Error parsing remote config response (unreachable)", e); - Assert.unreachable("Error parsing remote config response", null); return; } @@ -451,8 +446,6 @@ private void runConfigurationEndListener( ConfigurationEndListener listener, List errors) { try { listener.onConfigurationEnd(); - log.debug("ANTITHESIS_ASSERT: Configuration end listener should always be reachable (reachable)"); - Assert.reachable("Configuration end listener should always be reachable", null); } catch (ReportableException re) { errors.add(re); } catch (RuntimeException rte) { @@ -461,8 +454,6 @@ private void runConfigurationEndListener( // is about combining configuration from different products ratelimitedLogger.warn( "Error running configuration listener {}: {}", listener, rte.getMessage(), rte); - log.debug("ANTITHESIS_ASSERT: Error running configuration listener (unreachable)", rte); - Assert.unreachable("Error running configuration listener", null); } } diff --git a/report-config-types.sh b/report-config-types.sh deleted file mode 100755 index 47acc646087..00000000000 --- a/report-config-types.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash - -############################################################################## -# DD-Trace-Java Configuration Type Reporter -# -# Analyzes all configuration parameters and reports their detected types -# based on naming patterns used by the fuzzer. -############################################################################## - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CONFIG_FILE="${SCRIPT_DIR}/metadata/supported-configurations.json" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} DD-Trace-Java Configuration Type Report${NC}" -echo -e "${BLUE}==================================================================${NC}" -echo "" - -# Check if config file exists -if [ ! 
-f "$CONFIG_FILE" ]; then - echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}" - exit 1 -fi - -# Extract all configuration keys -CONFIGS=($(jq -r '.supportedConfigurations | keys[]' "$CONFIG_FILE")) -TOTAL=${#CONFIGS[@]} - -echo -e "${GREEN}Analyzing $TOTAL configuration parameters...${NC}" -echo "" - -# Initialize counters -boolean_count=0 -integer_count=0 -float_count=0 -string_count=0 - -boolean_params=() -integer_params=() -float_params=() -string_params=() - -############################################################################## -# Classify each parameter -############################################################################## - -for param in "${CONFIGS[@]}"; do - # Determine type based on parameter name patterns (same logic as fuzzer) - if [[ "$param" =~ ENABLED$|^DD_TRACE_ENABLED$|DEBUG$|COLLECT|HEADER_COLLECTION|REPORTING|SPLIT_BY ]]; then - ((boolean_count++)) - boolean_params+=("$param") - elif [[ "$param" =~ PORT$|TIMEOUT$|DELAY$|SIZE$|LIMIT$|MAX_|DEPTH$|COUNT$|QUEUE_SIZE$|BUFFER ]]; then - ((integer_count++)) - integer_params+=("$param") - elif [[ "$param" =~ SAMPLE_RATE$|_RATE$ ]] && [[ ! "$param" =~ TRACE_RATE_LIMIT ]]; then - ((float_count++)) - float_params+=("$param") - elif [[ "$param" =~ INTERVAL$ ]]; then - if [[ "$param" =~ FLUSH_INTERVAL ]]; then - ((float_count++)) - float_params+=("$param") - else - ((integer_count++)) - integer_params+=("$param") - fi - else - ((string_count++)) - string_params+=("$param") - fi -done - -############################################################################## -# Display Summary -############################################################################## - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} Type Distribution${NC}" -echo -e "${BLUE}==================================================================${NC}" - -echo -e "${GREEN}Boolean parameters:${NC} $boolean_count ($(( boolean_count * 100 / TOTAL ))%)" -echo -e "${GREEN}Integer parameters:${NC} $integer_count ($(( integer_count * 100 / TOTAL ))%)" -echo -e "${GREEN}Float parameters:${NC} $float_count ($(( float_count * 100 / TOTAL ))%)" -echo -e "${GREEN}String parameters:${NC} $string_count ($(( string_count * 100 / TOTAL ))%)" -echo "" - -############################################################################## -# Display samples -############################################################################## - -echo -e "${BLUE}==================================================================${NC}" -echo -e "${BLUE} Sample Parameters by Type${NC}" -echo -e "${BLUE}==================================================================${NC}" - -echo -e "${YELLOW}Boolean Parameters (sample of ${boolean_count}):${NC}" -printf '%s\n' "${boolean_params[@]}" | head -10 -if [ ${#boolean_params[@]} -gt 10 ]; then - echo " ... and $((boolean_count - 10)) more" -fi -echo "" - -echo -e "${YELLOW}Integer Parameters (sample of ${integer_count}):${NC}" -printf '%s\n' "${integer_params[@]}" | head -10 -if [ ${#integer_params[@]} -gt 10 ]; then - echo " ... and $((integer_count - 10)) more" -fi -echo "" - -echo -e "${YELLOW}Float Parameters (sample of ${float_count}):${NC}" -printf '%s\n' "${float_params[@]}" | head -10 -if [ ${#float_params[@]} -gt 10 ]; then - echo " ... 
and $((float_count - 10)) more" -fi -echo "" - -echo -e "${YELLOW}String Parameters (sample of ${string_count}):${NC}" -printf '%s\n' "${string_params[@]}" | head -20 -if [ ${#string_params[@]} -gt 20 ]; then - echo " ... and $((string_count - 20)) more" -fi -echo "" - -############################################################################## -# Export options -############################################################################## - -if [ "$1" = "--export" ]; then - OUTPUT_FILE="${SCRIPT_DIR}/config-types-report.json" - - echo -e "${BLUE}Exporting to JSON: $OUTPUT_FILE${NC}" - - jq -n \ - --arg total "$TOTAL" \ - --arg boolean_count "$boolean_count" \ - --arg integer_count "$integer_count" \ - --arg float_count "$float_count" \ - --arg string_count "$string_count" \ - --argjson boolean "$(printf '%s\n' "${boolean_params[@]}" | jq -R . | jq -s .)" \ - --argjson integer "$(printf '%s\n' "${integer_params[@]}" | jq -R . | jq -s .)" \ - --argjson float "$(printf '%s\n' "${float_params[@]}" | jq -R . | jq -s .)" \ - --argjson string "$(printf '%s\n' "${string_params[@]}" | jq -R . | jq -s .)" \ - '{ - total: $total, - summary: { - boolean: $boolean_count, - integer: $integer_count, - float: $float_count, - string: $string_count - }, - parameters: { - boolean: $boolean, - integer: $integer, - float: $float, - string: $string - } - }' > "$OUTPUT_FILE" - - echo -e "${GREEN}Report exported successfully!${NC}" -fi - -echo -e "${BLUE}==================================================================${NC}" -echo "" -echo "To export this report as JSON, run:" -echo " $0 --export" -echo "" - diff --git a/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java b/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java index 1abd3d18067..91ec8c52162 100644 --- a/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java +++ b/telemetry/src/main/java/datadog/telemetry/dependency/DependencyResolver.java @@ -1,8 +1,5 @@ package datadog.telemetry.dependency; -import com.antithesis.sdk.Assert; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -44,18 +41,6 @@ static List internalResolve(final URI uri) throws IOException { return Collections.emptyList(); } if (metadata.isDirectory) { - // Antithesis: Track when dependencies are extracted from directories - ObjectNode directoryDetails = JsonNodeFactory.instance.objectNode(); - directoryDetails.put("uri", uri.toString()); - directoryDetails.put("scheme", scheme); - directoryDetails.put("is_directory", true); - - log.debug("ANTITHESIS_ASSERT: Directory dependency extraction attempted (sometimes) - uri: {}", uri); - Assert.sometimes( - metadata.isDirectory, - "Directory dependencies are encountered - tracking unsupported dependency type", - directoryDetails); - log.debug("Extracting dependencies from directories is not supported: {}", uri); return Collections.emptyList(); } From 34a001965b72d359ea84b602bdd759d9e2d3928c Mon Sep 17 00:00:00 2001 From: Roberto Montero Date: Wed, 17 Dec 2025 16:35:38 +0100 Subject: [PATCH 13/15] Fix Java 8 compatibility: Replace Map.of() with HashMap Replaced all java.util.Map.of() calls with HashMap initialization to support Java 8 compilation. Map.of() was introduced in Java 9. 
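For illustration, this is the substitution pattern applied throughout
(a minimal sketch; the variable name and values are illustrative, not
taken from the patched files):

    // Java 9+ only -- does not compile on Java 8:
    // java.util.Map<String, Object> details =
    //     java.util.Map.of("trace_count", 3, "success", true);

    // Java 8-compatible equivalent used in this patch:
    java.util.Map<String, Object> details = new java.util.HashMap<>();
    details.put("trace_count", 3); // autoboxed to Integer
    details.put("success", true);  // autoboxed to Boolean

A side effect worth noting: unlike Map.of(), HashMap tolerates null
values, so assembling the assertion details cannot itself throw a
NullPointerException.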
Fixed in:
- CoreTracer.java
- RemoteWriter.java
- PayloadDispatcherImpl.java
---
 .../common/writer/PayloadDispatcherImpl.java | 50 ++++++----------
 .../trace/common/writer/RemoteWriter.java    | 60 +++++++------------
 .../java/datadog/trace/core/CoreTracer.java  | 32 ++++------
 3 files changed, 53 insertions(+), 89 deletions(-)

diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
index 52eff743814..64fb3c8fc27 100644
--- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
+++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
@@ -110,49 +110,37 @@ public void accept(int messageCount, ByteBuffer buffer) {
       healthMetrics.onSerialize(sizeInBytes);

       // Antithesis: Track all send attempts
-      Assert.sometimes(
-          true,
-          "trace_payloads_being_sent",
-          java.util.Map.of(
-              "trace_count", messageCount,
-              "payload_size_bytes", sizeInBytes,
-              "dropped_traces_in_payload", payload.droppedTraces(),
-              "dropped_spans_in_payload", payload.droppedSpans()
-          )
-      );
+      java.util.Map<String, Object> sendAttemptDetails = new java.util.HashMap<>();
+      sendAttemptDetails.put("trace_count", messageCount);
+      sendAttemptDetails.put("payload_size_bytes", sizeInBytes);
+      sendAttemptDetails.put("dropped_traces_in_payload", payload.droppedTraces());
+      sendAttemptDetails.put("dropped_spans_in_payload", payload.droppedSpans());
+      Assert.sometimes(true, "trace_payloads_being_sent", sendAttemptDetails);

       RemoteApi.Response response = api.sendSerializedTraces(payload);
       mapper.reset();

       if (response.success()) {
         // Antithesis: Track successful sends
-        Assert.sometimes(
-            true,
-            "traces_sent_successfully",
-            java.util.Map.of(
-                "decision", "sent_success",
-                "trace_count", messageCount,
-                "payload_size_bytes", sizeInBytes,
-                "http_status", response.status()
-            )
-        );
+        java.util.Map<String, Object> successDetails = new java.util.HashMap<>();
+        successDetails.put("decision", "sent_success");
+        successDetails.put("trace_count", messageCount);
+        successDetails.put("payload_size_bytes", sizeInBytes);
+        successDetails.put("http_status", response.status());
+        Assert.sometimes(true, "traces_sent_successfully", successDetails);
         if (log.isDebugEnabled()) {
           log.debug("Successfully sent {} traces to the API", messageCount);
         }
         healthMetrics.onSend(messageCount, sizeInBytes, response);
       } else {
         // Antithesis: Track failed sends
-        Assert.sometimes(
-            true,
-            "traces_failed_to_send",
-            java.util.Map.of(
-                "decision", "dropped_send_failed",
-                "trace_count", messageCount,
-                "payload_size_bytes", sizeInBytes,
-                "http_status", response.status(),
-                "has_exception", response.exception() != null
-            )
-        );
+        java.util.Map<String, Object> failedDetails = new java.util.HashMap<>();
+        failedDetails.put("decision", "dropped_send_failed");
+        failedDetails.put("trace_count", messageCount);
+        failedDetails.put("payload_size_bytes", sizeInBytes);
+        failedDetails.put("http_status", response.status());
+        failedDetails.put("has_exception", response.exception() != null);
+        Assert.sometimes(true, "traces_failed_to_send", failedDetails);
         if (log.isDebugEnabled()) {
           log.debug(
               "Failed to send {} traces of size {} bytes to the API", messageCount, sizeInBytes);
diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java
index d508bf86343..cecbd2c660a 100644
--- a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java
+++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java
@@ -70,14 +70,10 @@ protected RemoteWriter(
   public void write(final List<DDSpan> trace) {
     if (closed) {
       // Antithesis: Track traces dropped during shutdown
-      Assert.sometimes(
-          true,
-          "trace_dropped_writer_closed",
-          java.util.Map.of(
-              "decision", "dropped_shutdown",
-              "span_count", trace.size()
-          )
-      );
+      java.util.Map<String, Object> shutdownDetails = new java.util.HashMap<>();
+      shutdownDetails.put("decision", "dropped_shutdown");
+      shutdownDetails.put("span_count", trace.size());
+      Assert.sometimes(true, "trace_dropped_writer_closed", shutdownDetails);
       // We can't add events after shutdown otherwise it will never complete shutting down.
       log.debug("Dropped due to shutdown: {}", trace);
       handleDroppedTrace(trace);
@@ -91,16 +87,12 @@ public void write(final List<DDSpan> trace) {
     switch (traceProcessingWorker.publish(root, samplingPriority, trace)) {
       case ENQUEUED_FOR_SERIALIZATION:
         // Antithesis: Track traces enqueued for sending
-        Assert.sometimes(
-            true,
-            "trace_enqueued_for_send",
-            java.util.Map.of(
-                "decision", "enqueued",
-                "trace_id", root.getTraceId().toString(),
-                "span_count", trace.size(),
-                "sampling_priority", samplingPriority
-            )
-        );
+        java.util.Map<String, Object> enqueuedDetails = new java.util.HashMap<>();
+        enqueuedDetails.put("decision", "enqueued");
+        enqueuedDetails.put("trace_id", root.getTraceId().toString());
+        enqueuedDetails.put("span_count", trace.size());
+        enqueuedDetails.put("sampling_priority", samplingPriority);
+        Assert.sometimes(true, "trace_enqueued_for_send", enqueuedDetails);
         log.debug("Enqueued for serialization: {}", trace);
         healthMetrics.onPublish(trace, samplingPriority);
         break;
@@ -109,31 +101,23 @@ public void write(final List<DDSpan> trace) {
         break;
       case DROPPED_BY_POLICY:
         // Antithesis: Track traces dropped by policy
-        Assert.sometimes(
-            true,
-            "trace_dropped_by_policy",
-            java.util.Map.of(
-                "decision", "dropped_policy",
-                "trace_id", root.getTraceId().toString(),
-                "span_count", trace.size(),
-                "sampling_priority", samplingPriority
-            )
-        );
+        java.util.Map<String, Object> policyDetails = new java.util.HashMap<>();
+        policyDetails.put("decision", "dropped_policy");
+        policyDetails.put("trace_id", root.getTraceId().toString());
+        policyDetails.put("span_count", trace.size());
+        policyDetails.put("sampling_priority", samplingPriority);
+        Assert.sometimes(true, "trace_dropped_by_policy", policyDetails);
         log.debug("Dropped by the policy: {}", trace);
         handleDroppedTrace(trace);
         break;
       case DROPPED_BUFFER_OVERFLOW:
         // Antithesis: Track traces dropped due to buffer overflow
-        Assert.sometimes(
-            true,
-            "trace_dropped_buffer_overflow",
-            java.util.Map.of(
-                "decision", "dropped_buffer_overflow",
-                "trace_id", root.getTraceId().toString(),
-                "span_count", trace.size(),
-                "sampling_priority", samplingPriority
-            )
-        );
+        java.util.Map<String, Object> overflowDetails = new java.util.HashMap<>();
+        overflowDetails.put("decision", "dropped_buffer_overflow");
+        overflowDetails.put("trace_id", root.getTraceId().toString());
+        overflowDetails.put("span_count", trace.size());
+        overflowDetails.put("sampling_priority", samplingPriority);
+        Assert.sometimes(true, "trace_dropped_buffer_overflow", overflowDetails);
         if (log.isDebugEnabled()) {
           log.debug("Dropped due to a buffer overflow: {}", trace);
         } else {
diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java
index a6b188d1224..71f7ae34e31 100644
--- a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java
+++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java
@@ -1249,29 +1249,21 @@ void write(final List<DDSpan> trace) {
       boolean published = forceKeep || traceCollector.sample(spanToSample);
       if (published) {
         // Antithesis: Track traces accepted by sampling
-        Assert.sometimes(
-            true,
-            "trace_accepted_by_sampling",
-            java.util.Map.of(
-                "decision", "accepted",
-                "trace_id", writtenTrace.get(0).getTraceId().toString(),
-                "span_count", writtenTrace.size(),
-                "sampling_priority", spanToSample.samplingPriority()
-            )
-        );
+        java.util.Map<String, Object> acceptedDetails = new java.util.HashMap<>();
+        acceptedDetails.put("decision", "accepted");
+        acceptedDetails.put("trace_id", writtenTrace.get(0).getTraceId().toString());
+        acceptedDetails.put("span_count", writtenTrace.size());
+        acceptedDetails.put("sampling_priority", spanToSample.samplingPriority());
+        Assert.sometimes(true, "trace_accepted_by_sampling", acceptedDetails);
         writer.write(writtenTrace);
       } else {
         // Antithesis: Track traces dropped by sampling
-        Assert.sometimes(
-            true,
-            "trace_dropped_by_sampling",
-            java.util.Map.of(
-                "decision", "dropped_sampling",
-                "trace_id", writtenTrace.get(0).getTraceId().toString(),
-                "span_count", writtenTrace.size(),
-                "sampling_priority", spanToSample.samplingPriority()
-            )
-        );
+        java.util.Map<String, Object> droppedDetails = new java.util.HashMap<>();
+        droppedDetails.put("decision", "dropped_sampling");
+        droppedDetails.put("trace_id", writtenTrace.get(0).getTraceId().toString());
+        droppedDetails.put("span_count", writtenTrace.size());
+        droppedDetails.put("sampling_priority", spanToSample.samplingPriority());
+        Assert.sometimes(true, "trace_dropped_by_sampling", droppedDetails);
         // with span streaming this won't work - it needs to be changed
         // to track an effective sampling rate instead, however, tests
         // checking that a hard reference on a continuation prevents

From e00a3ce7052104ed398b515bac5a83f6ba81d544 Mon Sep 17 00:00:00 2001
From: Roberto Montero
Date: Wed, 17 Dec 2025 16:43:19 +0100
Subject: [PATCH 14/15] Fix Antithesis SDK compatibility: Use Jackson ObjectNode instead of Map

The Antithesis SDK expects Jackson ObjectNode, not Map.
Replaced HashMap with ObjectNode using JsonNodeFactory.
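For reference, a minimal sketch of the resulting call shape (the event
name and values are illustrative):

    import com.antithesis.sdk.Assert;
    import com.fasterxml.jackson.databind.node.JsonNodeFactory;
    import com.fasterxml.jackson.databind.node.ObjectNode;

    ObjectNode details = JsonNodeFactory.instance.objectNode();
    details.put("decision", "enqueued"); // String overload
    details.put("span_count", 3);        // int overload
    Assert.sometimes(true, "illustrative_event", details);

Note that ObjectNode exposes typed put() overloads (String, int, boolean,
long, ...) rather than a generic put(String, Object), which is what forces
the OptionalInt unwrapping in the follow-up patch below.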
Fixed in:
- CoreTracer.java
- RemoteWriter.java
- PayloadDispatcherImpl.java
---
 .../trace/common/writer/PayloadDispatcherImpl.java     |  8 +++++---
 .../java/datadog/trace/common/writer/RemoteWriter.java | 10 ++++++----
 .../src/main/java/datadog/trace/core/CoreTracer.java   |  6 ++++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
index 64fb3c8fc27..7c196865ca7 100644
--- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
+++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
@@ -1,6 +1,8 @@
 package datadog.trace.common.writer;

 import com.antithesis.sdk.Assert;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 import datadog.communication.monitor.Monitoring;
 import datadog.communication.monitor.Recording;
 import datadog.communication.serialization.ByteBufferConsumer;
@@ -110,7 +112,7 @@ public void accept(int messageCount, ByteBuffer buffer) {
       healthMetrics.onSerialize(sizeInBytes);

       // Antithesis: Track all send attempts
-      java.util.Map<String, Object> sendAttemptDetails = new java.util.HashMap<>();
+      ObjectNode sendAttemptDetails = JsonNodeFactory.instance.objectNode();
       sendAttemptDetails.put("trace_count", messageCount);
       sendAttemptDetails.put("payload_size_bytes", sizeInBytes);
       sendAttemptDetails.put("dropped_traces_in_payload", payload.droppedTraces());
@@ -122,7 +124,7 @@ public void accept(int messageCount, ByteBuffer buffer) {

       if (response.success()) {
         // Antithesis: Track successful sends
-        java.util.Map<String, Object> successDetails = new java.util.HashMap<>();
+        ObjectNode successDetails = JsonNodeFactory.instance.objectNode();
         successDetails.put("decision", "sent_success");
         successDetails.put("trace_count", messageCount);
         successDetails.put("payload_size_bytes", sizeInBytes);
@@ -134,7 +136,7 @@ public void accept(int messageCount, ByteBuffer buffer) {
         healthMetrics.onSend(messageCount, sizeInBytes, response);
       } else {
         // Antithesis: Track failed sends
-        java.util.Map<String, Object> failedDetails = new java.util.HashMap<>();
+        ObjectNode failedDetails = JsonNodeFactory.instance.objectNode();
         failedDetails.put("decision", "dropped_send_failed");
         failedDetails.put("trace_count", messageCount);
         failedDetails.put("payload_size_bytes", sizeInBytes);
diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java
index cecbd2c660a..6c21f320468 100644
--- a/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java
+++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/RemoteWriter.java
@@ -4,6 +4,8 @@
 import static java.util.concurrent.TimeUnit.MINUTES;

 import com.antithesis.sdk.Assert;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 import datadog.trace.core.DDSpan;
 import datadog.trace.core.monitor.HealthMetrics;
 import datadog.trace.relocate.api.RatelimitedLogger;
@@ -70,7 +72,7 @@ protected RemoteWriter(
   public void write(final List<DDSpan> trace) {
     if (closed) {
       // Antithesis: Track traces dropped during shutdown
-      java.util.Map<String, Object> shutdownDetails = new java.util.HashMap<>();
+      ObjectNode shutdownDetails = JsonNodeFactory.instance.objectNode();
       shutdownDetails.put("decision", "dropped_shutdown");
       shutdownDetails.put("span_count", trace.size());
      Assert.sometimes(true, "trace_dropped_writer_closed", shutdownDetails);
 
@@ -87,7 +89,7 @@ public void write(final List<DDSpan> trace) {
     switch (traceProcessingWorker.publish(root, samplingPriority, trace)) {
       case ENQUEUED_FOR_SERIALIZATION:
         // Antithesis: Track traces enqueued for sending
-        java.util.Map enqueuedDetails = new java.util.HashMap<>();
+        ObjectNode enqueuedDetails = JsonNodeFactory.instance.objectNode();
         enqueuedDetails.put("decision", "enqueued");
         enqueuedDetails.put("trace_id", root.getTraceId().toString());
         enqueuedDetails.put("span_count", trace.size());
@@ -101,7 +103,7 @@ public void write(final List<DDSpan> trace) {
         break;
       case DROPPED_BY_POLICY:
         // Antithesis: Track traces dropped by policy
-        java.util.Map policyDetails = new java.util.HashMap<>();
+        ObjectNode policyDetails = JsonNodeFactory.instance.objectNode();
         policyDetails.put("decision", "dropped_policy");
         policyDetails.put("trace_id", root.getTraceId().toString());
         policyDetails.put("span_count", trace.size());
@@ -112,7 +114,7 @@ public void write(final List<DDSpan> trace) {
         break;
       case DROPPED_BUFFER_OVERFLOW:
         // Antithesis: Track traces dropped due to buffer overflow
-        java.util.Map overflowDetails = new java.util.HashMap<>();
+        ObjectNode overflowDetails = JsonNodeFactory.instance.objectNode();
         overflowDetails.put("decision", "dropped_buffer_overflow");
         overflowDetails.put("trace_id", root.getTraceId().toString());
         overflowDetails.put("span_count", trace.size());
diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java
index 71f7ae34e31..21c5f029e74 100644
--- a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java
+++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java
@@ -20,6 +20,8 @@
 import static java.util.concurrent.TimeUnit.SECONDS;
 
 import com.antithesis.sdk.Assert;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 import datadog.communication.ddagent.DDAgentFeaturesDiscovery;
 import datadog.communication.ddagent.ExternalAgentLauncher;
@@ -1249,7 +1251,7 @@ void write(final List<DDSpan> trace) {
       boolean published = forceKeep || traceCollector.sample(spanToSample);
       if (published) {
         // Antithesis: Track traces accepted by sampling
-        java.util.Map acceptedDetails = new java.util.HashMap<>();
+        ObjectNode acceptedDetails = JsonNodeFactory.instance.objectNode();
         acceptedDetails.put("decision", "accepted");
         acceptedDetails.put("trace_id", writtenTrace.get(0).getTraceId().toString());
         acceptedDetails.put("span_count", writtenTrace.size());
@@ -1258,7 +1260,7 @@ void write(final List<DDSpan> trace) {
         writer.write(writtenTrace);
       } else {
         // Antithesis: Track traces dropped by sampling
-        java.util.Map droppedDetails = new java.util.HashMap<>();
+        ObjectNode droppedDetails = JsonNodeFactory.instance.objectNode();
         droppedDetails.put("decision", "dropped_sampling");
         droppedDetails.put("trace_id", writtenTrace.get(0).getTraceId().toString());
         droppedDetails.put("span_count", writtenTrace.size());

From 18d42b5fc3821d137953913934c0aabeb70e7482 Mon Sep 17 00:00:00 2001
From: Roberto Montero
Date: Wed, 17 Dec 2025 17:04:02 +0100
Subject: [PATCH 15/15] Fix OptionalInt extraction for HTTP status in
 Antithesis assertions
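Jackson's ObjectNode has no put overload that accepts an arbitrary Object,
so the OptionalInt returned by Response.status() has to be unwrapped before
it can be stored in the assertion details; -1 serves as the sentinel when no
HTTP status is available. A minimal standalone sketch of the extraction
(the class and local names are illustrative only):

    import java.util.OptionalInt;

    class OptionalIntExtractionSketch {
      public static void main(String[] args) {
        // Empty models a request that produced no HTTP response,
        // e.g. a connection failure.
        OptionalInt status = OptionalInt.empty();
        int httpStatus = status.orElse(-1); // status code if present, else -1
        System.out.println(httpStatus);     // -> -1
      }
    }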
---
 .../datadog/trace/common/writer/PayloadDispatcherImpl.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
index 7c196865ca7..a9c8710b0f5 100644
--- a/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
+++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/PayloadDispatcherImpl.java
@@ -128,7 +128,7 @@ public void accept(int messageCount, ByteBuffer buffer) {
         successDetails.put("decision", "sent_success");
         successDetails.put("trace_count", messageCount);
         successDetails.put("payload_size_bytes", sizeInBytes);
-        successDetails.put("http_status", response.status());
+        successDetails.put("http_status", response.status().orElse(-1));
         Assert.sometimes(true, "traces_sent_successfully", successDetails);
         if (log.isDebugEnabled()) {
           log.debug("Successfully sent {} traces to the API", messageCount);
@@ -140,7 +140,7 @@ public void accept(int messageCount, ByteBuffer buffer) {
         failedDetails.put("decision", "dropped_send_failed");
         failedDetails.put("trace_count", messageCount);
         failedDetails.put("payload_size_bytes", sizeInBytes);
-        failedDetails.put("http_status", response.status());
+        failedDetails.put("http_status", response.status().orElse(-1));
         failedDetails.put("has_exception", response.exception() != null);
         Assert.sometimes(true, "traces_failed_to_send", failedDetails);
         if (log.isDebugEnabled()) {