Skip to content

Commit b912475

Browse files
amarzialimhlidd
authored andcommitted
Defer metrics aggregator classloading to save startup time (#9590)
* Revert "Lazily obtain feature discovery when starting client metrics (#9548)" This reverts commit 561eb5e. * Defer metrics aggregator classloading to save startup time
1 parent eac9cb2 commit b912475

File tree

8 files changed

+75
-96
lines changed

8 files changed

+75
-96
lines changed

dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/benchmark/StaticEventLogger.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ private static String getAgentVersion() {
9797
for (int c = reader.read(); c != -1; c = reader.read()) {
9898
sb.append((char) c);
9999
}
100-
} catch (IOException e) {
100+
} catch (Throwable ignored) {
101101
// swallow exception
102102
return null;
103103
}

dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import static java.util.concurrent.TimeUnit.SECONDS;
55

66
import datadog.communication.ddagent.DDAgentFeaturesDiscovery;
7-
import datadog.communication.ddagent.SharedCommunicationObjects;
87
import datadog.communication.monitor.Monitoring;
98
import datadog.trace.api.WellKnownTags;
109
import datadog.trace.core.CoreSpan;
@@ -33,25 +32,20 @@
3332
@OutputTimeUnit(MICROSECONDS)
3433
@Fork(value = 1)
3534
public class ConflatingMetricsAggregatorBenchmark {
36-
private final SharedCommunicationObjects sco = new SharedCommunicationObjects();
37-
private final ConflatingMetricsAggregator aggregator;
35+
private final DDAgentFeaturesDiscovery featuresDiscovery =
36+
new FixedAgentFeaturesDiscovery(
37+
Collections.singleton("peer.hostname"), Collections.emptySet());
38+
private final ConflatingMetricsAggregator aggregator =
39+
new ConflatingMetricsAggregator(
40+
new WellKnownTags("", "", "", "", "", ""),
41+
Collections.emptySet(),
42+
featuresDiscovery,
43+
HealthMetrics.NO_OP,
44+
new NullSink(),
45+
2048,
46+
2048);
3847
private final List<CoreSpan<?>> spans = generateTrace(64);
3948

40-
public ConflatingMetricsAggregatorBenchmark() {
41-
sco.setFeaturesDiscovery(
42-
new FixedAgentFeaturesDiscovery(
43-
Collections.singleton("peer.hostname"), Collections.emptySet()));
44-
aggregator =
45-
new ConflatingMetricsAggregator(
46-
new WellKnownTags("", "", "", "", "", ""),
47-
Collections.emptySet(),
48-
sco,
49-
HealthMetrics.NO_OP,
50-
new NullSink(),
51-
2048,
52-
2048);
53-
}
54-
5549
static List<CoreSpan<?>> generateTrace(int len) {
5650
final List<CoreSpan<?>> trace = new ArrayList<>();
5751
for (int i = 0; i < len; i++) {

dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,7 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve
9898
private final Aggregator aggregator;
9999
private final long reportingInterval;
100100
private final TimeUnit reportingIntervalTimeUnit;
101-
private final SharedCommunicationObjects sharedCommunicationObjects;
102-
private volatile DDAgentFeaturesDiscovery features;
101+
private final DDAgentFeaturesDiscovery features;
103102
private final HealthMetrics healthMetrics;
104103

105104
private volatile AgentTaskScheduler.Scheduled<?> cancellation;
@@ -111,7 +110,7 @@ public ConflatingMetricsAggregator(
111110
this(
112111
config.getWellKnownTags(),
113112
config.getMetricsIgnoredResources(),
114-
sharedCommunicationObjects,
113+
sharedCommunicationObjects.featuresDiscovery(config),
115114
healthMetrics,
116115
new OkHttpSink(
117116
sharedCommunicationObjects.okHttpClient,
@@ -127,15 +126,15 @@ public ConflatingMetricsAggregator(
127126
ConflatingMetricsAggregator(
128127
WellKnownTags wellKnownTags,
129128
Set<String> ignoredResources,
130-
SharedCommunicationObjects sharedCommunicationObjects,
129+
DDAgentFeaturesDiscovery features,
131130
HealthMetrics healthMetric,
132131
Sink sink,
133132
int maxAggregates,
134133
int queueSize) {
135134
this(
136135
wellKnownTags,
137136
ignoredResources,
138-
sharedCommunicationObjects,
137+
features,
139138
healthMetric,
140139
sink,
141140
maxAggregates,
@@ -147,7 +146,7 @@ public ConflatingMetricsAggregator(
147146
ConflatingMetricsAggregator(
148147
WellKnownTags wellKnownTags,
149148
Set<String> ignoredResources,
150-
SharedCommunicationObjects sharedCommunicationObjects,
149+
DDAgentFeaturesDiscovery features,
151150
HealthMetrics healthMetric,
152151
Sink sink,
153152
int maxAggregates,
@@ -156,7 +155,7 @@ public ConflatingMetricsAggregator(
156155
TimeUnit timeUnit) {
157156
this(
158157
ignoredResources,
159-
sharedCommunicationObjects,
158+
features,
160159
healthMetric,
161160
sink,
162161
new SerializingMetricWriter(wellKnownTags, sink),
@@ -168,7 +167,7 @@ public ConflatingMetricsAggregator(
168167

169168
ConflatingMetricsAggregator(
170169
Set<String> ignoredResources,
171-
SharedCommunicationObjects sharedCommunicationObjects,
170+
DDAgentFeaturesDiscovery features,
172171
HealthMetrics healthMetric,
173172
Sink sink,
174173
MetricWriter metricWriter,
@@ -181,7 +180,7 @@ public ConflatingMetricsAggregator(
181180
this.batchPool = new SpmcArrayQueue<>(maxAggregates);
182181
this.pending = new NonBlockingHashMap<>(maxAggregates * 4 / 3);
183182
this.keys = new NonBlockingHashMap<>();
184-
this.sharedCommunicationObjects = sharedCommunicationObjects;
183+
this.features = features;
185184
this.healthMetrics = healthMetric;
186185
this.sink = sink;
187186
this.aggregator =
@@ -199,18 +198,6 @@ public ConflatingMetricsAggregator(
199198
this.reportingIntervalTimeUnit = timeUnit;
200199
}
201200

202-
private DDAgentFeaturesDiscovery featuresDiscovery() {
203-
DDAgentFeaturesDiscovery ret = features;
204-
if (ret != null) {
205-
return ret;
206-
}
207-
// no need to synchronise here since it's already done in sharedCommunicationObject.
208-
// At worst, we'll assign multiple time the variable but it will be the same object
209-
ret = sharedCommunicationObjects.featuresDiscovery(Config.get());
210-
features = ret;
211-
return ret;
212-
}
213-
214201
@Override
215202
public void start() {
216203
sink.register(this);
@@ -226,6 +213,13 @@ public void start() {
226213
log.debug("started metrics aggregator");
227214
}
228215

216+
private boolean isMetricsEnabled() {
217+
if (features.getMetricsEndpoint() == null) {
218+
features.discoverIfOutdated();
219+
}
220+
return features.supportsMetrics();
221+
}
222+
229223
@Override
230224
public boolean report() {
231225
boolean published;
@@ -242,7 +236,8 @@ public boolean report() {
242236

243237
@Override
244238
public Future<Boolean> forceReport() {
245-
if (!featuresDiscovery().supportsMetrics()) {
239+
// Ensure the feature is enabled
240+
if (!isMetricsEnabled()) {
246241
return CompletableFuture.completedFuture(false);
247242
}
248243
// Wait for the thread to start
@@ -278,7 +273,6 @@ public Future<Boolean> forceReport() {
278273
public boolean publish(List<? extends CoreSpan<?>> trace) {
279274
boolean forceKeep = false;
280275
int counted = 0;
281-
final DDAgentFeaturesDiscovery features = featuresDiscovery();
282276
if (features.supportsMetrics()) {
283277
for (CoreSpan<?> span : trace) {
284278
boolean isTopLevel = span.isTopLevel();
@@ -289,7 +283,7 @@ public boolean publish(List<? extends CoreSpan<?>> trace) {
289283
break;
290284
}
291285
counted++;
292-
forceKeep |= publish(span, isTopLevel, features);
286+
forceKeep |= publish(span, isTopLevel);
293287
}
294288
}
295289
healthMetrics.onClientStatTraceComputed(
@@ -311,7 +305,7 @@ private boolean spanKindEligible(CoreSpan<?> span) {
311305
return spanKind != null && ELIGIBLE_SPAN_KINDS_FOR_METRICS.contains(spanKind.toString());
312306
}
313307

314-
private boolean publish(CoreSpan<?> span, boolean isTopLevel, DDAgentFeaturesDiscovery features) {
308+
private boolean publish(CoreSpan<?> span, boolean isTopLevel) {
315309
final CharSequence spanKind = span.getTag(SPAN_KIND, "");
316310
MetricKey newKey =
317311
new MetricKey(
@@ -324,7 +318,7 @@ private boolean publish(CoreSpan<?> span, boolean isTopLevel, DDAgentFeaturesDis
324318
span.getParentId() == 0,
325319
SPAN_KINDS.computeIfAbsent(
326320
spanKind, UTF8BytesString::create), // save repeated utf8 conversions
327-
getPeerTags(span, spanKind.toString(), features));
321+
getPeerTags(span, spanKind.toString()));
328322
boolean isNewKey = false;
329323
MetricKey key = keys.putIfAbsent(newKey, newKey);
330324
if (null == key) {
@@ -359,8 +353,7 @@ private boolean publish(CoreSpan<?> span, boolean isTopLevel, DDAgentFeaturesDis
359353
return isNewKey || span.getError() > 0;
360354
}
361355

362-
private List<UTF8BytesString> getPeerTags(
363-
CoreSpan<?> span, String spanKind, DDAgentFeaturesDiscovery features) {
356+
private List<UTF8BytesString> getPeerTags(CoreSpan<?> span, String spanKind) {
364357
if (ELIGIBLE_SPAN_KINDS_FOR_PEER_AGGREGATION.contains(spanKind)) {
365358
List<UTF8BytesString> peerTags = new ArrayList<>();
366359
for (String peerTag : features.peerTags()) {
@@ -424,7 +417,8 @@ public void onEvent(EventType eventType, String message) {
424417
switch (eventType) {
425418
case DOWNGRADED:
426419
log.debug("Agent downgrade was detected");
427-
AgentTaskScheduler.get().execute(this::disable);
420+
disable();
421+
healthMetrics.onClientStatDowngraded();
428422
break;
429423
case BAD_PAYLOAD:
430424
log.debug("bad metrics payload sent to trace agent: {}", message);
@@ -440,11 +434,9 @@ public void onEvent(EventType eventType, String message) {
440434
}
441435

442436
private void disable() {
443-
final DDAgentFeaturesDiscovery features = featuresDiscovery();
444437
features.discover();
445438
if (!features.supportsMetrics()) {
446439
log.debug("Disabling metric reporting because an agent downgrade was detected");
447-
healthMetrics.onClientStatDowngraded();
448440
this.pending.clear();
449441
this.batchPool.clear();
450442
this.inbox.clear();

dd-trace-core/src/main/java/datadog/trace/common/metrics/NoOpMetricsAggregator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
public final class NoOpMetricsAggregator implements MetricsAggregator {
1111

12-
static final NoOpMetricsAggregator INSTANCE = new NoOpMetricsAggregator();
12+
public static final NoOpMetricsAggregator INSTANCE = new NoOpMetricsAggregator();
1313

1414
@Override
1515
public void start() {}

dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
import datadog.trace.civisibility.interceptor.CiVisibilityTraceInterceptor;
7373
import datadog.trace.common.GitMetadataTraceInterceptor;
7474
import datadog.trace.common.metrics.MetricsAggregator;
75+
import datadog.trace.common.metrics.NoOpMetricsAggregator;
7576
import datadog.trace.common.sampling.Sampler;
7677
import datadog.trace.common.sampling.SingleSpanSampler;
7778
import datadog.trace.common.sampling.SpanSamplingRules;
@@ -182,7 +183,7 @@ public static CoreTracerBuilder builder() {
182183
/** Scope manager is in charge of managing the scopes from which spans are created */
183184
final ContinuableScopeManager scopeManager;
184185

185-
final MetricsAggregator metricsAggregator;
186+
volatile MetricsAggregator metricsAggregator;
186187

187188
/** Initial static configuration associated with the tracer. */
188189
final Config initialConfig;
@@ -783,16 +784,26 @@ private CoreTracer(
783784
pendingTraceBuffer.start();
784785

785786
sharedCommunicationObjects.whenReady(this.writer::start);
786-
787-
metricsAggregator =
788-
createMetricsAggregator(config, sharedCommunicationObjects, this.healthMetrics);
789-
// Schedule the metrics aggregator to begin reporting after a random delay of 1 to 10 seconds
790-
// (using milliseconds granularity.) This avoids a fleet of traced applications starting at the
791-
// same time from sending metrics in sync.
787+
// temporary assign a no-op instance. The final one will be resolved when the discovery will be
788+
// allowed
789+
metricsAggregator = NoOpMetricsAggregator.INSTANCE;
790+
final SharedCommunicationObjects sco = sharedCommunicationObjects;
791+
// asynchronously create the aggregator to avoid triggering expensive classloading during the
792+
// tracer initialisation.
792793
sharedCommunicationObjects.whenReady(
793794
() ->
794795
AgentTaskScheduler.get()
795-
.scheduleWithJitter(MetricsAggregator::start, metricsAggregator, 1, SECONDS));
796+
.execute(
797+
() -> {
798+
metricsAggregator = createMetricsAggregator(config, sco, this.healthMetrics);
799+
// Schedule the metrics aggregator to begin reporting after a random delay of
800+
// 1 to 10 seconds (using milliseconds granularity.)
801+
// This avoids a fleet of traced applications starting at the same time from
802+
// sending metrics in sync.
803+
AgentTaskScheduler.get()
804+
.scheduleWithJitter(
805+
MetricsAggregator::start, metricsAggregator, 1, SECONDS);
806+
}));
796807

797808
if (dataStreamsMonitoring == null) {
798809
this.dataStreamsMonitoring =

0 commit comments

Comments
 (0)