Skip to content

Commit b26c3c2

Browse files
committed
save
1 parent 0b20f03 commit b26c3c2

File tree

3 files changed

+42
-2
lines changed

3 files changed

+42
-2
lines changed

be/src/agent/task_worker_pool.cpp

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ std::mutex s_task_signatures_mtx;
111111
std::unordered_map<TTaskType::type, std::unordered_set<int64_t>> s_task_signatures;
112112

113113
std::atomic_ulong s_report_version(time(nullptr) * 100000);
114+
// Wall-clock time (seconds, as returned by time(nullptr)) at which the current
// uninterrupted run of failed or skipped tablet reports began. 0 means no such
// run is in progress; report_tablet_callback resets it to 0 on a successful report.
std::atomic<int64_t> s_tablet_report_failure_start_time(0);
114115

115116
void increase_report_version() {
116117
s_report_version.fetch_add(1, std::memory_order_relaxed);
@@ -1181,13 +1182,20 @@ void report_tablet_callback(StorageEngine& engine, const ClusterInfo* cluster_in
11811182
}
11821183
}
11831184

1184-
if (report_version < s_report_version) {
1185+
if (report_version < s_report_version || UNLIKELY(config::enable_debug_points &&
1186+
DebugPoints::instance()->is_enable("WorkPoolReportTablet.report_tablet_callback.skip"))) {
11851187
// TODO llj This can only reduce the possibility for report error, but can't avoid it.
11861188
// If FE create a tablet in FE meta and send CREATE task to this BE, the tablet may not be included in this
11871189
// report, and the report version has a small probability that it has not been updated in time. When FE
11881190
// receives this report, it is possible to delete the new tablet.
11891191
LOG(WARNING) << "report version " << report_version << " change to " << s_report_version;
11901192
DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1);
1193+
int64_t current_time = time(nullptr);
1194+
if (s_tablet_report_failure_start_time.load() == 0) {
1195+
s_tablet_report_failure_start_time.store(current_time);
1196+
}
1197+
DorisMetrics::instance()->tablet_report_continuous_failure_duration_s->set_value(
1198+
current_time - s_tablet_report_failure_start_time);
11911199
return;
11921200
}
11931201

@@ -1227,6 +1235,15 @@ void report_tablet_callback(StorageEngine& engine, const ClusterInfo* cluster_in
12271235
report_tablet_total << 1;
12281236
if (!succ) [[unlikely]] {
12291237
report_tablet_failed << 1;
1238+
int64_t current_time = time(nullptr);
1239+
if (s_tablet_report_failure_start_time.load() == 0) {
1240+
s_tablet_report_failure_start_time.store(current_time);
1241+
}
1242+
DorisMetrics::instance()->tablet_report_continuous_failure_duration_s->set_value(
1243+
current_time - s_tablet_report_failure_start_time);
1244+
} else {
1245+
s_tablet_report_failure_start_time.store(0);
1246+
DorisMetrics::instance()->tablet_report_continuous_failure_duration_s->set_value(0);
12301247
}
12311248
}
12321249

@@ -1254,9 +1271,16 @@ void report_tablet_callback(CloudStorageEngine& engine, const ClusterInfo* clust
12541271
}
12551272
}
12561273

1257-
if (report_version < s_report_version) {
1274+
if (report_version < s_report_version || UNLIKELY(config::enable_debug_points &&
1275+
DebugPoints::instance()->is_enable("WorkPoolCloudReportTablet.report_tablet_callback.skip"))) {
12581276
LOG(WARNING) << "report version " << report_version << " change to " << s_report_version;
12591277
DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1);
1278+
int64_t current_time = time(nullptr);
1279+
if (s_tablet_report_failure_start_time.load() == 0) {
1280+
s_tablet_report_failure_start_time.store(current_time);
1281+
}
1282+
DorisMetrics::instance()->tablet_report_continuous_failure_duration_s->set_value(
1283+
current_time - s_tablet_report_failure_start_time);
12601284
return;
12611285
}
12621286

@@ -1267,6 +1291,15 @@ void report_tablet_callback(CloudStorageEngine& engine, const ClusterInfo* clust
12671291
report_tablet_total << 1;
12681292
if (!succ) [[unlikely]] {
12691293
report_tablet_failed << 1;
1294+
int64_t current_time = time(nullptr);
1295+
if (s_tablet_report_failure_start_time.load() == 0) {
1296+
s_tablet_report_failure_start_time.store(current_time);
1297+
}
1298+
DorisMetrics::instance()->tablet_report_continuous_failure_duration_s->set_value(
1299+
current_time - s_tablet_report_failure_start_time);
1300+
} else {
1301+
s_tablet_report_failure_start_time.store(0);
1302+
DorisMetrics::instance()->tablet_report_continuous_failure_duration_s->set_value(0);
12701303
}
12711304
}
12721305

be/src/util/doris_metrics.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,8 @@ DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(runtime_filter_consumer_timeout_num, Met
250250
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(get_remote_tablet_slow_time_ms, MetricUnit::MILLISECONDS);
251251
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(get_remote_tablet_slow_cnt, MetricUnit::NOUNIT);
252252

253+
// Seconds elapsed since the first failed or skipped tablet report in the current
// consecutive run; report_tablet_callback sets it back to 0 when a report succeeds.
DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(tablet_report_continuous_failure_duration_s, MetricUnit::SECONDS);
254+
253255
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(ann_index_load_costs_ms, MetricUnit::MILLISECONDS);
254256
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(ann_index_load_cnt, MetricUnit::NOUNIT);
255257
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(ann_index_search_costs_ms, MetricUnit::MILLISECONDS);
@@ -419,6 +421,8 @@ DorisMetrics::DorisMetrics() : _metric_registry(_s_registry_name) {
419421
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, get_remote_tablet_slow_time_ms);
420422
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, get_remote_tablet_slow_cnt);
421423

424+
INT_GAUGE_METRIC_REGISTER(_server_metric_entity, tablet_report_continuous_failure_duration_s);
425+
422426
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, pipeline_task_queue_size);
423427

424428
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, ann_index_load_costs_ms);

be/src/util/doris_metrics.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,9 @@ class DorisMetrics {
155155
IntGauge* tablet_cumulative_max_compaction_score = nullptr;
156156
IntGauge* tablet_base_max_compaction_score = nullptr;
157157

158+
// tablet report
159+
// Duration in seconds of the current consecutive run of failed or skipped
// tablet reports; 0 after a successful report.
IntGauge* tablet_report_continuous_failure_duration_s = nullptr;
160+
158161
IntGauge* all_rowsets_num = nullptr;
159162
IntGauge* all_segments_num = nullptr;
160163

0 commit comments

Comments
 (0)