From 88a626b08b9a9a6c99751bb871fd1ece921a7c84 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 20 Sep 2024 23:46:38 +0200 Subject: [PATCH 01/13] POC: Cost Attribution --- cmd/mimir/config-descriptor.json | 22 +++++ cmd/mimir/help-all.txt.tmpl | 4 + pkg/distributor/distributor.go | 22 +++-- .../distributor_ingest_storage_test.go | 2 +- pkg/distributor/distributor_test.go | 13 ++- pkg/distributor/validate.go | 1 + .../activeseries/active_labels_test.go | 2 +- .../active_native_histogram_postings_test.go | 13 ++- .../activeseries/active_postings_test.go | 9 +- .../activeseries/active_series_test.go | 29 +++--- pkg/ingester/ingester.go | 46 +++++++-- pkg/ingester/ingester_ingest_storage_test.go | 2 +- pkg/ingester/ingester_test.go | 98 +++++++++---------- pkg/ingester/metrics.go | 4 +- pkg/mimir/mimir.go | 17 ++-- pkg/mimir/modules.go | 1 + pkg/util/validation/separate_metrics.go | 19 ++++ 17 files changed, 198 insertions(+), 106 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index a7ff4d8a459..cbfd95cb2ca 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3984,6 +3984,28 @@ "fieldType": "string", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_label", + "required": false, + "desc": "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'cost' label with cost label's value. Currently applies to the following metrics: cortex_discarded_samples_total", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "validation.cost-attribution-label", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_per_user", + "required": false, + "desc": "Maximum number of cost attributions per user. 0 to disable the limit.", + "fieldValue": null, + "fieldDefaultValue": 200, + "fieldFlag": "validation.max-cost-attribution-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "max_fetched_chunks_per_query", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 5dc7e93cca9..6bd0e395c03 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -3059,10 +3059,14 @@ Usage of ./cmd/mimir/mimir: Enable anonymous usage reporting. (default true) -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") + -validation.cost-attribution-label string + [experimental] Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'cost' label with cost label's value. Currently applies to the following metrics: cortex_discarded_samples_total -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. 
(default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) + -validation.max-cost-attribution-per-user int + [experimental] Maximum number of cost attributions per user. 0 to disable the limit. (default 200) -validation.max-label-names-per-series int Maximum number of label names per series. (default 30) -validation.max-length-label-name int diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 59d9161147b..5434f295cc4 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -355,7 +355,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove receivedSamples: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_distributor_received_samples_total", Help: "The total number of received samples, excluding rejected and deduped samples.", - }, []string{"user"}), + }, []string{"user", "attrib"}), receivedExemplars: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_distributor_received_exemplars_total", Help: "The total number of received exemplars, excluding rejected and deduped exemplars.", @@ -643,7 +643,7 @@ func (d *Distributor) cleanupInactiveUser(userID string) { d.HATracker.cleanupHATrackerMetricsForUser(userID) d.receivedRequests.DeleteLabelValues(userID) - d.receivedSamples.DeleteLabelValues(userID) + d.receivedSamples.DeletePartialMatch(prometheus.Labels{"user": userID}) d.receivedExemplars.DeleteLabelValues(userID) d.receivedMetadata.DeleteLabelValues(userID) d.incomingRequests.DeleteLabelValues(userID) @@ -1432,7 +1432,7 @@ func (d *Distributor) push(ctx context.Context, pushReq *Request) error { return err } - d.updateReceivedMetrics(req, userID) + d.updateReceivedMetrics(req, userID, d.limits.CostAttributionLabel(userID)) if len(req.Timeseries) == 0 && len(req.Metadata) == 0 { return nil @@ -1663,15 +1663,25 @@ func tokenForMetadata(userID string, metricName string) uint32 { return mimirpb.ShardByMetricName(userID, metricName) } -func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { +func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string, costAttributionLabel string) { var receivedSamples, receivedExemplars, receivedMetadata int + costAttribution := make(map[string]int) for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) + if costAttributionLabel != "" { + attribution := mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel) + costAttribution[attribution]++ + } } receivedMetadata = len(req.Metadata) - - d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) + if costAttributionLabel != "" { + for lv, count := range costAttribution { + d.receivedSamples.WithLabelValues(userID, lv).Add(float64(count)) + } + } else { + d.receivedSamples.WithLabelValues(userID, "").Add(float64(receivedSamples)) + } d.receivedExemplars.WithLabelValues(userID).Add(float64(receivedExemplars)) d.receivedMetadata.WithLabelValues(userID).Add(float64(receivedMetadata)) } diff --git a/pkg/distributor/distributor_ingest_storage_test.go b/pkg/distributor/distributor_ingest_storage_test.go index e1c7e2be21f..091d6f0225d 100644 --- a/pkg/distributor/distributor_ingest_storage_test.go +++ b/pkg/distributor/distributor_ingest_storage_test.go @@ -209,7 +209,7 @@ func TestDistributor_Push_ShouldSupportIngestStorage(t *testing.T) { # HELP 
cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{user="user"} 5 + cortex_distributor_received_samples_total{attrib="", user="user"} 5 # HELP cortex_distributor_metadata_in_total The total number of metadata the have come in to the distributor, including rejected. # TYPE cortex_distributor_metadata_in_total counter diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index e71fbf24762..66760053d35 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -363,8 +363,8 @@ func TestDistributor_MetricsCleanup(t *testing.T) { "cortex_distributor_label_values_with_newlines_total", } - d.receivedSamples.WithLabelValues("userA").Add(5) - d.receivedSamples.WithLabelValues("userB").Add(10) + d.receivedSamples.WithLabelValues("userA", "").Add(5) + d.receivedSamples.WithLabelValues("userB", "").Add(10) d.receivedExemplars.WithLabelValues("userA").Add(5) d.receivedExemplars.WithLabelValues("userB").Add(10) d.receivedMetadata.WithLabelValues("userA").Add(5) @@ -401,8 +401,8 @@ func TestDistributor_MetricsCleanup(t *testing.T) { # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{user="userA"} 5 - cortex_distributor_received_samples_total{user="userB"} 10 + cortex_distributor_received_samples_total{attrib="",user="userA"} 5 + cortex_distributor_received_samples_total{attrib="",user="userB"} 10 # HELP cortex_distributor_received_exemplars_total The total number of received exemplars, excluding rejected and deduped exemplars. # TYPE cortex_distributor_received_exemplars_total counter @@ -443,7 +443,7 @@ func TestDistributor_MetricsCleanup(t *testing.T) { # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{user="userB"} 10 + cortex_distributor_received_samples_total{attrib="",user="userB"} 10 # HELP cortex_distributor_received_exemplars_total The total number of received exemplars, excluding rejected and deduped exemplars. # TYPE cortex_distributor_received_exemplars_total counter @@ -6852,7 +6852,7 @@ func TestDistributor_MetricsWithRequestModifications(t *testing.T) { cortex_distributor_received_requests_total{user="%s"} %d # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{user="%s"} %d + cortex_distributor_received_samples_total{attrib="",user="%s"} %d # HELP cortex_distributor_received_exemplars_total The total number of received exemplars, excluding rejected and deduped exemplars. 
# TYPE cortex_distributor_received_exemplars_total counter cortex_distributor_received_exemplars_total{user="%s"} %d @@ -6953,7 +6953,6 @@ func TestDistributor_MetricsWithRequestModifications(t *testing.T) { exemplarsIn: 10, metadataIn: 10, receivedRequests: 1, - receivedSamples: 0, receivedExemplars: 0, receivedMetadata: 10}) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index db15559c70d..2e3dc472f3c 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -370,6 +370,7 @@ func removeNonASCIIChars(in string) (out string) { // The returned error may retain the provided series labels. func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelNameValidation bool) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) + if err != nil { m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index aa7f928d7dd..fe3c8e4f78a 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") memPostings := index.NewMemPostings() for i, l := range series { diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 665f5787c61..46d8da9c089 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,7 +26,8 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -62,7 +63,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -106,7 +107,8 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") + // Update each series at a different time according to its index. 
for i := range allStorageRefs { buckets := i * 10 @@ -145,7 +147,8 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 @@ -181,7 +184,7 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index a2345841d11..9382ed824e3 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,7 +26,8 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") + // Update each series at a different time according to its index. for i := range allStorageRefs { activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) @@ -57,7 +58,8 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") + // Update each series at a different time according to its index. for i := range allStorageRefs { activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) @@ -88,7 +90,8 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") + // Update each series at a different time according to its index. for i := range allStorageRefs { activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index cf821c5bca5..ebe7819bf93 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -38,8 +38,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. 
- c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") valid := c.Purge(time.Now()) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() @@ -204,7 +203,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { @@ -231,7 +230,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "") testUpdateSeries(t, c) } @@ -448,7 +447,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "") testUpdateSeries(t, c) c.Clear() @@ -489,7 +488,7 @@ func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") c.UpdateSeries(ls1, ref1, time.Now(), -1) c.UpdateSeries(ls2, ref2, time.Now(), -1) @@ -517,7 +516,7 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") for i := 0; i < len(series); i++ { c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) @@ -563,7 +562,7 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute) + c := NewActiveSeries(asm, 5*time.Minute, "foo", "") exp := len(series) - ttl expMatchingSeries := 0 @@ -596,7 +595,7 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, "foo", "") c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) c.UpdateSeries(ls2, ref2, currentTime, -1) @@ -632,7 +631,7 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "") valid := c.Purge(currentTime) assert.True(t, valid) @@ -698,7 +697,7 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := 
NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "") valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -737,7 +736,7 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "") valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -790,7 +789,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. - c = NewActiveSeries(&asmodel.Matchers{}, 0) + c = NewActiveSeries(&asmodel.Matchers{}, 0, "foo", "") updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -928,7 +927,7 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "") for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) @@ -953,7 +952,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 97a1f83011e..d794d27cbc6 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -786,9 +786,17 @@ func (i *Ingester) updateActiveSeries(now time.Time) { allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := userDB.activeSeries.ActiveWithMatchers() i.metrics.activeSeriesLoading.DeleteLabelValues(userID) if allActive > 0 { - i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(allActive)) + costAttribLabel := i.limits.CostAttributionLabel(userID) + if costAttribLabel != "" { + labelAttributions := userDB.activeSeries.ActiveByAttributionValue() + for label, count := range labelAttributions { + i.metrics.activeSeriesPerUser.WithLabelValues(userID, label).Set(float64(count)) + } + } else { + i.metrics.activeSeriesPerUser.WithLabelValues(userID, "").Set(float64(allActive)) + } } else { - i.metrics.activeSeriesPerUser.DeleteLabelValues(userID) + i.metrics.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) } if allActiveHistograms > 0 { i.metrics.activeSeriesPerUserNativeHistograms.WithLabelValues(userID).Set(float64(allActiveHistograms)) @@ -944,6 +952,7 @@ type extendedAppender interface { type pushStats struct { succeededSamplesCount int failedSamplesCount int + failedSamplesAttribution map[string]int succeededExemplarsCount int failedExemplarsCount int sampleOutOfBoundsCount int @@ -1152,7 +1161,10 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Keep track of some stats which are tracked only if the samples will be // successfully committed - stats pushStats + + stats = pushStats{ + failedSamplesAttribution: make(map[string]int), + } firstPartialErr error // updateFirstPartial is a function that, in case of a softError, stores that error @@ 
-1277,9 +1289,15 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.PreallocTimeseries, app extendedAppender, startAppend time.Time, stats *pushStats, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { - // Return true if handled as soft error, and we can ingest more series. handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { + + // get the cost attribution value for the series + costAttrib := validation.AttributionValue(i.limits, userID, labels) + if costAttrib != "" { + stats.failedSamplesAttribution[costAttrib]++ + } + stats.failedSamplesCount++ // Check if the error is a soft error we can proceed on. If so, we keep track @@ -1384,6 +1402,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels for _, ts := range timeseries { + // The cost attribution value for the series + costAttrib := validation.AttributionValue(i.limits, userID, ts.Labels) + // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). @@ -1399,7 +1420,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleOutOfBoundsCount += len(ts.Samples) + len(ts.Histograms) - + if costAttrib != "" { + stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) + len(ts.Histograms) + } var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1420,7 +1443,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) stats.sampleOutOfBoundsCount += len(ts.Samples) - + if costAttrib != "" { + stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) + } firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -2629,8 +2654,13 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD } userDB := &userTSDB{ - userID: userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout), + userID: userID, + activeSeries: activeseries.NewActiveSeries( + asmodel.NewMatchers(matchersConfig), + i.cfg.ActiveSeriesMetrics.IdleTimeout, + userID, + i.limits.CostAttributionLabel(userID), + ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 4a529321155..66a76c52ba0 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -193,7 +193,7 @@ func TestIngester_Start(t *testing.T) { return testutil.GatherAndCompare(reg, strings.NewReader(fmt.Sprintf(` # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="%s"} 1 + cortex_ingester_active_series{attrib="",user="%s"} 1 # HELP cortex_ingester_owned_series Number of currently owned series per user. # TYPE cortex_ingester_owned_series gauge diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 5b8db4c663f..76aa085d1fa 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -455,7 +455,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 0.009 @@ -526,7 +526,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_active_native_histogram_series Number of currently active native histogram series per user. # TYPE cortex_ingester_active_native_histogram_series gauge cortex_ingester_active_native_histogram_series{user="test"} 1 @@ -606,7 +606,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -716,7 +716,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -818,7 +818,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -944,7 +944,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1071,7 +1071,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1179,7 +1179,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1300,7 +1300,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1416,7 +1416,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1544,7 +1544,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1712,7 +1712,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. 
# TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -2060,7 +2060,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -2204,7 +2204,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -2335,7 +2335,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-order",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 @@ -2393,7 +2393,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-bounds",user="test"} 2 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2452,7 +2452,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-bounds",user="test"} 3 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2509,7 +2509,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. 
# TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2571,7 +2571,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-bounds",user="test"} 2 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2633,7 +2633,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-too-far-in-future",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds ` + fmt.Sprintf("%g", float64(now.UnixMilli())/1000) + ` @@ -2688,7 +2688,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-too-far-in-future",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_active_native_histogram_buckets Number of currently active native histogram buckets per user. # TYPE cortex_ingester_active_native_histogram_buckets gauge cortex_ingester_active_native_histogram_buckets{user="test"} 8 @@ -2764,7 +2764,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested. # TYPE cortex_ingester_ingested_exemplars_total counter cortex_ingester_ingested_exemplars_total 1 @@ -2824,7 +2824,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="new-value-for-timestamp",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2969,7 +2969,7 @@ func TestIngester_Push(t *testing.T) { expectedMetrics: ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test"} 1 + cortex_ingester_active_series{attrib="",user="test"} 1 # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. 
# TYPE cortex_ingester_ingested_samples_total counter cortex_ingester_ingested_samples_total{user="test"} 1 @@ -3406,8 +3406,8 @@ func TestIngester_Push_ShouldCorrectlyTrackMetricsInMultiTenantScenario(t *testi cortex_ingester_memory_series_removed_total{user="test-2"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test-1"} 2 - cortex_ingester_active_series{user="test-2"} 2 + cortex_ingester_active_series{attrib="",user="test-1"} 2 + cortex_ingester_active_series{attrib="",user="test-2"} 2 # HELP cortex_ingester_active_native_histogram_series Number of currently active native histogram series per user. # TYPE cortex_ingester_active_native_histogram_series gauge cortex_ingester_active_native_histogram_series{user="test-1"} 1 @@ -7214,7 +7214,7 @@ func TestIngesterCompactAndCloseIdleTSDB(t *testing.T) { # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="1"} 1 + cortex_ingester_active_series{attrib="",user="1"} 1 # HELP cortex_ingester_memory_metadata The current number of metadata in memory. # TYPE cortex_ingester_memory_metadata gauge @@ -7276,7 +7276,7 @@ func TestIngesterCompactAndCloseIdleTSDB(t *testing.T) { # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="1"} 1 + cortex_ingester_active_series{attrib="",user="1"} 1 # HELP cortex_ingester_memory_metadata The current number of metadata in memory. # TYPE cortex_ingester_memory_metadata gauge @@ -8877,8 +8877,8 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -8924,8 +8924,8 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -8976,8 +8976,8 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -9023,7 +9023,7 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -9243,8 +9243,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="other_test_user"} 4 @@ -9291,7 +9291,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="other_test_user"} 4 @@ -9328,8 +9328,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 @@ -9377,8 +9377,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 @@ -9420,7 +9420,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 @@ -9457,8 +9457,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="other_test_user"} 4 @@ -9503,7 +9503,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="test_user"} 4 @@ -9558,7 +9558,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -9605,8 +9605,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{user="other_test_user"} 8 - cortex_ingester_active_series{user="test_user"} 8 + cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{attrib="",user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 833a802fb04..984e3aa70c3 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -303,7 +303,7 @@ func newIngesterMetrics( activeSeriesPerUser: promauto.With(activeSeriesReg).NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_ingester_active_series", Help: "Number of currently active series per user.", - }, []string{"user"}), + }, []string{"user", "attrib"}), // Not registered automatically, but only if activeSeriesEnabled is true. activeSeriesCustomTrackersPerUser: promauto.With(activeSeriesReg).NewGaugeVec(prometheus.GaugeOpts{ @@ -409,7 +409,7 @@ func (m *ingesterMetrics) deletePerGroupMetricsForUser(userID, group string) { func (m *ingesterMetrics) deletePerUserCustomTrackerMetrics(userID string, customTrackerMetrics []string) { m.activeSeriesLoading.DeleteLabelValues(userID) - m.activeSeriesPerUser.DeleteLabelValues(userID) + m.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) m.activeSeriesPerUserNativeHistograms.DeleteLabelValues(userID) m.activeNativeHistogramBucketsPerUser.DeleteLabelValues(userID) for _, name := range customTrackerMetrics { diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index a778a05ac3a..a29042b56ae 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -702,14 +702,15 @@ type Mimir struct { ServiceMap map[string]services.Service ModuleManager *modules.Manager - API *api.API - Server *server.Server - IngesterRing *ring.Ring - IngesterPartitionRingWatcher *ring.PartitionRingWatcher - IngesterPartitionInstanceRing *ring.PartitionInstanceRing - TenantLimits validation.TenantLimits - Overrides *validation.Overrides - ActiveGroupsCleanup *util.ActiveGroupsCleanupService + API *api.API + Server *server.Server + IngesterRing *ring.Ring + IngesterPartitionRingWatcher *ring.PartitionRingWatcher + IngesterPartitionInstanceRing *ring.PartitionInstanceRing + TenantLimits validation.TenantLimits + Overrides *validation.Overrides + ActiveGroupsCleanup *util.ActiveGroupsCleanupService + Distributor *distributor.Distributor Ingester *ingester.Ingester Flusher *flusher.Flusher diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index c372bea1c25..e8ce3fc4529 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -79,6 +79,7 @@ const ( OverridesExporter string = "overrides-exporter" Server string = "server" ActiveGroupsCleanupService string = "active-groups-cleanup-service" + CostAttributionCleanupService string = "cost-attribution-cleanup-service" Distributor string = "distributor" DistributorService string = "distributor-service" Ingester string = "ingester" diff --git a/pkg/util/validation/separate_metrics.go b/pkg/util/validation/separate_metrics.go index cc6194a42e4..48cefb65051 100644 --- a/pkg/util/validation/separate_metrics.go +++ b/pkg/util/validation/separate_metrics.go @@ -30,3 +30,22 @@ func GroupLabel(o *Overrides, userID string, timeseries []mimirpb.PreallocTimese return 
"" } + +// AttributionLabel obtains the value of cost attribution label for tenant +func AttributionValue(o *Overrides, userID string, lbs []mimirpb.LabelAdapter) string { + if len(lbs) == 0 { + return "" + } + + attributionLabel := o.CostAttributionLabel(userID) + if attributionLabel == "" { + // If not set, no cost attribution is required + return attributionLabel + } + for _, label := range lbs { + if label.Name == attributionLabel { + return label.Value + } + } + return "" +} From 1107f71b52e91ed0d8f6e8b911273454a7960eb7 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Sep 2024 00:33:07 +0200 Subject: [PATCH 02/13] new service for tracking cost attribution --- cmd/mimir/config-descriptor.json | 33 ++-- cmd/mimir/help-all.txt.tmpl | 6 +- pkg/distributor/distributor.go | 18 +- pkg/distributor/distributor_test.go | 4 +- pkg/distributor/validate.go | 1 - .../active_native_histogram_postings_test.go | 20 ++ .../activeseries/active_postings_test.go | 2 +- pkg/ingester/activeseries/active_series.go | 133 +++++++++---- .../activeseries/active_series_test.go | 28 +-- pkg/ingester/ingester.go | 37 +++- pkg/ingester/ingester_ingest_storage_test.go | 2 +- pkg/ingester/ingester_test.go | 6 +- pkg/ingester/metrics.go | 12 ++ pkg/mimir/mimir.go | 5 + pkg/mimir/modules.go | 25 ++- .../benchmarks/comparison_test.go | 2 +- pkg/streamingpromql/benchmarks/ingester.go | 2 +- pkg/util/cost_attribution.go | 185 ++++++++++++++++++ pkg/util/validation/limits.go | 10 + pkg/util/validation/separate_metrics.go | 9 +- 20 files changed, 447 insertions(+), 93 deletions(-) create mode 100644 pkg/util/cost_attribution.go diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index cbfd95cb2ca..2166f244176 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -57,6 +57,28 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "max_cost_attribution_per_user", + "required": false, + "desc": "Maximum number of cost attributions allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 200, + "fieldFlag": "max-cost-attribution-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_eviction_interval", + "required": false, + "desc": "Interval at which to evict inactive cost attributions.", + "fieldValue": null, + "fieldDefaultValue": 600000000000, + "fieldFlag": "cost-attribution-eviction-interval", + "fieldType": "duration", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "enable_go_runtime_metrics", @@ -3995,17 +4017,6 @@ "fieldType": "string", "fieldCategory": "experimental" }, - { - "kind": "field", - "name": "max_cost_attribution_per_user", - "required": false, - "desc": "Maximum number of cost attributions per user. 0 to disable the limit.", - "fieldValue": null, - "fieldDefaultValue": 200, - "fieldFlag": "validation.max-cost-attribution-per-user", - "fieldType": "int", - "fieldCategory": "experimental" - }, { "kind": "field", "name": "max_fetched_chunks_per_query", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 6bd0e395c03..9e191292ae2 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1133,6 +1133,8 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. 
+ -cost-attribution-eviction-interval duration + [experimental] Interval at which to evict inactive cost attributions. (default 10m0s) -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. -debug.mutex-profile-fraction int @@ -1677,6 +1679,8 @@ Usage of ./cmd/mimir/mimir: [experimental] Burst size, i.e., maximum number of messages that can be logged at once, temporarily exceeding the configured maximum logs per second. (default 1000) -log.rate-limit-logs-per-second float [experimental] Maximum number of messages per second to be logged. (default 10000) + -max-cost-attribution-per-user int + [experimental] Maximum number of cost attributions allowed per user. (default 200) -max-separate-metrics-groups-per-user int [experimental] Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated. (default 1000) -mem-ballast-size-bytes int @@ -3065,8 +3069,6 @@ Usage of ./cmd/mimir/mimir: Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) - -validation.max-cost-attribution-per-user int - [experimental] Maximum number of cost attributions per user. 0 to disable the limit. (default 200) -validation.max-label-names-per-series int Maximum number of label names per series. (default 30) -validation.max-length-label-name int diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 5434f295cc4..bcd2a5f3b1e 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -105,7 +105,7 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - + costAttributionsvr *util.CostAttributionCleanupService // For handling HA replicas. 
HATracker *haTracker @@ -306,7 +306,10 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, + activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionCleanupService *util.CostAttributionCleanupService, + ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, + canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -341,6 +344,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove healthyInstancesCount: atomic.NewUint32(0), limits: limits, HATracker: haTracker, + costAttributionsvr: costAttributionCleanupService, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -356,6 +360,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove Name: "cortex_distributor_received_samples_total", Help: "The total number of received samples, excluding rejected and deduped samples.", }, []string{"user", "attrib"}), + receivedExemplars: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_distributor_received_exemplars_total", Help: "The total number of received exemplars, excluding rejected and deduped exemplars.", @@ -643,7 +648,6 @@ func (d *Distributor) cleanupInactiveUser(userID string) { d.HATracker.cleanupHATrackerMetricsForUser(userID) d.receivedRequests.DeleteLabelValues(userID) - d.receivedSamples.DeletePartialMatch(prometheus.Labels{"user": userID}) d.receivedExemplars.DeleteLabelValues(userID) d.receivedMetadata.DeleteLabelValues(userID) d.incomingRequests.DeleteLabelValues(userID) @@ -660,6 +664,7 @@ func (d *Distributor) cleanupInactiveUser(userID string) { filter := prometheus.Labels{"user": userID} d.dedupedSamples.DeletePartialMatch(filter) + d.receivedSamples.DeletePartialMatch(filter) d.discardedSamplesTooManyHaClusters.DeletePartialMatch(filter) d.discardedSamplesRateLimited.DeletePartialMatch(filter) d.discardedRequestsRateLimited.DeleteLabelValues(userID) @@ -678,6 +683,11 @@ func (d *Distributor) RemoveGroupMetricsForUser(userID, group string) { d.sampleValidationMetrics.deleteUserMetricsForGroup(userID, group) } +func (d *Distributor) RemoveAttributionMetricsForUser(userID, attribution string) { + d.receivedSamples.DeleteLabelValues(userID, attribution) + // TODO @ying: Remove attribution metrics +} + // Called after distributor is asked to stop via StopAsync.
func (d *Distributor) stopping(_ error) error { return services.StopManagerAndAwaitStopped(context.Background(), d.subservices) @@ -1670,7 +1680,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) if costAttributionLabel != "" { - attribution := mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel) + attribution := d.costAttributionsvr.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel), mtime.Now()) costAttribution[attribution]++ } } diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 66760053d35..1e28bc7a949 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -2035,7 +2035,7 @@ func BenchmarkDistributor_Push(b *testing.B) { require.NoError(b, err) // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(b, err) require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) @@ -5323,7 +5323,7 @@ func prepare(t testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []* require.NoError(t, err) reg := prometheus.NewPedanticRegistry() - d, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) + d, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, d)) t.Cleanup(func() { diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 2e3dc472f3c..db15559c70d 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -370,7 +370,6 @@ func removeNonASCIIChars(in string) (out string) { // The returned error may retain the provided series labels. func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelNameValidation bool) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) - if err != nil { m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 46d8da9c089..85f5272d385 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,8 +26,12 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) +>>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Update each series at a different time according to its index. 
for i := range allStorageRefs { @@ -63,7 +67,11 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) +>>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -107,7 +115,11 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) +>>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -147,7 +159,11 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) +>>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -184,7 +200,11 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) +>>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index 9382ed824e3..c619e043eb1 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -58,7 +58,7 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "") // Update each series at a different time according to its index. 
for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 71044b5e348..925cb191a95 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -11,13 +11,12 @@ import ( "sync" "time" + "github.com/grafana/mimir/pkg/util" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" - - asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) const ( @@ -46,25 +45,31 @@ type ActiveSeries struct { // matchersMutex protects matchers and lastMatchersUpdate. matchersMutex sync.RWMutex - matchers *asmodel.Matchers + matchers *Matchers lastMatchersUpdate time.Time + costAttributionLabel string + costAttributionSvc *util.CostAttributionCleanupService + maxCostAttributionPerUser int + // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. timeout time.Duration + userID string } // seriesStripe holds a subset of the series timestamps for a single tenant. type seriesStripe struct { - matchers *asmodel.Matchers + matchers *Matchers deleted *deletedSeries // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 - + oldestEntryTs atomic.Int64 + costAttributionSvc *util.CostAttributionCleanupService + maxCostAttributionPerUser int mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -73,23 +78,41 @@ type seriesStripe struct { activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. + userID string + costAttributionLabel string + // here the attribution values map, it maps the attribute value to its index, so we can increment the counter directly, + // so in each entry, we keep the index of the value only, instead of keeping the string value + costAttributionValues map[string]uint32 } // seriesEntry holds a timestamp for single series. type seriesEntry struct { - nanos *atomic.Int64 // Unix timestamp in nanoseconds. Needs to be a pointer because we don't store pointers to entries in the stripe. - matches asmodel.PreAllocDynamicSlice // Index of the matcher matching - numNativeHistogramBuckets int // Number of buckets in native histogram series, -1 if not a native histogram. - - deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. -} - -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout} + nanos *atomic.Int64 // Unix timestamp in nanoseconds. Needs to be a pointer because we don't store pointers to entries in the stripe. 
+ matches preAllocDynamicSlice // Index of the matcher matching + numNativeHistogramBuckets int // Number of buckets in native histogram series, -1 if not a native histogram. + // keep the value corresponding the label configured in serieStripe + deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. + attributionValue string +} + +func NewActiveSeries( + asm *Matchers, + timeout time.Duration, + userID string, + costAttributionLabel string, + costAttributionSvc *util.CostAttributionCleanupService, + maxCostAttributionPerUser int, +) *ActiveSeries { + c := &ActiveSeries{ + matchers: asm, timeout: timeout, userID: userID, + costAttributionLabel: costAttributionLabel, + costAttributionSvc: costAttributionSvc, + maxCostAttributionPerUser: maxCostAttributionPerUser, + } // Stripes are pre-allocated so that we only read on them and no lock is required. for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionLabel, costAttributionSvc, maxCostAttributionPerUser) } return c @@ -101,18 +124,18 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { return c.matchers.MatcherNames() } -func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { +func (c *ActiveSeries) ReloadMatchers(asm *Matchers, now time.Time) { c.matchersMutex.Lock() defer c.matchersMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionLabel, c.costAttributionSvc, c.maxCostAttributionPerUser) } c.matchers = asm c.lastMatchersUpdate = now } -func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { +func (c *ActiveSeries) CurrentConfig() CustomTrackersConfig { c.matchersMutex.RLock() defer c.matchersMutex.RUnlock() return c.matchers.Config() @@ -212,6 +235,18 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } +func (c *ActiveSeries) ActiveByAttributionValue() map[string]uint32 { + total := make(map[string]uint32, c.maxCostAttributionPerUser) + for s := 0; s < numStripes; s++ { + c.stripes[s].mu.RLock() + for k, v := range c.stripes[s].costAttributionValues { + total[k] += v + } + c.stripes[s].mu.RUnlock() + } + return total +} + func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { stripeID := storage.SeriesRef(ref) % numStripes c.stripes[stripeID].remove(storage.SeriesRef(ref)) @@ -337,21 +372,21 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef entry, ok := s.refs[ref] if ok { if entry.numNativeHistogramBuckets != numNativeHistogramBuckets { - matches := s.matchers.Matches(series) - matchesLen := matches.Len() + matches := s.matchers.matches(series) + matchesLen := matches.len() if numNativeHistogramBuckets >= 0 && entry.numNativeHistogramBuckets >= 0 { // change number of buckets but still a histogram diff := numNativeHistogramBuckets - entry.numNativeHistogramBuckets s.activeNativeHistogramBuckets = uint32(int(s.activeNativeHistogramBuckets) + diff) for i := 0; i < matchesLen; i++ { - s.activeMatchingNativeHistogramBuckets[matches.Get(i)] = uint32(int(s.activeMatchingNativeHistogramBuckets[matches.Get(i)]) + diff) + s.activeMatchingNativeHistogramBuckets[matches.get(i)] = uint32(int(s.activeMatchingNativeHistogramBuckets[matches.get(i)]) + diff) } } else if numNativeHistogramBuckets >= 0 { // change from float to histogram 
s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(numNativeHistogramBuckets) for i := 0; i < matchesLen; i++ { - match := matches.Get(i) + match := matches.get(i) s.activeMatchingNativeHistograms[match]++ s.activeMatchingNativeHistogramBuckets[match] += uint32(numNativeHistogramBuckets) } @@ -360,7 +395,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) for i := 0; i < matchesLen; i++ { - match := matches.Get(i) + match := matches.get(i) s.activeMatchingNativeHistograms[match]-- s.activeMatchingNativeHistogramBuckets[match] -= uint32(entry.numNativeHistogramBuckets) } @@ -371,16 +406,17 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef return entry.nanos, false } - matches := s.matchers.Matches(series) - matchesLen := matches.Len() + matches := s.matchers.matches(series) + matchesLen := matches.len() s.active++ + if numNativeHistogramBuckets >= 0 { s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(numNativeHistogramBuckets) } for i := 0; i < matchesLen; i++ { - match := matches.Get(i) + match := matches.get(i) s.activeMatching[match]++ if numNativeHistogramBuckets >= 0 { s.activeMatchingNativeHistograms[match]++ @@ -394,6 +430,14 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } + // here if we have a cost attribution label, we can split the serie count based on the value of the label + // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly + if s.costAttributionLabel != "" { + attributionValue := s.costAttributionSvc.UpdateAttributionTimestamp(s.userID, series.Get(s.costAttributionLabel), time.Unix(0, nowNanos), s.maxCostAttributionPerUser) + s.costAttributionValues[attributionValue]++ + e.attributionValue = attributionValue + } + s.refs[ref] = e return e.nanos, true } @@ -405,8 +449,10 @@ func (s *seriesStripe) clear() { s.oldestEntryTs.Store(0) s.refs = map[storage.SeriesRef]seriesEntry{} s.active = 0 + s.costAttributionValues = map[string]uint32{} s.activeNativeHistograms = 0 s.activeNativeHistogramBuckets = 0 + for i := range s.activeMatching { s.activeMatching[i] = 0 s.activeMatchingNativeHistograms[i] = 0 @@ -415,20 +461,31 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. 
-func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries) { +func (s *seriesStripe) reinitialize( + asm *Matchers, + deleted *deletedSeries, + userID string, + costAttributionLabel string, + costAttributionSvc *util.CostAttributionCleanupService, + maxCostAttributionPerUser int, +) { s.mu.Lock() defer s.mu.Unlock() - s.deleted = deleted s.oldestEntryTs.Store(0) s.refs = map[storage.SeriesRef]seriesEntry{} s.active = 0 + s.costAttributionValues = map[string]uint32{} s.activeNativeHistograms = 0 s.activeNativeHistogramBuckets = 0 + s.maxCostAttributionPerUser = maxCostAttributionPerUser s.matchers = asm + s.userID = userID s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) + s.costAttributionLabel = costAttributionLabel + s.costAttributionSvc = costAttributionSvc } func (s *seriesStripe) purge(keepUntil time.Time) { @@ -442,6 +499,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { defer s.mu.Unlock() s.active = 0 + s.costAttributionValues = map[string]uint32{} s.activeNativeHistograms = 0 s.activeNativeHistogramBuckets = 0 s.activeMatching = resizeAndClear(len(s.activeMatching), s.activeMatching) @@ -456,6 +514,8 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.deleted.purge(ref) } delete(s.refs, ref) + // here need to find what is deleted and decrement counters + continue } @@ -464,9 +524,13 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(entry.numNativeHistogramBuckets) } - ml := entry.matches.Len() + // restore the cost attribution by attribution value + if entry.attributionValue != "" { + s.costAttributionValues[entry.attributionValue]++ + } + ml := entry.matches.len() for i := 0; i < ml; i++ { - match := entry.matches.Get(i) + match := entry.matches.get(i) s.activeMatching[match]++ if entry.numNativeHistogramBuckets >= 0 { s.activeMatchingNativeHistograms[match]++ @@ -502,13 +566,16 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { } s.active-- + if entry.attributionValue != "" { + s.costAttributionValues[entry.attributionValue]-- + } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) } - ml := entry.matches.Len() + ml := entry.matches.len() for i := 0; i < ml; i++ { - match := entry.matches.Get(i) + match := entry.matches.get(i) s.activeMatching[match]-- if entry.numNativeHistogramBuckets >= 0 { s.activeMatchingNativeHistograms[match]-- diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index ebe7819bf93..df5fe0f0ecf 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -38,7 +38,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. 
- c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") valid := c.Purge(time.Now()) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() @@ -203,7 +203,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { @@ -229,7 +229,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { } func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { - asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) + asm := NewMatchers(mustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) c := NewActiveSeries(asm, DefaultTimeout, "foo", "") testUpdateSeries(t, c) } @@ -446,7 +446,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { } func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { - asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) + asm := NewMatchers(mustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) c := NewActiveSeries(asm, DefaultTimeout, "foo", "") testUpdateSeries(t, c) @@ -488,7 +488,7 @@ func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") c.UpdateSeries(ls1, ref1, time.Now(), -1) c.UpdateSeries(ls2, ref2, time.Now(), -1) @@ -516,7 +516,7 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") for i := 0; i < len(series); i++ { c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) @@ -562,7 +562,7 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute, "foo", "") + c := NewActiveSeries(asm, 5*time.Minute, "foo", "", nil) exp := len(series) - ttl expMatchingSeries := 0 @@ -595,7 +595,7 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, "foo", "") + c := NewActiveSeries(&Matchers{}, 59*time.Second, "foo", "") c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) c.UpdateSeries(ls2, ref2, currentTime, -1) @@ -631,7 +631,7 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "") + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) valid := c.Purge(currentTime) 
assert.True(t, valid) @@ -697,7 +697,7 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "") + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -736,7 +736,7 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "") + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -789,7 +789,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. - c = NewActiveSeries(&asmodel.Matchers{}, 0, "foo", "") + c = NewActiveSeries(&Matchers{}, 0, "foo", "") updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -927,7 +927,7 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout, "foo", "") + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) @@ -952,7 +952,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "") + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index d794d27cbc6..04a61275fb1 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -310,6 +310,8 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService + costAttribution *util.CostAttributionCleanupService + tsdbMetrics *tsdbMetrics forceCompactTrigger chan requestWithUsersAndCallback @@ -378,7 +380,12 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. 
-func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New( + cfg Config, limits *validation.Overrides, + ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, + activeGroupsCleanupService *util.ActiveGroupsCleanupService, + costAttributionCleanupService *util.CostAttributionCleanupService, + registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -386,7 +393,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.ingestionRate = util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval) i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService - + i.costAttribution = costAttributionCleanupService // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -1281,6 +1288,11 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats db.ingestedAPISamples.Add(int64(stats.succeededSamplesCount)) } } + if stats.failedSamplesAttribution != nil && len(stats.failedSamplesAttribution) > 0 { + for label, count := range stats.failedSamplesAttribution { + discarded.samplesPerAttribution.WithLabelValues(userID, label).Add(float64(count)) + } + } } // pushSamplesToAppender appends samples and exemplars to the appender. Most errors are handled via updateFirstPartial function, @@ -1293,8 +1305,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { // get the cost attribution value for the series - costAttrib := validation.AttributionValue(i.limits, userID, labels) - if costAttrib != "" { + costLabel := i.limits.CostAttributionLabel(userID) + if costLabel != "" { + costAttrib := i.costAttribution.UpdateAttributionTimestamp(userID, validation.AttributionValue(costLabel, userID, labels), time.Now()) stats.failedSamplesAttribution[costAttrib]++ } @@ -1403,7 +1416,12 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var nonCopiedLabels labels.Labels for _, ts := range timeseries { // The cost attribution value for the series - costAttrib := validation.AttributionValue(i.limits, userID, ts.Labels) + costLabel := i.limits.CostAttributionLabel(userID) + var costAttrib string + // when cost attribution label is set + if costLabel != "" { + costAttrib = i.costAttribution.UpdateAttributionTimestamp(userID, validation.AttributionValue(costLabel, userID, ts.Labels), time.Now()) + } // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). 
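// Illustrative sketch only (not taken verbatim from this patch): the per-series flow used above is
// "resolve the configured cost attribution label from the series labels, fall back to a catch-all
// value, then count per attribution value". labelAdapter below is a local stand-in for
// mimirpb.LabelAdapter, and the "team" label is an assumed tenant configuration.
package main

import "fmt"

type labelAdapter struct {
	Name, Value string
}

// attributionValue returns the value of the configured cost attribution label,
// or "" when the series does not carry that label.
func attributionValue(costLabel string, lbs []labelAdapter) string {
	for _, l := range lbs {
		if l.Name == costLabel {
			return l.Value
		}
	}
	return ""
}

func main() {
	const costLabel = "team"      // assumed value of -validation.cost-attribution-label
	discarded := map[string]int{} // attribution value -> discarded sample count

	series := [][]labelAdapter{
		{{Name: "__name__", Value: "http_requests_total"}, {Name: "team", Value: "checkout"}},
		{{Name: "__name__", Value: "http_requests_total"}, {Name: "team", Value: "search"}},
		{{Name: "__name__", Value: "up"}}, // carries no cost attribution label
	}
	for _, lbs := range series {
		attrib := attributionValue(costLabel, lbs)
		if attrib == "" {
			attrib = "other" // missing (or over-limit) values are folded into "other"
		}
		discarded[attrib]++ // stands in for stats.failedSamplesAttribution[costAttrib]++
	}
	fmt.Println(discarded) // map[checkout:1 other:1 search:1]
}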
@@ -1420,7 +1438,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleOutOfBoundsCount += len(ts.Samples) + len(ts.Histograms) - if costAttrib != "" { + if costLabel != "" { stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) + len(ts.Histograms) } var firstTimestamp int64 @@ -1443,7 +1461,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) stats.sampleOutOfBoundsCount += len(ts.Samples) - if costAttrib != "" { + if costLabel != "" { stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) } firstTimestamp := ts.Samples[0].TimestampMs @@ -2660,6 +2678,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD i.cfg.ActiveSeriesMetrics.IdleTimeout, userID, i.limits.CostAttributionLabel(userID), + i.costAttribution, ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -3419,6 +3438,10 @@ func (i *Ingester) RemoveGroupMetricsForUser(userID, group string) { i.metrics.deletePerGroupMetricsForUser(userID, group) } +func (i *Ingester) RemoveAttributionMetricsForUser(userID, attribution string) { + i.metrics.deletePerAttributionMetricsForUser(userID, attribution) +} + // TransferOut implements ring.FlushTransferer. func (i *Ingester) TransferOut(_ context.Context) error { return ring.ErrTransferDisabled diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 66a76c52ba0..0b566b03c0b 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -650,7 +650,7 @@ func createTestIngesterWithIngestStorage(t testing.TB, ingesterCfg *Config, over require.NoError(t, services.StopAndAwaitTerminated(ctx, prw)) }) - ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, reg, util_test.NewTestingLogger(t)) + ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, nil, reg, util_test.NewTestingLogger(t)) require.NoError(t, err) return ingester, kafkaCluster, prw diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 76aa085d1fa..1581ac9d05b 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -5997,7 +5997,7 @@ func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, i ingestersRing = createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()) } - ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) + ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) if err != nil { return nil, err } @@ -6203,7 +6203,7 @@ func TestIngester_OpenExistingTSDBOnStartup(t *testing.T) { // setup the tsdbs dir testData.setup(t, tempDir) - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) startErr := services.StartAndAwaitRunning(context.Background(), ingester) @@ -7363,7 +7363,7 @@ func TestHeadCompactionOnStartup(t 
*testing.T) { ingesterCfg.BlocksStorageConfig.Bucket.S3.Endpoint = "localhost" ingesterCfg.BlocksStorageConfig.TSDB.Retention = 2 * 24 * time.Hour // Make sure that no newly created blocks are deleted. - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ingester)) diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 984e3aa70c3..54adc381c70 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -407,9 +407,15 @@ func (m *ingesterMetrics) deletePerGroupMetricsForUser(userID, group string) { m.discarded.DeleteLabelValues(userID, group) } +func (m *ingesterMetrics) deletePerAttributionMetricsForUser(userID, attribution string) { + m.activeSeriesPerUser.DeleteLabelValues(userID, attribution) + m.discarded.samplesPerAttribution.DeleteLabelValues(userID, attribution) +} + func (m *ingesterMetrics) deletePerUserCustomTrackerMetrics(userID string, customTrackerMetrics []string) { m.activeSeriesLoading.DeleteLabelValues(userID) m.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) + m.activeSeriesPerUserNativeHistograms.DeleteLabelValues(userID) m.activeNativeHistogramBucketsPerUser.DeleteLabelValues(userID) for _, name := range customTrackerMetrics { @@ -428,6 +434,7 @@ type discardedMetrics struct { perUserSeriesLimit *prometheus.CounterVec perMetricSeriesLimit *prometheus.CounterVec invalidNativeHistogram *prometheus.CounterVec + samplesPerAttribution *prometheus.CounterVec } func newDiscardedMetrics(r prometheus.Registerer) *discardedMetrics { @@ -440,6 +447,10 @@ func newDiscardedMetrics(r prometheus.Registerer) *discardedMetrics { perUserSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerUserSeriesLimit), perMetricSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerMetricSeriesLimit), invalidNativeHistogram: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogram), + samplesPerAttribution: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_discarded_samples_attribution_total", + Help: "The total number of samples that were discarded per attribution.", + }, []string{"user", "attrib"}), } } @@ -452,6 +463,7 @@ func (m *discardedMetrics) DeletePartialMatch(filter prometheus.Labels) { m.perUserSeriesLimit.DeletePartialMatch(filter) m.perMetricSeriesLimit.DeletePartialMatch(filter) m.invalidNativeHistogram.DeletePartialMatch(filter) + m.samplesPerAttribution.DeletePartialMatch(filter) } func (m *discardedMetrics) DeleteLabelValues(userID string, group string) { diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index a29042b56ae..a2e3710300b 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -109,6 +109,8 @@ type Config struct { NoAuthTenant string `yaml:"no_auth_tenant" category:"advanced"` ShutdownDelay time.Duration `yaml:"shutdown_delay" category:"advanced"` MaxSeparateMetricsGroupsPerUser int `yaml:"max_separate_metrics_groups_per_user" category:"experimental"` + MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" category:"experimental"` + CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` EnableGoRuntimeMetrics bool `yaml:"enable_go_runtime_metrics" category:"advanced"` 
PrintConfig bool `yaml:"-"` ApplicationName string `yaml:"-"` @@ -168,7 +170,9 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.StringVar(&c.NoAuthTenant, "auth.no-auth-tenant", "anonymous", "Tenant ID to use when multitenancy is disabled.") f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.") f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. After receiving SIGTERM, Mimir will report not-ready status via /ready endpoint.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 10*time.Minute, "Interval at which to evict inactive cost attributions.") f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") + f.IntVar(&c.MaxCostAttributionPerUser, "max-cost-attribution-per-user", 200, "Maximum number of cost attributions allowed per user.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") @@ -710,6 +714,7 @@ type Mimir struct { TenantLimits validation.TenantLimits Overrides *validation.Overrides ActiveGroupsCleanup *util.ActiveGroupsCleanupService + CostAttributionCleanup *util.CostAttributionCleanupService Distributor *distributor.Distributor Ingester *ingester.Ingester diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index e8ce3fc4529..7cdb50d8c64 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -79,7 +79,7 @@ const ( OverridesExporter string = "overrides-exporter" Server string = "server" ActiveGroupsCleanupService string = "active-groups-cleanup-service" - CostAttributionCleanupService string = "cost-attribution-cleanup-service" + CostAttributionService string = "cost-attribution-service" Distributor string = "distributor" DistributorService string = "distributor-service" Ingester string = "ingester" @@ -461,7 +461,9 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.Cfg.Distributor.PreferAvailabilityZone = t.Cfg.Querier.PreferAvailabilityZone t.Cfg.Distributor.IngestStorageConfig = t.Cfg.IngestStorage - t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, t.ActiveGroupsCleanup, t.IngesterRing, t.IngesterPartitionInstanceRing, canJoinDistributorsRing, t.Registerer, util_log.Logger) + t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, + t.ActiveGroupsCleanup, t.CostAttributionCleanup, t.IngesterRing, t.IngesterPartitionInstanceRing, + canJoinDistributorsRing, t.Registerer, util_log.Logger) if err != nil { return } @@ -470,6 +472,10 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.ActiveGroupsCleanup.Register(t.Distributor) } + if t.CostAttributionCleanup != nil { + t.CostAttributionCleanup.Register(t.Distributor) + } + return t.Distributor, nil } @@ -643,6 +649,11 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { return t.ActiveGroupsCleanup, nil } +func (t *Mimir) initCostAttributionService() (services.Service, error) { + t.CostAttributionCleanup = util.NewCostAttributionCleanupService(3*time.Minute, 
t.Cfg.CostAttributionEvictionInterval, t.Cfg.MaxCostAttributionPerUser, util_log.Logger) + return t.CostAttributionCleanup, nil +} + func (t *Mimir) tsdbIngesterConfig() { t.Cfg.Ingester.BlocksStorageConfig = t.Cfg.BlocksStorage } @@ -654,7 +665,7 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.IngestStorageConfig = t.Cfg.IngestStorage t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.Registerer, util_log.Logger) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionCleanup, t.Registerer, util_log.Logger) if err != nil { return } @@ -663,6 +674,9 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.ActiveGroupsCleanup.Register(t.Ingester) } + if t.CostAttributionCleanup != nil { + t.CostAttributionCleanup.Register(t.Ingester) + } return t.Ingester, nil } @@ -1126,6 +1140,7 @@ func (t *Mimir) setupModuleManager() error { mm.RegisterModule(Overrides, t.initOverrides, modules.UserInvisibleModule) mm.RegisterModule(OverridesExporter, t.initOverridesExporter) mm.RegisterModule(ActiveGroupsCleanupService, t.initActiveGroupsCleanupService, modules.UserInvisibleModule) + mm.RegisterModule(CostAttributionService, t.initCostAttributionService, modules.UserInvisibleModule) mm.RegisterModule(Distributor, t.initDistributor) mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule) mm.RegisterModule(Ingester, t.initIngester) @@ -1164,9 +1179,9 @@ func (t *Mimir) setupModuleManager() error { IngesterPartitionRing: {MemberlistKV, IngesterRing, API}, Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, - Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, + Distributor: {DistributorService, API, ActiveGroupsCleanupService, CostAttributionService, Vault}, DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault}, - Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, + Ingester: {IngesterService, API, ActiveGroupsCleanupService, CostAttributionService, Vault}, IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, IngesterPartitionRing, API, StoreQueryable, MemberlistKV}, diff --git a/pkg/streamingpromql/benchmarks/comparison_test.go b/pkg/streamingpromql/benchmarks/comparison_test.go index 2678f0d5c46..1ec18d108d3 100644 --- a/pkg/streamingpromql/benchmarks/comparison_test.go +++ b/pkg/streamingpromql/benchmarks/comparison_test.go @@ -237,7 +237,7 @@ func createIngesterQueryable(t testing.TB, address string) storage.Queryable { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, ingestersRing, nil, false, nil, logger) + d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, nil, ingestersRing, nil, false, nil, logger) require.NoError(t, err) queryMetrics := stats.NewQueryMetrics(nil) diff --git a/pkg/streamingpromql/benchmarks/ingester.go b/pkg/streamingpromql/benchmarks/ingester.go index 6f3b5f04a9a..9107b66f64f 100644 --- a/pkg/streamingpromql/benchmarks/ingester.go +++ b/pkg/streamingpromql/benchmarks/ingester.go @@ -96,7 +96,7 @@ func startBenchmarkIngester(rootDataDir string) 
(*ingester.Ingester, string, fun return services.StopAndAwaitTerminated(context.Background(), ingestersRing) }) - ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, log.NewNopLogger()) + ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, nil, log.NewNopLogger()) if err != nil { cleanup() return nil, "", nil, fmt.Errorf("could not create ingester: %w", err) diff --git a/pkg/util/cost_attribution.go b/pkg/util/cost_attribution.go new file mode 100644 index 00000000000..04d2a2ec7b6 --- /dev/null +++ b/pkg/util/cost_attribution.go @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package util + +import ( + "context" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/grafana/dskit/services" + "go.uber.org/atomic" +) + +type CostAttribution struct { + mu sync.RWMutex + timestampsPerUser map[string]map[string]*atomic.Int64 // map[user][group] -> timestamp + coolDownDeadline map[string]*atomic.Int64 + maxAttributionPerUser int +} + +func NewCostAttribution(maxAttributionPerUser int) *CostAttribution { + return &CostAttribution{ + timestampsPerUser: map[string]map[string]*atomic.Int64{}, + coolDownDeadline: map[string]*atomic.Int64{}, + maxAttributionPerUser: maxAttributionPerUser, + } +} + +// UpdateAttributionTimestampForUser function is only guaranteed to update to the +// timestamp provided even if it is smaller than the existing value +func (ag *CostAttribution) UpdateAttributionTimestampForUser(userID, attribution string, now time.Time) { + ts := now.UnixNano() + ag.mu.RLock() + if groupTs := ag.timestampsPerUser[userID][attribution]; groupTs != nil { + ag.mu.RUnlock() + groupTs.Store(ts) + return + } + ag.mu.RUnlock() + + ag.mu.Lock() + defer ag.mu.Unlock() + + if ag.timestampsPerUser[userID] == nil { + ag.timestampsPerUser[userID] = map[string]*atomic.Int64{attribution: atomic.NewInt64(ts)} + return + } + + if groupTs := ag.timestampsPerUser[userID][attribution]; groupTs != nil { + groupTs.Store(ts) + return + } + + ag.timestampsPerUser[userID][attribution] = atomic.NewInt64(ts) +} + +func (ag *CostAttribution) purgeInactiveAttributionsForUser(userID string, deadline int64) []string { + ag.mu.RLock() + var inactiveAttributions []string + attributionTimestamps := ag.timestampsPerUser[userID] + + for attr, ts := range attributionTimestamps { + if ts.Load() <= deadline { + inactiveAttributions = append(inactiveAttributions, attr) + } + } + ag.mu.RUnlock() + + if len(inactiveAttributions) == 0 { + return nil + } + + // Cleanup inactive groups + ag.mu.Lock() + defer ag.mu.Unlock() + + for i := 0; i < len(inactiveAttributions); { + inactiveAttribution := inactiveAttributions[i] + groupTs := ag.timestampsPerUser[userID][inactiveAttribution] + if groupTs != nil && groupTs.Load() <= deadline { + delete(ag.timestampsPerUser[userID], inactiveAttribution) + i++ + } else { + inactiveAttributions[i] = inactiveAttributions[len(inactiveAttributions)-1] + inactiveAttributions = inactiveAttributions[:len(inactiveAttributions)-1] + } + } + + return inactiveAttributions +} + +func (ca *CostAttribution) purgeInactiveAttributions(inactiveTimeout time.Duration, cleanupFuncs ...func(string, string)) { + ca.mu.RLock() + userIDs := make([]string, 0, len(ca.timestampsPerUser)) + for userID := range ca.timestampsPerUser { + userIDs = append(userIDs, userID) + } + ca.mu.RUnlock() + + currentTime := time.Now() + for _, userID := range userIDs { + inactiveAttributions := 
ca.purgeInactiveAttributionsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano()) + for _, attribution := range inactiveAttributions { + for _, cleanupFn := range cleanupFuncs { + cleanupFn(userID, attribution) + } + } + } +} + +func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, now time.Time) bool { + // if we are still in the cooldown period, consider the limit reached + ca.mu.Lock() + defer ca.mu.Unlock() + + if v, exists := ca.coolDownDeadline[userID]; exists && v.Load() > now.UnixNano() { + return true + } + + // if the attribution already exists for this user and we are not in the cooldown period, the limit is not exceeded + _, exists := ca.timestampsPerUser[userID][attribution] + if exists { + return false + } + + // if the user has reached the limit, start a 20 minute cooldown before new attribution values are accepted again + maxReached := len(ca.timestampsPerUser[userID]) >= ca.maxAttributionPerUser + if maxReached { + ca.coolDownDeadline[userID] = atomic.NewInt64(now.Add(20 * time.Minute).UnixNano()) + return true + } + + return maxReached +} + +type CostAttributionCleanupService struct { + services.Service + logger log.Logger + costAttribution *CostAttribution + cleanupFuncs []func(userID, attribution string) + inactiveTimeout time.Duration +} + +type CostAttributionMetricsCleaner interface { + RemoveAttributionMetricsForUser(userID, attribution string) +} + +func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, maxAttributionPerUser int, logger log.Logger, cleanupFns ...func(string, string)) *CostAttributionCleanupService { + s := &CostAttributionCleanupService{ + costAttribution: NewCostAttribution(maxAttributionPerUser), + cleanupFuncs: cleanupFns, + inactiveTimeout: inactiveTimeout, + logger: logger, + } + + s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution cleanup") + return s +} + +func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user, attribution string, now time.Time) string { + // an empty value is unexpected: when a tenant configures a cost attribution label, incoming series are supposed to carry it + if attribution == "" { + attribution = "other" + level.Error(s.logger).Log("msg", "set attribution label to \"other\" since missing cost attribution label in metrics") + } else if s.costAttribution.attributionLimitExceeded(user, attribution, now) { + attribution = "other" + level.Error(s.logger).Log("msg", "set attribution label to \"other\" since user has reached the limit of cost attribution labels") + } + + s.costAttribution.UpdateAttributionTimestampForUser(user, attribution, now) + return attribution +} + +func (s *CostAttributionCleanupService) iteration(_ context.Context) error { + s.costAttribution.purgeInactiveAttributions(s.inactiveTimeout, s.cleanupFuncs...) + return nil +} + +// Register registers the cleanup function from metricsCleaner to be called during each cleanup iteration.
+// This function is NOT thread safe +func (s *CostAttributionCleanupService) Register(metricsCleaner CostAttributionMetricsCleaner) { + s.cleanupFuncs = append(s.cleanupFuncs, metricsCleaner.RemoveAttributionMetricsForUser) +} diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 463f4e81f62..d49b7d6b40d 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -143,6 +143,12 @@ type Limits struct { // User defined label to give the option of subdividing specific metrics by another label SeparateMetricsGroupLabel string `yaml:"separate_metrics_group_label" json:"separate_metrics_group_label" category:"experimental"` +<<<<<<< HEAD +======= + // User defined label to give the cost distribution by values of the label + CostAttributionLabel string `yaml:"cost_attribution_label" json:"cost_attribution_label" category:"experimental"` + +>>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Querier enforced limits. MaxChunksPerQuery int `yaml:"max_fetched_chunks_per_query" json:"max_fetched_chunks_per_query"` MaxEstimatedChunksPerQueryMultiplier float64 `yaml:"max_estimated_fetched_chunks_per_query_multiplier" json:"max_estimated_fetched_chunks_per_query_multiplier" category:"experimental"` @@ -282,7 +288,11 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&l.OutOfOrderBlocksExternalLabelEnabled, "ingester.out-of-order-blocks-external-label-enabled", false, "Whether the shipper should label out-of-order blocks with an external label before uploading them. Setting this label will compact out-of-order blocks separately from non-out-of-order blocks") f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") +<<<<<<< HEAD +======= + f.StringVar(&l.CostAttributionLabel, "validation.cost-attribution-label", "", "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'cost' label with cost label's value. Currently applies to the following metrics: cortex_discarded_samples_total") +>>>>>>> 3c422a8f57 (new service for tracking cost attribution) f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable") diff --git a/pkg/util/validation/separate_metrics.go b/pkg/util/validation/separate_metrics.go index 48cefb65051..0f2384984b0 100644 --- a/pkg/util/validation/separate_metrics.go +++ b/pkg/util/validation/separate_metrics.go @@ -32,18 +32,13 @@ func GroupLabel(o *Overrides, userID string, timeseries []mimirpb.PreallocTimese } // AttributionLabel obtains the value of cost attribution label for tenant -func AttributionValue(o *Overrides, userID string, lbs []mimirpb.LabelAdapter) string { +func AttributionValue(attribLab string, userID string, lbs []mimirpb.LabelAdapter) string { if len(lbs) == 0 { return "" } - attributionLabel := o.CostAttributionLabel(userID) - if attributionLabel == "" { - // If not set, no cost attribution is required - return attributionLabel - } for _, label := range lbs { - if label.Name == attributionLabel { + if label.Name == attribLab { return label.Value } } From 2dbf56277478d1adfd89bcce20a6d60198592dcb Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 25 Sep 2024 14:43:24 +0200 Subject: [PATCH 03/13] address comments --- cmd/mimir/config-descriptor.json | 24 +++++----- cmd/mimir/help-all.txt.tmpl | 6 +-- pkg/distributor/distributor.go | 23 ++++++---- pkg/distributor/distributor_test.go | 2 +- .../activeseries/active_labels_test.go | 4 ++ .../active_native_histogram_postings_test.go | 20 ++++++++ .../activeseries/active_postings_test.go | 12 +++++ .../activeseries/active_series_test.go | 46 +++++++++++++++++-- pkg/ingester/ingester.go | 46 ++++++++++--------- pkg/mimir/mimir.go | 2 - pkg/mimir/modules.go | 2 +- pkg/util/cost_attribution.go | 37 +++++++-------- pkg/util/validation/limits.go | 19 +++++++- pkg/util/validation/separate_metrics.go | 14 ------ 14 files changed, 169 insertions(+), 88 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 2166f244176..a8222277d87 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -57,17 +57,6 @@ "fieldType": "int", "fieldCategory": "experimental" }, - { - "kind": "field", - "name": "max_cost_attribution_per_user", - "required": false, - "desc": "Maximum number of cost attributions allowed per user.", - "fieldValue": null, - "fieldDefaultValue": 200, - "fieldFlag": "max-cost-attribution-per-user", - "fieldType": "int", - "fieldCategory": "experimental" - }, { "kind": "field", "name": "cost_attribution_eviction_interval", @@ -4010,13 +3999,24 @@ "kind": "field", "name": "cost_attribution_label", "required": false, - "desc": "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'cost' label with cost label's value. Currently applies to the following metrics: cortex_discarded_samples_total", + "desc": "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'attrib' cost attribution's label's value. 
Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total.", "fieldValue": null, "fieldDefaultValue": "", "fieldFlag": "validation.cost-attribution-label", "fieldType": "string", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "max_cost_attribution_per_user", + "required": false, + "desc": "The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.max-cost-attribution-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "max_fetched_chunks_per_query", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 9e191292ae2..122fa61a989 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1679,8 +1679,6 @@ Usage of ./cmd/mimir/mimir: [experimental] Burst size, i.e., maximum number of messages that can be logged at once, temporarily exceeding the configured maximum logs per second. (default 1000) -log.rate-limit-logs-per-second float [experimental] Maximum number of messages per second to be logged. (default 10000) - -max-cost-attribution-per-user int - [experimental] Maximum number of cost attributions allowed per user. (default 200) -max-separate-metrics-groups-per-user int [experimental] Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated. (default 1000) -mem-ballast-size-bytes int @@ -3064,11 +3062,13 @@ Usage of ./cmd/mimir/mimir: -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") -validation.cost-attribution-label string - [experimental] Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'cost' label with cost label's value. Currently applies to the following metrics: cortex_discarded_samples_total + [experimental] Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated by adding an 'attrib' label with the cost attribution label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) + -validation.max-cost-attribution-per-user int + [experimental] The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution. -validation.max-label-names-per-series int Maximum number of label names per series. 
(default 30) -validation.max-length-label-name int diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index bcd2a5f3b1e..de122b57816 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -105,7 +105,7 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - costAttributionsvr *util.CostAttributionCleanupService + costAttributionSvc *util.CostAttributionCleanupService // For handling HA replicas. HATracker *haTracker @@ -306,10 +306,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, - activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionClenaupService *util.CostAttributionCleanupService, - ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, - canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionClenaupService *util.CostAttributionCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -344,7 +341,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove healthyInstancesCount: atomic.NewUint32(0), limits: limits, HATracker: haTracker, - costAttributionsvr: costAttributionClenaupService, + costAttributionSvc: costAttributionClenaupService, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -1442,7 +1439,9 @@ func (d *Distributor) push(ctx context.Context, pushReq *Request) error { return err } - d.updateReceivedMetrics(req, userID, d.limits.CostAttributionLabel(userID)) + now := mtime.Now() + + d.updateReceivedMetrics(req, userID, d.limits.CostAttributionLabel(userID), now) if len(req.Timeseries) == 0 && len(req.Metadata) == 0 { return nil @@ -1673,14 +1672,18 @@ func tokenForMetadata(userID string, metricName string) uint32 { return mimirpb.ShardByMetricName(userID, metricName) } -func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string, costAttributionLabel string) { +func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string, costAttributionLabel string, now time.Time) { var receivedSamples, receivedExemplars, receivedMetadata int - costAttribution := make(map[string]int) + costAttributionSize := 0 + if costAttributionLabel != "" { + costAttributionSize = d.limits.MaxCostAttributionPerUser(userID) + } + costAttribution := make(map[string]int, costAttributionSize) for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) if costAttributionLabel != "" { - attribution := d.costAttributionsvr.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel), mtime.Now()) + attribution 
:= d.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel), now, costAttributionSize) costAttribution[attribution]++ } } diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 1e28bc7a949..a1ce70b17fb 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -7956,7 +7956,7 @@ func TestCheckStartedMiddleware(t *testing.T) { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - distributor, err := New(distributorConfig, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorConfig, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(t, err) ctx := user.InjectOrgID(context.Background(), "user") diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index fe3c8e4f78a..7580ebbc31c 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 +41,11 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) memPostings := index.NewMemPostings() for i, l := range series { diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 85f5272d385..f9d339e2dd5 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,12 +26,16 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD <<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") ======= activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) >>>>>>> 3c422a8f57 (new service for tracking cost attribution) +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -67,11 +71,15 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD <<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") ======= activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) >>>>>>> 3c422a8f57 (new service for tracking cost attribution) +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. 
for i := range allStorageRefs { @@ -115,11 +123,15 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD <<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") ======= activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) >>>>>>> 3c422a8f57 (new service for tracking cost attribution) +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -159,11 +171,15 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD <<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") ======= activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) >>>>>>> 3c422a8f57 (new service for tracking cost attribution) +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -200,11 +216,15 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD <<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") ======= activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) >>>>>>> 3c422a8f57 (new service for tracking cost attribution) +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index c619e043eb1..a08a02aa18a 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,7 +26,11 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -58,7 +62,11 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. 
for i := range allStorageRefs { @@ -90,7 +98,11 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) +<<<<<<< HEAD activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") +======= + activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index df5fe0f0ecf..8d21d3d3c9d 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -38,7 +38,11 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. +<<<<<<< HEAD c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") +======= + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) valid := c.Purge(time.Now()) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() @@ -203,7 +207,11 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) +<<<<<<< HEAD c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") +======= + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { @@ -230,7 +238,11 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := NewMatchers(mustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) +<<<<<<< HEAD c := NewActiveSeries(asm, DefaultTimeout, "foo", "") +======= + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) testUpdateSeries(t, c) } @@ -447,7 +459,11 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := NewMatchers(mustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) +<<<<<<< HEAD c := NewActiveSeries(asm, DefaultTimeout, "foo", "") +======= + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) testUpdateSeries(t, c) c.Clear() @@ -488,7 +504,11 @@ func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) +<<<<<<< HEAD c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") +======= + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) c.UpdateSeries(ls1, ref1, time.Now(), -1) c.UpdateSeries(ls2, ref2, time.Now(), -1) @@ -516,7 +536,11 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) +<<<<<<< HEAD c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") +======= + c := 
NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) for i := 0; i < len(series); i++ { c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) @@ -562,7 +586,7 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute, "foo", "", nil) + c := NewActiveSeries(asm, 5*time.Minute, "foo", "", nil, 0) exp := len(series) - ttl expMatchingSeries := 0 @@ -595,7 +619,11 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() +<<<<<<< HEAD c := NewActiveSeries(&Matchers{}, 59*time.Second, "foo", "") +======= + c := NewActiveSeries(&Matchers{}, 59*time.Second, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) c.UpdateSeries(ls2, ref2, currentTime, -1) @@ -631,7 +659,7 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) valid := c.Purge(currentTime) assert.True(t, valid) @@ -697,7 +725,7 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -736,7 +764,7 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -789,7 +817,11 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. 
+<<<<<<< HEAD c = NewActiveSeries(&Matchers{}, 0, "foo", "") +======= + c = NewActiveSeries(&Matchers{}, 0, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -927,7 +959,7 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil) + c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) @@ -952,7 +984,11 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() +<<<<<<< HEAD c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") +======= + c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) +>>>>>>> 7e628c3508 (address comments) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 04a61275fb1..caa71fef859 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -310,7 +310,7 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService - costAttribution *util.CostAttributionCleanupService + costAttributionSvc *util.CostAttributionCleanupService tsdbMetrics *tsdbMetrics @@ -380,12 +380,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. -func New( - cfg Config, limits *validation.Overrides, - ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, - activeGroupsCleanupService *util.ActiveGroupsCleanupService, - costAttributionCleanupService *util.CostAttributionCleanupService, - registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionCleanupService *util.CostAttributionCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -393,7 +388,7 @@ func New( i.ingestionRate = util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval) i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService - i.costAttribution = costAttributionCleanupService + i.costAttributionSvc = costAttributionCleanupService // We create a circuit breaker, which will be activated on a successful completion of starting. 
i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -1170,7 +1165,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // successfully committed stats = pushStats{ - failedSamplesAttribution: make(map[string]int), + failedSamplesAttribution: make(map[string]int, i.limits.MaxCostAttributionPerUser(userID)), } firstPartialErr error @@ -1288,10 +1283,8 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats db.ingestedAPISamples.Add(int64(stats.succeededSamplesCount)) } } - if stats.failedSamplesAttribution != nil && len(stats.failedSamplesAttribution) > 0 { - for label, count := range stats.failedSamplesAttribution { - discarded.samplesPerAttribution.WithLabelValues(userID, label).Add(float64(count)) - } + for label, count := range stats.failedSamplesAttribution { + discarded.samplesPerAttribution.WithLabelValues(userID, label).Add(float64(count)) } } @@ -1302,12 +1295,19 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats *pushStats, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { // Return true if handled as soft error, and we can ingest more series. + // get the cost attribution value for the series + costLabel := i.limits.CostAttributionLabel(userID) handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { - - // get the cost attribution value for the series - costLabel := i.limits.CostAttributionLabel(userID) if costLabel != "" { - costAttrib := i.costAttribution.UpdateAttributionTimestamp(userID, validation.AttributionValue(costLabel, userID, labels), time.Now()) + costAttrib := "" + for _, label := range labels { + if label.Name == costLabel { + costAttrib = label.Value + } + } + // get the label value and update the timestamp; + // if the cardinality limit is reached or we are currently in the cooldown period, the function returns __unaccounted__ + costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend, i.limits.MaxCostAttributionPerUser(userID)) stats.failedSamplesAttribution[costAttrib]++ } @@ -1415,12 +1415,15 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels for _, ts := range timeseries { - // The cost attribution value for the series - costLabel := i.limits.CostAttributionLabel(userID) var costAttrib string // when cost attribution label is set if costLabel != "" { - costAttrib = i.costAttribution.UpdateAttributionTimestamp(userID, validation.AttributionValue(costLabel, userID, ts.Labels), time.Now()) + for _, label := range ts.Labels { + if label.Name == costLabel { + costAttrib = label.Value + } + } + costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend, i.limits.MaxCostAttributionPerUser(userID)) } // The labels must be sorted (in our case, it's guaranteed a write request @@ -2678,7 +2681,8 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD i.cfg.ActiveSeriesMetrics.IdleTimeout, userID, i.limits.CostAttributionLabel(userID), - i.costAttribution, + i.costAttributionSvc, + i.limits.MaxCostAttributionPerUser(userID), ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), 
ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index a2e3710300b..02d4f3afa7b 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -109,7 +109,6 @@ type Config struct { NoAuthTenant string `yaml:"no_auth_tenant" category:"advanced"` ShutdownDelay time.Duration `yaml:"shutdown_delay" category:"advanced"` MaxSeparateMetricsGroupsPerUser int `yaml:"max_separate_metrics_groups_per_user" category:"experimental"` - MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" category:"experimental"` CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` EnableGoRuntimeMetrics bool `yaml:"enable_go_runtime_metrics" category:"advanced"` PrintConfig bool `yaml:"-"` @@ -172,7 +171,6 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. After receiving SIGTERM, Mimir will report not-ready status via /ready endpoint.") f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 10*time.Minute, "Interval at which to evict inactive cost attributions.") f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") - f.IntVar(&c.MaxCostAttributionPerUser, "max-cost-attribution-per-user", 200, "Maximum number of cost attributions allowed per user.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 7cdb50d8c64..00821498c1a 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -650,7 +650,7 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { } func (t *Mimir) initCostAttributionService() (services.Service, error) { - t.CostAttributionCleanup = util.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, t.Cfg.MaxCostAttributionPerUser, util_log.Logger) + t.CostAttributionCleanup = util.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger) return t.CostAttributionCleanup, nil } diff --git a/pkg/util/cost_attribution.go b/pkg/util/cost_attribution.go index 04d2a2ec7b6..d8ab19c74de 100644 --- a/pkg/util/cost_attribution.go +++ b/pkg/util/cost_attribution.go @@ -4,6 +4,7 @@ package util import ( "context" + "fmt" "sync" "time" @@ -14,17 +15,15 @@ import ( ) type CostAttribution struct { - mu sync.RWMutex - timestampsPerUser map[string]map[string]*atomic.Int64 // map[user][group] -> timestamp - coolDownDeadline map[string]*atomic.Int64 - maxAttributionPerUser int + mu sync.RWMutex + timestampsPerUser map[string]map[string]*atomic.Int64 // map[user][group] -> timestamp + coolDownDeadline map[string]*atomic.Int64 } -func NewCostAttribution(maxAttributionPerUser int) *CostAttribution { +func NewCostAttribution() *CostAttribution { return &CostAttribution{ - timestampsPerUser: map[string]map[string]*atomic.Int64{}, - coolDownDeadline: map[string]*atomic.Int64{}, - maxAttributionPerUser: maxAttributionPerUser, + 
timestampsPerUser: map[string]map[string]*atomic.Int64{}, + coolDownDeadline: map[string]*atomic.Int64{}, } } @@ -110,7 +109,7 @@ } } -func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, now time.Time) bool { +func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, now time.Time, limit int) bool { // if we are still in the cooldown period, we consider the limit reached ca.mu.RLock() defer ca.mu.RUnlock() @@ -126,7 +125,7 @@ func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, } // if the user has reached the limit, we set a cooldown period of 20 minutes - maxReached := len(ca.timestampsPerUser[userID]) >= ca.maxAttributionPerUser + maxReached := len(ca.timestampsPerUser[userID]) >= limit if maxReached { ca.coolDownDeadline[userID].Store(time.Now().Add(20 * time.Minute).UnixNano()) return true @@ -141,32 +140,34 @@ type CostAttributionCleanupService struct { costAttribution *CostAttribution cleanupFuncs []func(userID, attribution string) inactiveTimeout time.Duration + invalidValue string } type CostAttributionMetricsCleaner interface { RemoveAttributionMetricsForUser(userID, attribution string) } -func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, maxAttributionPerUser int, logger log.Logger, cleanupFns ...func(string, string)) *CostAttributionCleanupService { +func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, cleanupFns ...func(string, string)) *CostAttributionCleanupService { s := &CostAttributionCleanupService{ - costAttribution: NewCostAttribution(maxAttributionPerUser), + costAttribution: NewCostAttribution(), cleanupFuncs: cleanupFns, inactiveTimeout: inactiveTimeout, logger: logger, + invalidValue: "__unaccounted__", } s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution cleanup") return s } -func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user, attribution string, now time.Time) string { +func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user, attribution string, now time.Time, limit int) string { // an empty label is not expected; if the user sets a cost attribution label, the metrics sent must include it if attribution == "" { - attribution = "other" - level.Error(s.logger).Log("msg", "set attribution label to \"other\" since missing cost attribution label in metrics") - } else if s.costAttribution.attributionLimitExceeded(user, attribution, now) { - attribution = "other" - level.Error(s.logger).Log("msg", "set attribution label to \"other\" since user has reached the limit of cost attribution labels") + attribution = s.invalidValue + level.Error(s.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since missing cost attribution label in metrics", s.invalidValue)) + } else if s.costAttribution.attributionLimitExceeded(user, attribution, now, limit) { + attribution = s.invalidValue + level.Error(s.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", s.invalidValue)) } s.costAttribution.UpdateAttributionTimestampForUser(user, attribution, now) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index d49b7d6b40d..30483c4d750 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -146,7 +146,8 @@ type Limits struct {
<<<<<<< HEAD ======= // User defined label to give the cost distribution by values of the label - CostAttributionLabel string `yaml:"cost_attribution_label" json:"cost_attribution_label" category:"experimental"` + CostAttributionLabel string `yaml:"cost_attribution_label" json:"cost_attribution_label" category:"experimental"` + MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" json:"max_cost_attribution_per_user" category:"experimental"` >>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Querier enforced limits. @@ -289,10 +290,15 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") <<<<<<< HEAD +<<<<<<< HEAD ======= f.StringVar(&l.CostAttributionLabel, "validation.cost-attribution-label", "", "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'cost' label with cost label's value. Currently applies to the following metrics: cortex_discarded_samples_total") >>>>>>> 3c422a8f57 (new service for tracking cost attribution) +======= + f.StringVar(&l.CostAttributionLabel, "validation.cost-attribution-label", "", "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated by adding an 'attrib' label with the cost attribution label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total.") + f.IntVar(&l.MaxCostAttributionPerUser, "validation.max-cost-attribution-per-user", 0, "The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution.") +>>>>>>> 7e628c3508 (address comments) f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable") @@ -780,6 +786,17 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string { return o.getOverridesForUser(userID).SeparateMetricsGroupLabel } +<<<<<<< HEAD +======= +func (o *Overrides) CostAttributionLabel(userID string) string { + return o.getOverridesForUser(userID).CostAttributionLabel +} + +func (o *Overrides) MaxCostAttributionPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionPerUser +} + +>>>>>>> 7e628c3508 (address comments) // IngestionTenantShardSize returns the ingesters shard size for a given user. func (o *Overrides) IngestionTenantShardSize(userID string) int { return o.getOverridesForUser(userID).IngestionTenantShardSize diff --git a/pkg/util/validation/separate_metrics.go b/pkg/util/validation/separate_metrics.go index 0f2384984b0..cc6194a42e4 100644 --- a/pkg/util/validation/separate_metrics.go +++ b/pkg/util/validation/separate_metrics.go @@ -30,17 +30,3 @@ func GroupLabel(o *Overrides, userID string, timeseries []mimirpb.PreallocTimese return "" } - -// AttributionLabel obtains the value of cost attribution label for tenant -func AttributionValue(attribLab string, userID string, lbs []mimirpb.LabelAdapter) string { - if len(lbs) == 0 { - return "" - } - - for _, label := range lbs { - if label.Name == attribLab { - return label.Value - } - } - return "" -} From 788216a8c0b4b7993d8cd4f28d5adde8baf7d8da Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 27 Sep 2024 14:27:40 +0200 Subject: [PATCH 04/13] Rebase on main and fix tests --- pkg/distributor/distributor.go | 7 +- .../activeseries/active_labels_test.go | 6 +- .../active_native_histogram_postings_test.go | 51 ++----------- .../activeseries/active_postings_test.go | 18 +---- pkg/ingester/activeseries/active_series.go | 71 +++++++++---------- .../activeseries/active_series_test.go | 54 +++----------- pkg/ingester/ingester.go | 12 ++-- pkg/mimir/mimir.go | 3 +- pkg/mimir/modules.go | 3 +- .../{ => costattribution}/cost_attribution.go | 21 +++--- pkg/util/validation/limits.go | 14 ---- 11 files changed, 79 insertions(+), 181 deletions(-) rename pkg/util/{ => costattribution}/cost_attribution.go (87%) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index de122b57816..05f8160d15d 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -53,6 +53,7 @@ import ( "github.com/grafana/mimir/pkg/querier/stats" "github.com/grafana/mimir/pkg/storage/ingest" "github.com/grafana/mimir/pkg/util" + "github.com/grafana/mimir/pkg/util/costattribution" "github.com/grafana/mimir/pkg/util/globalerror" mimir_limiter "github.com/grafana/mimir/pkg/util/limiter" util_math "github.com/grafana/mimir/pkg/util/math" @@ -105,7 +106,7 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - costAttributionSvc *util.CostAttributionCleanupService + costAttributionSvc *costattribution.CostAttributionCleanupService // For handling HA replicas. 
HATracker *haTracker @@ -306,7 +307,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionClenaupService *util.CostAttributionCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionClenaupService *costattribution.CostAttributionCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -1683,7 +1684,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) if costAttributionLabel != "" { - attribution := d.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel), now, costAttributionSize) + attribution := d.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel), now) costAttribution[attribution]++ } } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index 7580ebbc31c..dc919618a84 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,11 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} -<<<<<<< HEAD - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) memPostings := index.NewMemPostings() for i, l := range series { diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index f9d339e2dd5..e6cc823e502 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,16 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD -<<<<<<< HEAD - - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) ->>>>>>> 3c422a8f57 (new service for tracking cost attribution) -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", 
nil, 0) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -71,15 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD -<<<<<<< HEAD - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) ->>>>>>> 3c422a8f57 (new service for tracking cost attribution) -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -123,15 +106,7 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD -<<<<<<< HEAD - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) ->>>>>>> 3c422a8f57 (new service for tracking cost attribution) -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -171,15 +146,7 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD -<<<<<<< HEAD - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) ->>>>>>> 3c422a8f57 (new service for tracking cost attribution) -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -216,15 +183,7 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD -<<<<<<< HEAD - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil) ->>>>>>> 3c422a8f57 (new service for tracking cost attribution) -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) // Update each series at a different time according to its index. 
for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index a08a02aa18a..5c6d2711212 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,11 +26,7 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -62,11 +58,7 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -98,11 +90,7 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) -<<<<<<< HEAD - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "") -======= - activeSeries := NewActiveSeries(&Matchers{}, time.Duration(ttl), "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 925cb191a95..77fbd3770f3 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -11,12 +11,14 @@ import ( "sync" "time" - "github.com/grafana/mimir/pkg/util" + "github.com/grafana/mimir/pkg/util/costattribution" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" + + asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) const ( @@ -45,22 +47,22 @@ type ActiveSeries struct { // matchersMutex protects matchers and lastMatchersUpdate. matchersMutex sync.RWMutex - matchers *Matchers + matchers *asmodel.Matchers lastMatchersUpdate time.Time - costAttributionLabel string - costAttributionSvc *util.CostAttributionCleanupService - maxCostAttributionPerUser int + costAttributionLabel string + costAttributionSvc *costattribution.CostAttributionCleanupService // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. - timeout time.Duration - userID string + timeout time.Duration + userID string + maxCostAttributionPerUser int } // seriesStripe holds a subset of the series timestamps for a single tenant. 
type seriesStripe struct { - matchers *Matchers + matchers *asmodel.Matchers deleted *deletedSeries @@ -68,8 +70,7 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - costAttributionSvc *util.CostAttributionCleanupService - maxCostAttributionPerUser int + costAttributionSvc *costattribution.CostAttributionCleanupService mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -87,20 +88,20 @@ type seriesStripe struct { // seriesEntry holds a timestamp for single series. type seriesEntry struct { - nanos *atomic.Int64 // Unix timestamp in nanoseconds. Needs to be a pointer because we don't store pointers to entries in the stripe. - matches preAllocDynamicSlice // Index of the matcher matching - numNativeHistogramBuckets int // Number of buckets in native histogram series, -1 if not a native histogram. + nanos *atomic.Int64 // Unix timestamp in nanoseconds. Needs to be a pointer because we don't store pointers to entries in the stripe. + matches asmodel.PreAllocDynamicSlice // Index of the matcher matching + numNativeHistogramBuckets int // Number of buckets in native histogram series, -1 if not a native histogram. // keep the value corresponding the label configured in serieStripe deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. attributionValue string } func NewActiveSeries( - asm *Matchers, + asm *asmodel.Matchers, timeout time.Duration, userID string, costAttributionLabel string, - costAttributionSvc *util.CostAttributionCleanupService, + costAttributionSvc *costattribution.CostAttributionCleanupService, maxCostAttributionPerUser int, ) *ActiveSeries { c := &ActiveSeries{ @@ -112,7 +113,7 @@ func NewActiveSeries( // Stripes are pre-allocated so that we only read on them and no lock is required. 
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionLabel, costAttributionSvc, maxCostAttributionPerUser) + c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionLabel, costAttributionSvc) } return c @@ -124,18 +125,18 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { return c.matchers.MatcherNames() } -func (c *ActiveSeries) ReloadMatchers(asm *Matchers, now time.Time) { +func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { c.matchersMutex.Lock() defer c.matchersMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionLabel, c.costAttributionSvc, c.maxCostAttributionPerUser) + c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionLabel, c.costAttributionSvc) } c.matchers = asm c.lastMatchersUpdate = now } -func (c *ActiveSeries) CurrentConfig() CustomTrackersConfig { +func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { c.matchersMutex.RLock() defer c.matchersMutex.RUnlock() return c.matchers.Config() @@ -372,21 +373,21 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef entry, ok := s.refs[ref] if ok { if entry.numNativeHistogramBuckets != numNativeHistogramBuckets { - matches := s.matchers.matches(series) - matchesLen := matches.len() + matches := s.matchers.Matches(series) + matchesLen := matches.Len() if numNativeHistogramBuckets >= 0 && entry.numNativeHistogramBuckets >= 0 { // change number of buckets but still a histogram diff := numNativeHistogramBuckets - entry.numNativeHistogramBuckets s.activeNativeHistogramBuckets = uint32(int(s.activeNativeHistogramBuckets) + diff) for i := 0; i < matchesLen; i++ { - s.activeMatchingNativeHistogramBuckets[matches.get(i)] = uint32(int(s.activeMatchingNativeHistogramBuckets[matches.get(i)]) + diff) + s.activeMatchingNativeHistogramBuckets[matches.Get(i)] = uint32(int(s.activeMatchingNativeHistogramBuckets[matches.Get(i)]) + diff) } } else if numNativeHistogramBuckets >= 0 { // change from float to histogram s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(numNativeHistogramBuckets) for i := 0; i < matchesLen; i++ { - match := matches.get(i) + match := matches.Get(i) s.activeMatchingNativeHistograms[match]++ s.activeMatchingNativeHistogramBuckets[match] += uint32(numNativeHistogramBuckets) } @@ -395,7 +396,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) for i := 0; i < matchesLen; i++ { - match := matches.get(i) + match := matches.Get(i) s.activeMatchingNativeHistograms[match]-- s.activeMatchingNativeHistogramBuckets[match] -= uint32(entry.numNativeHistogramBuckets) } @@ -406,8 +407,8 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef return entry.nanos, false } - matches := s.matchers.matches(series) - matchesLen := matches.len() + matches := s.matchers.Matches(series) + matchesLen := matches.Len() s.active++ @@ -416,7 +417,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef s.activeNativeHistogramBuckets += uint32(numNativeHistogramBuckets) } for i := 0; i < matchesLen; i++ { - match := matches.get(i) + match := matches.Get(i) s.activeMatching[match]++ if numNativeHistogramBuckets >= 0 { s.activeMatchingNativeHistograms[match]++ @@ -433,7 +434,7 @@ func (s *seriesStripe) 
findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef // here if we have a cost attribution label, we can split the serie count based on the value of the label // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly if s.costAttributionLabel != "" { - attributionValue := s.costAttributionSvc.UpdateAttributionTimestamp(s.userID, series.Get(s.costAttributionLabel), time.Unix(0, nowNanos), s.maxCostAttributionPerUser) + attributionValue := s.costAttributionSvc.UpdateAttributionTimestamp(s.userID, series.Get(s.costAttributionLabel), time.Unix(0, nowNanos)) s.costAttributionValues[attributionValue]++ e.attributionValue = attributionValue } @@ -462,12 +463,11 @@ func (s *seriesStripe) clear() { // Reinitialize assigns new matchers and corresponding size activeMatching slices. func (s *seriesStripe) reinitialize( - asm *Matchers, + asm *asmodel.Matchers, deleted *deletedSeries, userID string, costAttributionLabel string, - costAttributionSvc *util.CostAttributionCleanupService, - maxCostAttributionPerUser int, + costAttributionSvc *costattribution.CostAttributionCleanupService, ) { s.mu.Lock() defer s.mu.Unlock() @@ -478,7 +478,6 @@ func (s *seriesStripe) reinitialize( s.costAttributionValues = map[string]uint32{} s.activeNativeHistograms = 0 s.activeNativeHistogramBuckets = 0 - s.maxCostAttributionPerUser = maxCostAttributionPerUser s.matchers = asm s.userID = userID s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) @@ -528,9 +527,9 @@ func (s *seriesStripe) purge(keepUntil time.Time) { if entry.attributionValue != "" { s.costAttributionValues[entry.attributionValue]++ } - ml := entry.matches.len() + ml := entry.matches.Len() for i := 0; i < ml; i++ { - match := entry.matches.get(i) + match := entry.matches.Get(i) s.activeMatching[match]++ if entry.numNativeHistogramBuckets >= 0 { s.activeMatchingNativeHistograms[match]++ @@ -573,9 +572,9 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) } - ml := entry.matches.len() + ml := entry.matches.Len() for i := 0; i < ml; i++ { - match := entry.matches.get(i) + match := entry.matches.Get(i) s.activeMatching[match]-- if entry.numNativeHistogramBuckets >= 0 { s.activeMatchingNativeHistograms[match]-- diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index 8d21d3d3c9d..0aa2b8a09e0 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -38,11 +38,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. 
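For reference, a minimal sketch (not part of the patch) of the same test setup with cost attribution actually switched on at this stage of the series; the "team" label, the durations, the tenant name and the overrides value are placeholder assumptions, while the constructor signatures follow the ones introduced elsewhere in this patch:

	// Build the cleanup service and an ActiveSeries tracker that attributes series by the "team" label,
	// capped at 200 distinct values for this tenant. "overrides" is an assumed *validation.Overrides.
	svc := costattribution.NewCostAttributionCleanupService(3*time.Minute, 20*time.Minute, log.NewNopLogger(), overrides)
	c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "user-1", "team", svc, 200)
	c.UpdateSeries(labels.FromStrings("team", "platform", "a", "1"), storage.SeriesRef(1), time.Now(), -1)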
-<<<<<<< HEAD - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") -======= - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) valid := c.Purge(time.Now()) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() @@ -207,11 +203,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) -<<<<<<< HEAD - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") -======= - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { @@ -237,12 +229,8 @@ func TestActiveSeries_ContainsRef(t *testing.T) { } func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { - asm := NewMatchers(mustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) -<<<<<<< HEAD - c := NewActiveSeries(asm, DefaultTimeout, "foo", "") -======= + asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) testUpdateSeries(t, c) } @@ -458,12 +446,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { } func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { - asm := NewMatchers(mustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) -<<<<<<< HEAD - c := NewActiveSeries(asm, DefaultTimeout, "foo", "") -======= + asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) testUpdateSeries(t, c) c.Clear() @@ -504,11 +488,7 @@ func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) -<<<<<<< HEAD - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") -======= - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) c.UpdateSeries(ls1, ref1, time.Now(), -1) c.UpdateSeries(ls2, ref2, time.Now(), -1) @@ -536,11 +516,7 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) -<<<<<<< HEAD - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") -======= - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) for i := 0; i < len(series); i++ { c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) @@ -619,11 +595,7 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() -<<<<<<< HEAD - c := NewActiveSeries(&Matchers{}, 59*time.Second, "foo", "") -======= - c := 
NewActiveSeries(&Matchers{}, 59*time.Second, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, "foo", "", nil, 0) c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) c.UpdateSeries(ls2, ref2, currentTime, -1) @@ -817,11 +789,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. -<<<<<<< HEAD - c = NewActiveSeries(&Matchers{}, 0, "foo", "") -======= - c = NewActiveSeries(&Matchers{}, 0, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + c = NewActiveSeries(&asmodel.Matchers{}, 0, "foo", "", nil, 0) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -984,11 +952,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() -<<<<<<< HEAD - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "") -======= - c := NewActiveSeries(&Matchers{}, DefaultTimeout, "foo", "", nil, 0) ->>>>>>> 7e628c3508 (address comments) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index caa71fef859..e697c6f1060 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -63,6 +63,7 @@ import ( "github.com/grafana/mimir/pkg/storage/tsdb/block" "github.com/grafana/mimir/pkg/usagestats" "github.com/grafana/mimir/pkg/util" + "github.com/grafana/mimir/pkg/util/costattribution" "github.com/grafana/mimir/pkg/util/globalerror" "github.com/grafana/mimir/pkg/util/limiter" util_log "github.com/grafana/mimir/pkg/util/log" @@ -310,7 +311,7 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService - costAttributionSvc *util.CostAttributionCleanupService + costAttributionSvc *costattribution.CostAttributionCleanupService tsdbMetrics *tsdbMetrics @@ -374,13 +375,12 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus forceCompactTrigger: make(chan requestWithUsersAndCallback), shipTrigger: make(chan requestWithUsersAndCallback), seriesHashCache: hashcache.NewSeriesHashCache(cfg.BlocksStorageConfig.TSDB.SeriesHashCacheMaxBytes), - - errorSamplers: newIngesterErrSamplers(cfg.ErrorSampleRate), + errorSamplers: newIngesterErrSamplers(cfg.ErrorSampleRate), }, nil } // New returns an Ingester that uses Mimir block storage. 
-func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionCleanupService *util.CostAttributionCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionCleanupService *costattribution.CostAttributionCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -1307,7 +1307,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } // get the label value and update the timestamp, // if the cordianlity is reached or we are currently in cooldown period, function would returned __unaccounted__ - costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend, i.limits.MaxCostAttributionPerUser(userID)) + costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend) stats.failedSamplesAttribution[costAttrib]++ } @@ -1423,7 +1423,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre costAttrib = label.Value } } - costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend, i.limits.MaxCostAttributionPerUser(userID)) + costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend) } // The labels must be sorted (in our case, it's guaranteed a write request diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 02d4f3afa7b..1443cb22705 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -74,6 +74,7 @@ import ( "github.com/grafana/mimir/pkg/usagestats" "github.com/grafana/mimir/pkg/util" "github.com/grafana/mimir/pkg/util/activitytracker" + "github.com/grafana/mimir/pkg/util/costattribution" util_log "github.com/grafana/mimir/pkg/util/log" "github.com/grafana/mimir/pkg/util/noauth" "github.com/grafana/mimir/pkg/util/process" @@ -712,7 +713,7 @@ type Mimir struct { TenantLimits validation.TenantLimits Overrides *validation.Overrides ActiveGroupsCleanup *util.ActiveGroupsCleanupService - CostAttributionCleanup *util.CostAttributionCleanupService + CostAttributionCleanup *costattribution.CostAttributionCleanupService Distributor *distributor.Distributor Ingester *ingester.Ingester diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 00821498c1a..35f0a8040f0 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -60,6 +60,7 @@ import ( "github.com/grafana/mimir/pkg/usagestats" "github.com/grafana/mimir/pkg/util" "github.com/grafana/mimir/pkg/util/activitytracker" + "github.com/grafana/mimir/pkg/util/costattribution" util_log "github.com/grafana/mimir/pkg/util/log" "github.com/grafana/mimir/pkg/util/validation" "github.com/grafana/mimir/pkg/util/validation/exporter" @@ -650,7 +651,7 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { } func (t *Mimir) initCostAttributionService() (services.Service, error) { - t.CostAttributionCleanup = util.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger) + t.CostAttributionCleanup = costattribution.NewCostAttributionCleanupService(3*time.Minute, 
t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides) return t.CostAttributionCleanup, nil } diff --git a/pkg/util/cost_attribution.go b/pkg/util/costattribution/cost_attribution.go similarity index 87% rename from pkg/util/cost_attribution.go rename to pkg/util/costattribution/cost_attribution.go index d8ab19c74de..6af2f39fefd 100644 --- a/pkg/util/cost_attribution.go +++ b/pkg/util/costattribution/cost_attribution.go @@ -1,6 +1,6 @@ // SPDX-License-Identifier: AGPL-3.0-only -package util +package costattribution import ( "context" @@ -11,6 +11,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/services" + "github.com/grafana/mimir/pkg/util/validation" "go.uber.org/atomic" ) @@ -18,12 +19,14 @@ type CostAttribution struct { mu sync.RWMutex timestampsPerUser map[string]map[string]*atomic.Int64 // map[user][group] -> timestamp coolDownDeadline map[string]*atomic.Int64 + limits *validation.Overrides } -func NewCostAttribution() *CostAttribution { +func NewCostAttribution(limits *validation.Overrides) *CostAttribution { return &CostAttribution{ timestampsPerUser: map[string]map[string]*atomic.Int64{}, coolDownDeadline: map[string]*atomic.Int64{}, + limits: limits, } } @@ -109,7 +112,7 @@ func (ca *CostAttribution) purgeInactiveAttributions(inactiveTimeout time.Durati } } -func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, now time.Time, limit int) bool { +func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, now time.Time) bool { // if we are still at the cooldown period, we will consider the limit reached ca.mu.RLock() defer ca.mu.RUnlock() @@ -125,7 +128,7 @@ func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, } // if the user has reached the limit, we will set the cooldown period which is 20 minutes - maxReached := len(ca.timestampsPerUser[userID]) >= limit + maxReached := len(ca.timestampsPerUser[userID]) >= ca.limits.MaxCostAttributionPerUser(userID) if maxReached { ca.coolDownDeadline[userID].Store(time.Now().Add(20 * time.Minute).UnixNano()) return true @@ -147,9 +150,9 @@ type CostAttributionMetricsCleaner interface { RemoveAttributionMetricsForUser(userID, attribution string) } -func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, cleanupFns ...func(string, string)) *CostAttributionCleanupService { +func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, cleanupFns ...func(string, string)) *CostAttributionCleanupService { s := &CostAttributionCleanupService{ - costAttribution: NewCostAttribution(), + costAttribution: NewCostAttribution(limits), cleanupFuncs: cleanupFns, inactiveTimeout: inactiveTimeout, logger: logger, @@ -160,14 +163,14 @@ func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Dura return s } -func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user, attribution string, now time.Time, limit int) string { +func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user, attribution string, now time.Time) string { // empty label is not normal, if user set attribution label, the metrics send has to include the label if attribution == "" { attribution = s.invalidValue level.Error(s.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since missing cost attribution label in metrics", s.invalidValue)) - } else if 
s.costAttribution.attributionLimitExceeded(user, attribution, now, limit) { + } else if s.costAttribution.attributionLimitExceeded(user, attribution, now) { attribution = s.invalidValue - level.Error(s.logger).Log("msg", "set attribution label to \"%s\" since user has reached the limit of cost attribution labels", s.invalidValue) + level.Error(s.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", s.invalidValue)) } s.costAttribution.UpdateAttributionTimestampForUser(user, attribution, now) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 30483c4d750..1d14d03baca 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -143,13 +143,10 @@ type Limits struct { // User defined label to give the option of subdividing specific metrics by another label SeparateMetricsGroupLabel string `yaml:"separate_metrics_group_label" json:"separate_metrics_group_label" category:"experimental"` -<<<<<<< HEAD -======= // User defined label to give the cost distribution by values of the label CostAttributionLabel string `yaml:"cost_attribution_label" json:"cost_attribution_label" category:"experimental"` MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" json:"max_cost_attribution_per_user" category:"experimental"` ->>>>>>> 3c422a8f57 (new service for tracking cost attribution) // Querier enforced limits. MaxChunksPerQuery int `yaml:"max_fetched_chunks_per_query" json:"max_fetched_chunks_per_query"` MaxEstimatedChunksPerQueryMultiplier float64 `yaml:"max_estimated_fetched_chunks_per_query_multiplier" json:"max_estimated_fetched_chunks_per_query_multiplier" category:"experimental"` @@ -289,16 +286,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&l.OutOfOrderBlocksExternalLabelEnabled, "ingester.out-of-order-blocks-external-label-enabled", false, "Whether the shipper should label out-of-order blocks with an external label before uploading them. Setting this label will compact out-of-order blocks separately from non-out-of-order blocks") f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") -<<<<<<< HEAD -<<<<<<< HEAD - -======= - f.StringVar(&l.CostAttributionLabel, "validation.cost-attribution-label", "", "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'cost' label with cost label's value. Currently applies to the following metrics: cortex_discarded_samples_total") ->>>>>>> 3c422a8f57 (new service for tracking cost attribution) -======= f.StringVar(&l.CostAttributionLabel, "validation.cost-attribution-label", "", "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. 
Specific distributor and ingester metrics will be further separated adding a 'attrib' cost attribution's label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total.") f.IntVar(&l.MaxCostAttributionPerUser, "validation.max-cost-attribution-per-user", 0, "The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution.") ->>>>>>> 7e628c3508 (address comments) f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") @@ -786,8 +775,6 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string { return o.getOverridesForUser(userID).SeparateMetricsGroupLabel } -<<<<<<< HEAD -======= func (o *Overrides) CostAttributionLabel(userID string) string { return o.getOverridesForUser(userID).CostAttributionLabel } @@ -796,7 +783,6 @@ func (o *Overrides) MaxCostAttributionPerUser(userID string) int { return o.getOverridesForUser(userID).MaxCostAttributionPerUser } ->>>>>>> 7e628c3508 (address comments) // IngestionTenantShardSize returns the ingesters shard size for a given user. 
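Taken together, these two overrides are what downstream components consult before doing any per-value tracking: an empty label disables attribution for the tenant, and once the number of distinct attribution values reaches the per-user maximum the service starts remapping new values to "__unaccounted__" for a cooldown window (20 minutes in the code above). A small helper of the kind a caller might use, shown here only as an illustration; both getters exist on validation.Overrides in this patch, the helper itself is hypothetical:

	// costAttributionEnabled reports whether cost attribution should be tracked for a tenant.
	func costAttributionEnabled(o *validation.Overrides, userID string) bool {
		return o.CostAttributionLabel(userID) != "" && o.MaxCostAttributionPerUser(userID) > 0
	}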
func (o *Overrides) IngestionTenantShardSize(userID string) int { return o.getOverridesForUser(userID).IngestionTenantShardSize From dd7e2a4fd478f80e5b83cc11aa12bcc8540faeae Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 2 Oct 2024 17:08:56 +0200 Subject: [PATCH 05/13] seperate registry for cost attribution metrics --- cmd/mimir/config-descriptor.json | 10 + pkg/distributor/distributor.go | 3 +- .../activeseries/active_labels_test.go | 2 +- .../active_native_histogram_postings_test.go | 10 +- .../activeseries/active_postings_test.go | 6 +- pkg/ingester/activeseries/active_series.go | 27 +-- .../activeseries/active_series_test.go | 28 +-- pkg/ingester/ingester.go | 17 +- pkg/mimir/mimir.go | 3 +- pkg/mimir/modules.go | 19 +- pkg/util/costattribution/cost_attribution.go | 219 +++++++++++++----- 11 files changed, 218 insertions(+), 126 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index a8222277d87..79bd4bb05af 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -18161,6 +18161,16 @@ "fieldValue": null, "fieldDefaultValue": null }, + { + "kind": "field", + "name": "custom_registry_path", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, { "kind": "field", "name": "timeseries_unmarshal_caching_optimization_enabled", diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 05f8160d15d..c14b5a35306 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -358,7 +358,6 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove Name: "cortex_distributor_received_samples_total", Help: "The total number of received samples, excluding rejected and deduped samples.", }, []string{"user", "attrib"}), - receivedExemplars: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_distributor_received_exemplars_total", Help: "The total number of received exemplars, excluding rejected and deduped exemplars.", @@ -1684,7 +1683,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) if costAttributionLabel != "" { - attribution := d.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels).Get(costAttributionLabel), now) + attribution := d.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), now) costAttribution[attribution]++ } } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index dc919618a84..266fb0e7079 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) memPostings := index.NewMemPostings() for i, l := range series { diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index e6cc823e502..56ebd0b1d52 100644 --- 
a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,7 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -62,7 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -106,7 +106,7 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -146,7 +146,7 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -183,7 +183,7 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index 5c6d2711212..2a6cf812aa4 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,7 +26,7 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. 
for i := range allStorageRefs { @@ -58,7 +58,7 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -90,7 +90,7 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", "", nil, 0) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), "foo", nil) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 77fbd3770f3..7cc2d39554b 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -50,14 +50,12 @@ type ActiveSeries struct { matchers *asmodel.Matchers lastMatchersUpdate time.Time - costAttributionLabel string - costAttributionSvc *costattribution.CostAttributionCleanupService + costAttributionSvc *costattribution.CostAttributionCleanupService // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. - timeout time.Duration - userID string - maxCostAttributionPerUser int + timeout time.Duration + userID string } // seriesStripe holds a subset of the series timestamps for a single tenant. @@ -80,7 +78,6 @@ type seriesStripe struct { activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. userID string - costAttributionLabel string // here the attribution values map, it maps the attribute value to its index, so we can increment the counter directly, // so in each entry, we keep the index of the value only, instead of keeping the string value costAttributionValues map[string]uint32 @@ -100,20 +97,16 @@ func NewActiveSeries( asm *asmodel.Matchers, timeout time.Duration, userID string, - costAttributionLabel string, costAttributionSvc *costattribution.CostAttributionCleanupService, - maxCostAttributionPerUser int, ) *ActiveSeries { c := &ActiveSeries{ matchers: asm, timeout: timeout, userID: userID, - costAttributionLabel: costAttributionLabel, - costAttributionSvc: costAttributionSvc, - maxCostAttributionPerUser: maxCostAttributionPerUser, + costAttributionSvc: costAttributionSvc, } // Stripes are pre-allocated so that we only read on them and no lock is required. 
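With the service owning the metrics in this revision, per-stripe counts can be read back through ActiveByAttributionValue and fed to the service's active-series gauge. A hedged sketch of such an export step, not shown in the patch itself; "as", "svc" and "userID" stand for the already-wired tracker, cleanup service and tenant:

	// Export per-attribution active series counts through the cleanup service, which updates
	// cortex_ingester_active_series_attribution for each tenant/value pair.
	for attributionValue, count := range as.ActiveByAttributionValue() {
		svc.SetActiveSeries(userID, attributionValue, float64(count))
	}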
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionLabel, costAttributionSvc) + c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionSvc) } return c @@ -130,7 +123,7 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { defer c.matchersMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionLabel, c.costAttributionSvc) + c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionSvc) } c.matchers = asm c.lastMatchersUpdate = now @@ -237,7 +230,7 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot } func (c *ActiveSeries) ActiveByAttributionValue() map[string]uint32 { - total := make(map[string]uint32, c.maxCostAttributionPerUser) + total := make(map[string]uint32, c.costAttributionSvc.GetUserAttributionLimit(c.userID)) for s := 0; s < numStripes; s++ { c.stripes[s].mu.RLock() for k, v := range c.stripes[s].costAttributionValues { @@ -433,8 +426,8 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef // here if we have a cost attribution label, we can split the serie count based on the value of the label // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly - if s.costAttributionLabel != "" { - attributionValue := s.costAttributionSvc.UpdateAttributionTimestamp(s.userID, series.Get(s.costAttributionLabel), time.Unix(0, nowNanos)) + if s.costAttributionSvc != nil && s.costAttributionSvc.GetUserAttributionLabel(s.userID) != "" { + attributionValue := s.costAttributionSvc.UpdateAttributionTimestamp(s.userID, series, time.Unix(0, nowNanos)) s.costAttributionValues[attributionValue]++ e.attributionValue = attributionValue } @@ -466,7 +459,6 @@ func (s *seriesStripe) reinitialize( asm *asmodel.Matchers, deleted *deletedSeries, userID string, - costAttributionLabel string, costAttributionSvc *costattribution.CostAttributionCleanupService, ) { s.mu.Lock() @@ -483,7 +475,6 @@ func (s *seriesStripe) reinitialize( s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) - s.costAttributionLabel = costAttributionLabel s.costAttributionSvc = costAttributionSvc } diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index 0aa2b8a09e0..d7a9f4cfb93 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -38,7 +38,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. 
- c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", nil) valid := c.Purge(time.Now()) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() @@ -203,7 +203,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", nil) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { @@ -230,7 +230,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(asm, DefaultTimeout, "foo", nil) testUpdateSeries(t, c) } @@ -447,7 +447,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(asm, DefaultTimeout, "foo", nil) testUpdateSeries(t, c) c.Clear() @@ -488,7 +488,7 @@ func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", nil) c.UpdateSeries(ls1, ref1, time.Now(), -1) c.UpdateSeries(ls2, ref2, time.Now(), -1) @@ -516,7 +516,7 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", nil) for i := 0; i < len(series); i++ { c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) @@ -562,7 +562,7 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute, "foo", "", nil, 0) + c := NewActiveSeries(asm, 5*time.Minute, "foo", nil) exp := len(series) - ttl expMatchingSeries := 0 @@ -595,7 +595,7 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, "foo", "", nil, 0) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, "foo", nil) c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) c.UpdateSeries(ls2, ref2, currentTime, -1) @@ -631,7 +631,7 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(asm, DefaultTimeout, "foo", nil) valid := 
c.Purge(currentTime) assert.True(t, valid) @@ -697,7 +697,7 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(asm, DefaultTimeout, "foo", nil) valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -736,7 +736,7 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(asm, DefaultTimeout, "foo", nil) valid := c.Purge(currentTime) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -789,7 +789,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. - c = NewActiveSeries(&asmodel.Matchers{}, 0, "foo", "", nil, 0) + c = NewActiveSeries(&asmodel.Matchers{}, 0, "foo", nil) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -927,7 +927,7 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(asm, DefaultTimeout, "foo", nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) @@ -952,7 +952,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", "", nil, 0) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, "foo", nil) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index e697c6f1060..008cba2ca84 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1299,15 +1299,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre costLabel := i.limits.CostAttributionLabel(userID) handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { if costLabel != "" { - costAttrib := "" - for _, label := range labels { - if label.Name == costLabel { - costAttrib = label.Value - } - } // get the label value and update the timestamp, // if the cordianlity is reached or we are currently in cooldown period, function would returned __unaccounted__ - costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend) + costAttrib := i.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(labels), startAppend) stats.failedSamplesAttribution[costAttrib]++ } @@ -1418,12 +1412,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var costAttrib string // when cost attribution label is set if costLabel != "" { - for _, label := range ts.Labels { - if label.Name == costLabel { - costAttrib = label.Value - } - } - costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, costAttrib, startAppend) + costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), startAppend) } // The labels must be sorted (in our case, it's guaranteed a write request @@ -2680,9 +2669,7 @@ func (i *Ingester) 
createTSDB(userID string, walReplayConcurrency int) (*userTSD asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, userID, - i.limits.CostAttributionLabel(userID), i.costAttributionSvc, - i.limits.MaxCostAttributionPerUser(userID), ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 1443cb22705..3efd678b1a8 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -145,7 +145,8 @@ type Config struct { ContinuousTest continuoustest.Config `yaml:"-"` OverridesExporter exporter.Config `yaml:"overrides_exporter"` - Common CommonConfig `yaml:"common"` + Common CommonConfig `yaml:"common"` + CustomRegistryPath string `yaml:"custom_registry_path" category:"advanced"` TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` } diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 35f0a8040f0..8da436becf1 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -66,6 +66,7 @@ import ( "github.com/grafana/mimir/pkg/util/validation/exporter" "github.com/grafana/mimir/pkg/util/version" "github.com/grafana/mimir/pkg/vault" + "github.com/prometheus/client_golang/prometheus/promhttp" ) // The various modules that make up Mimir. @@ -473,10 +474,6 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.ActiveGroupsCleanup.Register(t.Distributor) } - if t.CostAttributionCleanup != nil { - t.CostAttributionCleanup.Register(t.Distributor) - } - return t.Distributor, nil } @@ -651,7 +648,16 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { } func (t *Mimir) initCostAttributionService() (services.Service, error) { - t.CostAttributionCleanup = costattribution.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides) + if t.Cfg.CustomRegistryPath != "" { + customRegistry := prometheus.NewRegistry() + // Register the custom registry with the provided URL. + // This allows users to expose custom metrics on a separate endpoint. + // This is useful when users want to expose metrics that are not part of the default Mimir metrics. 
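As an aside, the registry-plus-handler pattern used here boils down to the following self-contained sketch; the path, label values and gauge value are made up for illustration, while promhttp.HandlerFor and the cortex_ingester_active_series_attribution metric name come from this patch:

	// Serve a dedicated registry so cost attribution series don't inflate the main /metrics endpoint.
	reg := prometheus.NewRegistry()
	activeSeries := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Name: "cortex_ingester_active_series_attribution",
		Help: "The total number of active series per user and attribution.",
	}, []string{"user", "team"})
	activeSeries.WithLabelValues("user-1", "platform").Set(42)
	http.Handle("/cost-attribution-metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))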
+ http.Handle(t.Cfg.CustomRegistryPath, promhttp.HandlerFor(customRegistry, promhttp.HandlerOpts{Registry: customRegistry})) + t.CostAttributionCleanup = costattribution.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, customRegistry) + return t.CostAttributionCleanup, nil + } + t.CostAttributionCleanup = costattribution.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, t.Registerer) return t.CostAttributionCleanup, nil } @@ -675,9 +681,6 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.ActiveGroupsCleanup.Register(t.Ingester) } - if t.CostAttributionCleanup != nil { - t.CostAttributionCleanup.Register(t.Ingester) - } return t.Ingester, nil } diff --git a/pkg/util/costattribution/cost_attribution.go b/pkg/util/costattribution/cost_attribution.go index 6af2f39fefd..d972fdc7af2 100644 --- a/pkg/util/costattribution/cost_attribution.go +++ b/pkg/util/costattribution/cost_attribution.go @@ -12,77 +12,119 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/services" "github.com/grafana/mimir/pkg/util/validation" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/prometheus/model/labels" "go.uber.org/atomic" ) +type Tracker struct { + trackedLabel string + activeSeriesPerUserAttribution *prometheus.GaugeVec + receivedSamplesAttribution *prometheus.CounterVec + discardedSampleAttribution *prometheus.CounterVec + attributionTimestamps map[string]*atomic.Int64 + coolDownDeadline *atomic.Int64 +} + +func (m *Tracker) RemoveAttributionMetricsForUser(userID, attribution string) { + m.activeSeriesPerUserAttribution.DeleteLabelValues(userID, attribution) + m.receivedSamplesAttribution.DeleteLabelValues(userID, attribution) + m.discardedSampleAttribution.DeleteLabelValues(userID, attribution) +} + +func NewCostAttributionTracker(reg prometheus.Registerer, trackedLabel string) *Tracker { + m := &Tracker{ + trackedLabel: trackedLabel, + attributionTimestamps: map[string]*atomic.Int64{}, + coolDownDeadline: atomic.NewInt64(0), + discardedSampleAttribution: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_discarded_samples_attribution_total", + Help: "The total number of samples that were discarded per attribution.", + }, []string{"user", trackedLabel}), + receivedSamplesAttribution: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_received_samples_attribution_total", + Help: "The total number of samples that were received per attribution.", + }, []string{"user", trackedLabel}), + activeSeriesPerUserAttribution: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_ingester_active_series_attribution", + Help: "The total number of active series per user and attribution.", + }, []string{"user", trackedLabel}), + } + return m +} + type CostAttribution struct { - mu sync.RWMutex - timestampsPerUser map[string]map[string]*atomic.Int64 // map[user][group] -> timestamp - coolDownDeadline map[string]*atomic.Int64 - limits *validation.Overrides + mu sync.RWMutex + trackers map[string]*Tracker + limits *validation.Overrides + reg prometheus.Registerer } -func NewCostAttribution(limits *validation.Overrides) *CostAttribution { +func NewCostAttribution(limits *validation.Overrides, reg prometheus.Registerer) *CostAttribution { return &CostAttribution{ - timestampsPerUser: 
map[string]map[string]*atomic.Int64{}, - coolDownDeadline: map[string]*atomic.Int64{}, - limits: limits, + trackers: make(map[string]*Tracker), + limits: limits, + reg: reg, + mu: sync.RWMutex{}, } } // UpdateAttributionTimestampForUser function is only guaranteed to update to the // timestamp provided even if it is smaller than the existing value -func (ag *CostAttribution) UpdateAttributionTimestampForUser(userID, attribution string, now time.Time) { - ts := now.UnixNano() - ag.mu.RLock() - if groupTs := ag.timestampsPerUser[userID][attribution]; groupTs != nil { - ag.mu.RUnlock() - groupTs.Store(ts) +func (ca *CostAttribution) UpdateAttributionTimestampForUser(userID, attribution string, now time.Time) { + // If the limit is set to 0, we don't need to track the attribution + if ca.limits.MaxCostAttributionPerUser(userID) <= 0 { return } - ag.mu.RUnlock() - ag.mu.Lock() - defer ag.mu.Unlock() - - if ag.timestampsPerUser[userID] == nil { - ag.timestampsPerUser[userID] = map[string]*atomic.Int64{attribution: atomic.NewInt64(ts)} - return + ts := now.UnixNano() + ca.mu.Lock() + // create new tracker if not exists + if _, exists := ca.trackers[userID]; !exists { + // the attribution label and values should be managed by cache + ca.trackers[userID] = NewCostAttributionTracker(ca.reg, ca.limits.CostAttributionLabel(userID)) } - - if groupTs := ag.timestampsPerUser[userID][attribution]; groupTs != nil { + ca.mu.Unlock() + ca.mu.RLock() + if groupTs := ca.trackers[userID].attributionTimestamps[attribution]; groupTs != nil { groupTs.Store(ts) return } - - ag.timestampsPerUser[userID][attribution] = atomic.NewInt64(ts) + ca.mu.RUnlock() + ca.mu.Lock() + defer ca.mu.Unlock() + ca.trackers[userID].attributionTimestamps[attribution] = atomic.NewInt64(ts) } -func (ag *CostAttribution) purgeInactiveAttributionsForUser(userID string, deadline int64) []string { - ag.mu.RLock() +func (ca *CostAttribution) purgeInactiveAttributionsForUser(userID string, deadline int64) []string { + ca.mu.RLock() var inactiveAttributions []string - attributionTimestamps := ag.timestampsPerUser[userID] + if ca.trackers[userID] == nil || ca.trackers[userID].attributionTimestamps == nil { + return nil + } + attributionTimestamps := ca.trackers[userID].attributionTimestamps for attr, ts := range attributionTimestamps { if ts.Load() <= deadline { inactiveAttributions = append(inactiveAttributions, attr) } } - ag.mu.RUnlock() + ca.mu.RUnlock() if len(inactiveAttributions) == 0 { return nil } // Cleanup inactive groups - ag.mu.Lock() - defer ag.mu.Unlock() + ca.mu.Lock() + defer ca.mu.Unlock() for i := 0; i < len(inactiveAttributions); { inactiveAttribution := inactiveAttributions[i] - groupTs := ag.timestampsPerUser[userID][inactiveAttribution] + groupTs := ca.trackers[userID].attributionTimestamps[inactiveAttribution] if groupTs != nil && groupTs.Load() <= deadline { - delete(ag.timestampsPerUser[userID], inactiveAttribution) + delete(ca.trackers[userID].attributionTimestamps, inactiveAttribution) i++ } else { inactiveAttributions[i] = inactiveAttributions[len(inactiveAttributions)-1] @@ -93,10 +135,10 @@ func (ag *CostAttribution) purgeInactiveAttributionsForUser(userID string, deadl return inactiveAttributions } -func (ca *CostAttribution) purgeInactiveAttributions(inactiveTimeout time.Duration, cleanupFuncs ...func(string, string)) { +func (ca *CostAttribution) purgeInactiveAttributions(inactiveTimeout time.Duration) { ca.mu.RLock() - userIDs := make([]string, 0, len(ca.timestampsPerUser)) - for userID := range 
ca.timestampsPerUser { + userIDs := make([]string, 0, len(ca.trackers)) + for userID := range ca.trackers { userIDs = append(userIDs, userID) } ca.mu.RUnlock() @@ -105,32 +147,37 @@ func (ca *CostAttribution) purgeInactiveAttributions(inactiveTimeout time.Durati for _, userID := range userIDs { inactiveAttributions := ca.purgeInactiveAttributionsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano()) for _, attribution := range inactiveAttributions { - for _, cleanupFn := range cleanupFuncs { - cleanupFn(userID, attribution) - } + ca.trackers[userID].RemoveAttributionMetricsForUser(userID, attribution) } } } -func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string, now time.Time) bool { +func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string) bool { // if we are still at the cooldown period, we will consider the limit reached ca.mu.RLock() defer ca.mu.RUnlock() + // if the user is not exist, we don't need to check the limit + if ca.trackers[userID] == nil { + return false + } - if v, exists := ca.coolDownDeadline[userID]; exists && v.Load() > now.UnixNano() { + now := time.Now() + if v := ca.trackers[userID].coolDownDeadline; v != nil && v.Load() > now.UnixNano() { return true } // if the user attribution is already exist and we are not in the cooldown period, we don't need to check the limit - _, exists := ca.timestampsPerUser[userID][attribution] + _, exists := ca.trackers[userID].attributionTimestamps[attribution] if exists { return false } // if the user has reached the limit, we will set the cooldown period which is 20 minutes - maxReached := len(ca.timestampsPerUser[userID]) >= ca.limits.MaxCostAttributionPerUser(userID) + maxReached := len(ca.trackers[userID].attributionTimestamps) >= ca.limits.MaxCostAttributionPerUser(userID) if maxReached { - ca.coolDownDeadline[userID].Store(time.Now().Add(20 * time.Minute).UnixNano()) + ca.mu.Lock() + ca.trackers[userID].coolDownDeadline.Store(now.Add(20 * time.Minute).UnixNano()) + ca.mu.Unlock() return true } @@ -141,7 +188,6 @@ type CostAttributionCleanupService struct { services.Service logger log.Logger costAttribution *CostAttribution - cleanupFuncs []func(userID, attribution string) inactiveTimeout time.Duration invalidValue string } @@ -150,10 +196,9 @@ type CostAttributionMetricsCleaner interface { RemoveAttributionMetricsForUser(userID, attribution string) } -func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, cleanupFns ...func(string, string)) *CostAttributionCleanupService { +func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg prometheus.Registerer) *CostAttributionCleanupService { s := &CostAttributionCleanupService{ - costAttribution: NewCostAttribution(limits), - cleanupFuncs: cleanupFns, + costAttribution: NewCostAttribution(limits, reg), inactiveTimeout: inactiveTimeout, logger: logger, invalidValue: "__unaccounted__", @@ -163,12 +208,74 @@ func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Dura return s } -func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user, attribution string, now time.Time) string { +// IncrementReceivedSamples increments the received samples counter for a given user and attribution +func (s *CostAttributionCleanupService) IncrementReceivedSamples(userID, attribution string, value float64) { + attribution = s.GetUserAttribution(userID, 
attribution) + s.costAttribution.mu.RLock() + defer s.costAttribution.mu.RUnlock() + if tracker, exists := s.costAttribution.trackers[userID]; exists { + tracker.receivedSamplesAttribution.WithLabelValues(userID, attribution).Add(value) + } +} + +// IncrementDiscardedSamples increments the discarded samples counter for a given user and attribution +func (s *CostAttributionCleanupService) IncrementDiscardedSamples(userID, attribution string, value float64) { + attribution = s.GetUserAttribution(userID, attribution) + s.costAttribution.mu.RLock() + defer s.costAttribution.mu.RUnlock() + if tracker, exists := s.costAttribution.trackers[userID]; exists { + tracker.discardedSampleAttribution.WithLabelValues(userID, attribution).Add(value) + } +} + +// SetActiveSeries sets the active series gauge for a given user and attribution +func (s *CostAttributionCleanupService) SetActiveSeries(userID, attribution string, value float64) { + attribution = s.GetUserAttribution(userID, attribution) + s.costAttribution.mu.RLock() + defer s.costAttribution.mu.RUnlock() + if tracker, exists := s.costAttribution.trackers[userID]; exists { + tracker.activeSeriesPerUserAttribution.WithLabelValues(userID, attribution).Set(value) + } +} + +func (s *CostAttributionCleanupService) GetUserAttribution(userID, attribution string) string { + // not tracking cost attribution for this user, this shouldn't happen + if s.costAttribution.limits.MaxCostAttributionPerUser(userID) <= 0 { + return attribution + } + if s.costAttribution.attributionLimitExceeded(userID, attribution) { + return s.invalidValue + } + return attribution +} + +func (s *CostAttributionCleanupService) GetUserAttributionLabel(userID string) string { + s.costAttribution.mu.RLock() + defer s.costAttribution.mu.RUnlock() + if s.costAttribution != nil { + if val, exists := s.costAttribution.trackers[userID]; exists { + return val.trackedLabel + } + } + return "" +} + +func (s *CostAttributionCleanupService) GetUserAttributionLimit(userID string) int { + return s.costAttribution.limits.MaxCostAttributionPerUser(userID) +} + +func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user string, lbs labels.Labels, now time.Time) string { + if s.costAttribution.trackers[user] == nil || s.costAttribution.trackers[user].trackedLabel == "" { + return "" + } + attribution := lbs.Get(s.costAttribution.trackers[user].trackedLabel) // empty label is not normal, if user set attribution label, the metrics send has to include the label if attribution == "" { - attribution = s.invalidValue - level.Error(s.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since missing cost attribution label in metrics", s.invalidValue)) - } else if s.costAttribution.attributionLimitExceeded(user, attribution, now) { + level.Error(s.logger).Log("msg", "set attribution label to \"\" since missing cost attribution label in metrics") + return attribution + } + + if s.costAttribution.attributionLimitExceeded(user, attribution) { attribution = s.invalidValue level.Error(s.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", s.invalidValue)) } @@ -178,12 +285,6 @@ func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user, attribu } func (s *CostAttributionCleanupService) iteration(_ context.Context) error { - s.costAttribution.purgeInactiveAttributions(s.inactiveTimeout, s.cleanupFuncs...) 
+ s.costAttribution.purgeInactiveAttributions(s.inactiveTimeout) return nil } - -// Register registers the cleanup function from metricsCleaner to be called during each cleanup iteration. -// This function is NOT thread safe -func (s *CostAttributionCleanupService) Register(metricsCleaner CostAttributionMetricsCleaner) { - s.cleanupFuncs = append(s.cleanupFuncs, metricsCleaner.RemoveAttributionMetricsForUser) -} From 046046f098a036d416c2c99835d43483131da483 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 3 Oct 2024 23:59:56 +0200 Subject: [PATCH 06/13] remove changes to metrics and adding extra --- pkg/distributor/distributor.go | 27 +++-- .../distributor_ingest_storage_test.go | 2 +- pkg/distributor/distributor_test.go | 12 +-- pkg/ingester/ingester.go | 18 ++-- pkg/ingester/ingester_ingest_storage_test.go | 2 +- pkg/ingester/ingester_test.go | 98 +++++++++---------- pkg/ingester/metrics.go | 13 +-- pkg/util/costattribution/cost_attribution.go | 4 + 8 files changed, 82 insertions(+), 94 deletions(-) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index c14b5a35306..d127bfe00d1 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -357,7 +357,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove receivedSamples: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_distributor_received_samples_total", Help: "The total number of received samples, excluding rejected and deduped samples.", - }, []string{"user", "attrib"}), + }, []string{"user"}), receivedExemplars: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_distributor_received_exemplars_total", Help: "The total number of received exemplars, excluding rejected and deduped exemplars.", @@ -645,6 +645,7 @@ func (d *Distributor) cleanupInactiveUser(userID string) { d.HATracker.cleanupHATrackerMetricsForUser(userID) d.receivedRequests.DeleteLabelValues(userID) + d.receivedSamples.DeleteLabelValues(userID) d.receivedExemplars.DeleteLabelValues(userID) d.receivedMetadata.DeleteLabelValues(userID) d.incomingRequests.DeleteLabelValues(userID) @@ -661,7 +662,6 @@ func (d *Distributor) cleanupInactiveUser(userID string) { filter := prometheus.Labels{"user": userID} d.dedupedSamples.DeletePartialMatch(filter) - d.receivedSamples.DeletePartialMatch(filter) d.discardedSamplesTooManyHaClusters.DeletePartialMatch(filter) d.discardedSamplesRateLimited.DeletePartialMatch(filter) d.discardedRequestsRateLimited.DeleteLabelValues(userID) @@ -680,11 +680,6 @@ func (d *Distributor) RemoveGroupMetricsForUser(userID, group string) { d.sampleValidationMetrics.deleteUserMetricsForGroup(userID, group) } -func (d *Distributor) RemoveAttributionMetricsForUser(userID, attribution string) { - d.receivedSamples.DeleteLabelValues(userID, attribution) - //TODO @ying: Remove attribution metrics -} - // Called after distributor is asked to stop via StopAsync. 
func (d *Distributor) stopping(_ error) error { return services.StopManagerAndAwaitStopped(context.Background(), d.subservices) @@ -1441,7 +1436,7 @@ func (d *Distributor) push(ctx context.Context, pushReq *Request) error { now := mtime.Now() - d.updateReceivedMetrics(req, userID, d.limits.CostAttributionLabel(userID), now) + d.updateReceivedMetrics(req, userID, now) if len(req.Timeseries) == 0 && len(req.Metadata) == 0 { return nil @@ -1672,28 +1667,30 @@ func tokenForMetadata(userID string, metricName string) uint32 { return mimirpb.ShardByMetricName(userID, metricName) } -func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string, costAttributionLabel string, now time.Time) { +func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string, now time.Time) { var receivedSamples, receivedExemplars, receivedMetadata int costAttributionSize := 0 - if costAttributionLabel != "" { - costAttributionSize = d.limits.MaxCostAttributionPerUser(userID) + caEnabled := d.costAttributionSvc != nil && d.costAttributionSvc.EnabledForUser(userID) + if caEnabled { + costAttributionSize = d.costAttributionSvc.GetUserAttributionLimit(userID) } costAttribution := make(map[string]int, costAttributionSize) + for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - if costAttributionLabel != "" { + if caEnabled { attribution := d.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), now) costAttribution[attribution]++ } } receivedMetadata = len(req.Metadata) - if costAttributionLabel != "" { + if caEnabled { for lv, count := range costAttribution { - d.receivedSamples.WithLabelValues(userID, lv).Add(float64(count)) + d.costAttributionSvc.IncrementReceivedSamples(userID, lv, float64(count)) } } else { - d.receivedSamples.WithLabelValues(userID, "").Add(float64(receivedSamples)) + d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) } d.receivedExemplars.WithLabelValues(userID).Add(float64(receivedExemplars)) d.receivedMetadata.WithLabelValues(userID).Add(float64(receivedMetadata)) diff --git a/pkg/distributor/distributor_ingest_storage_test.go b/pkg/distributor/distributor_ingest_storage_test.go index 091d6f0225d..77b22cc9967 100644 --- a/pkg/distributor/distributor_ingest_storage_test.go +++ b/pkg/distributor/distributor_ingest_storage_test.go @@ -209,7 +209,7 @@ func TestDistributor_Push_ShouldSupportIngestStorage(t *testing.T) { # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{attrib="", user="user"} 5 + cortex_distributor_received_samples_total{ user="user"} 5 # HELP cortex_distributor_metadata_in_total The total number of metadata the have come in to the distributor, including rejected. 
# TYPE cortex_distributor_metadata_in_total counter diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index a1ce70b17fb..a7159d91154 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -363,8 +363,8 @@ func TestDistributor_MetricsCleanup(t *testing.T) { "cortex_distributor_label_values_with_newlines_total", } - d.receivedSamples.WithLabelValues("userA", "").Add(5) - d.receivedSamples.WithLabelValues("userB", "").Add(10) + d.receivedSamples.WithLabelValues("userA").Add(5) + d.receivedSamples.WithLabelValues("userB").Add(10) d.receivedExemplars.WithLabelValues("userA").Add(5) d.receivedExemplars.WithLabelValues("userB").Add(10) d.receivedMetadata.WithLabelValues("userA").Add(5) @@ -401,8 +401,8 @@ func TestDistributor_MetricsCleanup(t *testing.T) { # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{attrib="",user="userA"} 5 - cortex_distributor_received_samples_total{attrib="",user="userB"} 10 + cortex_distributor_received_samples_total{user="userA"} 5 + cortex_distributor_received_samples_total{user="userB"} 10 # HELP cortex_distributor_received_exemplars_total The total number of received exemplars, excluding rejected and deduped exemplars. # TYPE cortex_distributor_received_exemplars_total counter @@ -443,7 +443,7 @@ func TestDistributor_MetricsCleanup(t *testing.T) { # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{attrib="",user="userB"} 10 + cortex_distributor_received_samples_total{user="userB"} 10 # HELP cortex_distributor_received_exemplars_total The total number of received exemplars, excluding rejected and deduped exemplars. # TYPE cortex_distributor_received_exemplars_total counter @@ -6852,7 +6852,7 @@ func TestDistributor_MetricsWithRequestModifications(t *testing.T) { cortex_distributor_received_requests_total{user="%s"} %d # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{attrib="",user="%s"} %d + cortex_distributor_received_samples_total{user="%s"} %d # HELP cortex_distributor_received_exemplars_total The total number of received exemplars, excluding rejected and deduped exemplars. 
# TYPE cortex_distributor_received_exemplars_total counter cortex_distributor_received_exemplars_total{user="%s"} %d diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 008cba2ca84..43bd42d4f0a 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -788,14 +788,14 @@ func (i *Ingester) updateActiveSeries(now time.Time) { allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := userDB.activeSeries.ActiveWithMatchers() i.metrics.activeSeriesLoading.DeleteLabelValues(userID) if allActive > 0 { - costAttribLabel := i.limits.CostAttributionLabel(userID) - if costAttribLabel != "" { + caEnabled := i.costAttributionSvc != nil && i.costAttributionSvc.EnabledForUser(userID) + if caEnabled { labelAttributions := userDB.activeSeries.ActiveByAttributionValue() for label, count := range labelAttributions { - i.metrics.activeSeriesPerUser.WithLabelValues(userID, label).Set(float64(count)) + i.costAttributionSvc.SetActiveSeries(userID, label, float64(count)) } } else { - i.metrics.activeSeriesPerUser.WithLabelValues(userID, "").Set(float64(allActive)) + i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(allActive)) } } else { i.metrics.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) @@ -1283,8 +1283,10 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats db.ingestedAPISamples.Add(int64(stats.succeededSamplesCount)) } } - for label, count := range stats.failedSamplesAttribution { - discarded.samplesPerAttribution.WithLabelValues(userID, label).Add(float64(count)) + if i.costAttributionSvc != nil && i.costAttributionSvc.EnabledForUser(userID) { + for label, count := range stats.failedSamplesAttribution { + i.costAttributionSvc.IncrementDiscardedSamples(userID, label, float64(count)) + } } } @@ -3429,10 +3431,6 @@ func (i *Ingester) RemoveGroupMetricsForUser(userID, group string) { i.metrics.deletePerGroupMetricsForUser(userID, group) } -func (i *Ingester) RemoveAttributionMetricsForUser(userID, attribution string) { - i.metrics.deletePerAttributionMetricsForUser(userID, attribution) -} - // TransferOut implements ring.FlushTransferer. func (i *Ingester) TransferOut(_ context.Context) error { return ring.ErrTransferDisabled diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 0b566b03c0b..fcf79dd4bc7 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -193,7 +193,7 @@ func TestIngester_Start(t *testing.T) { return testutil.GatherAndCompare(reg, strings.NewReader(fmt.Sprintf(` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="%s"} 1 + cortex_ingester_active_series{user="%s"} 1 # HELP cortex_ingester_owned_series Number of currently owned series per user. # TYPE cortex_ingester_owned_series gauge diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 1581ac9d05b..bd307443840 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -455,7 +455,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 0.009 @@ -526,7 +526,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_active_native_histogram_series Number of currently active native histogram series per user. # TYPE cortex_ingester_active_native_histogram_series gauge cortex_ingester_active_native_histogram_series{user="test"} 1 @@ -606,7 +606,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -716,7 +716,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -818,7 +818,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -944,7 +944,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1071,7 +1071,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. 
# TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1179,7 +1179,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1300,7 +1300,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1416,7 +1416,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1544,7 +1544,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -1712,7 +1712,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -2060,7 +2060,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -2204,7 +2204,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_active_native_histogram_series{user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_exemplar_exemplars_appended_total Total number of TSDB exemplars appended. # TYPE cortex_ingester_tsdb_exemplar_exemplars_appended_total counter @@ -2335,7 +2335,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-order",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 @@ -2393,7 +2393,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-bounds",user="test"} 2 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2452,7 +2452,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-bounds",user="test"} 3 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2509,7 +2509,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2571,7 +2571,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-out-of-bounds",user="test"} 2 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2633,7 +2633,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-too-far-in-future",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds ` + fmt.Sprintf("%g", float64(now.UnixMilli())/1000) + ` @@ -2688,7 +2688,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="sample-too-far-in-future",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_active_native_histogram_buckets Number of currently active native histogram buckets per user. # TYPE cortex_ingester_active_native_histogram_buckets gauge cortex_ingester_active_native_histogram_buckets{user="test"} 8 @@ -2764,7 +2764,7 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_memory_series_removed_total{user="test"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested. # TYPE cortex_ingester_ingested_exemplars_total counter cortex_ingester_ingested_exemplars_total 1 @@ -2824,7 +2824,7 @@ func TestIngester_Push(t *testing.T) { cortex_discarded_samples_total{group="",reason="new-value-for-timestamp",user="test"} 1 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge cortex_ingester_tsdb_head_min_timestamp_seconds 1575043.969 @@ -2969,7 +2969,7 @@ func TestIngester_Push(t *testing.T) { expectedMetrics: ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test"} 1 + cortex_ingester_active_series{user="test"} 1 # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. # TYPE cortex_ingester_ingested_samples_total counter cortex_ingester_ingested_samples_total{user="test"} 1 @@ -3406,8 +3406,8 @@ func TestIngester_Push_ShouldCorrectlyTrackMetricsInMultiTenantScenario(t *testi cortex_ingester_memory_series_removed_total{user="test-2"} 0 # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test-1"} 2 - cortex_ingester_active_series{attrib="",user="test-2"} 2 + cortex_ingester_active_series{user="test-1"} 2 + cortex_ingester_active_series{user="test-2"} 2 # HELP cortex_ingester_active_native_histogram_series Number of currently active native histogram series per user. 
# TYPE cortex_ingester_active_native_histogram_series gauge cortex_ingester_active_native_histogram_series{user="test-1"} 1 @@ -7214,7 +7214,7 @@ func TestIngesterCompactAndCloseIdleTSDB(t *testing.T) { # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="1"} 1 + cortex_ingester_active_series{user="1"} 1 # HELP cortex_ingester_memory_metadata The current number of metadata in memory. # TYPE cortex_ingester_memory_metadata gauge @@ -7276,7 +7276,7 @@ func TestIngesterCompactAndCloseIdleTSDB(t *testing.T) { # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="1"} 1 + cortex_ingester_active_series{user="1"} 1 # HELP cortex_ingester_memory_metadata The current number of metadata in memory. # TYPE cortex_ingester_memory_metadata gauge @@ -8877,8 +8877,8 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -8924,8 +8924,8 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -8976,8 +8976,8 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -9023,7 +9023,7 @@ func TestIngesterActiveSeries(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -9243,8 +9243,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="other_test_user"} 4 @@ -9291,7 +9291,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="other_test_user"} 4 @@ -9328,8 +9328,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 @@ -9377,8 +9377,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 @@ -9420,7 +9420,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. 
# TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 @@ -9457,8 +9457,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="other_test_user"} 4 @@ -9503,7 +9503,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_false_flagbased",user="test_user"} 4 @@ -9558,7 +9558,7 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. # TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="team_a",user="test_user"} 4 @@ -9605,8 +9605,8 @@ func TestIngesterActiveSeriesConfigChanges(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_active_series Number of currently active series per user. # TYPE cortex_ingester_active_series gauge - cortex_ingester_active_series{attrib="",user="other_test_user"} 8 - cortex_ingester_active_series{attrib="",user="test_user"} 8 + cortex_ingester_active_series{user="other_test_user"} 8 + cortex_ingester_active_series{user="test_user"} 8 # HELP cortex_ingester_active_series_custom_tracker Number of currently active series matching a pre-configured label matchers per user. 
# TYPE cortex_ingester_active_series_custom_tracker gauge cortex_ingester_active_series_custom_tracker{name="bool_is_true_flagbased",user="other_test_user"} 4 diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 54adc381c70..e960cf8e206 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -303,7 +303,7 @@ func newIngesterMetrics( activeSeriesPerUser: promauto.With(activeSeriesReg).NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_ingester_active_series", Help: "Number of currently active series per user.", - }, []string{"user", "attrib"}), + }, []string{"user"}), // Not registered automatically, but only if activeSeriesEnabled is true. activeSeriesCustomTrackersPerUser: promauto.With(activeSeriesReg).NewGaugeVec(prometheus.GaugeOpts{ @@ -407,11 +407,6 @@ func (m *ingesterMetrics) deletePerGroupMetricsForUser(userID, group string) { m.discarded.DeleteLabelValues(userID, group) } -func (m *ingesterMetrics) deletePerAttributionMetricsForUser(userID, attribution string) { - m.activeSeriesPerUser.DeleteLabelValues(userID, attribution) - m.discarded.samplesPerAttribution.DeleteLabelValues(userID, attribution) -} - func (m *ingesterMetrics) deletePerUserCustomTrackerMetrics(userID string, customTrackerMetrics []string) { m.activeSeriesLoading.DeleteLabelValues(userID) m.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) @@ -434,7 +429,6 @@ type discardedMetrics struct { perUserSeriesLimit *prometheus.CounterVec perMetricSeriesLimit *prometheus.CounterVec invalidNativeHistogram *prometheus.CounterVec - samplesPerAttribution *prometheus.CounterVec } func newDiscardedMetrics(r prometheus.Registerer) *discardedMetrics { @@ -447,10 +441,6 @@ func newDiscardedMetrics(r prometheus.Registerer) *discardedMetrics { perUserSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerUserSeriesLimit), perMetricSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerMetricSeriesLimit), invalidNativeHistogram: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogram), - samplesPerAttribution: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_discarded_samples_attribution_total", - Help: "The total number of samples that were discarded per attribution.", - }, []string{"user", "attrib"}), } } @@ -463,7 +453,6 @@ func (m *discardedMetrics) DeletePartialMatch(filter prometheus.Labels) { m.perUserSeriesLimit.DeletePartialMatch(filter) m.perMetricSeriesLimit.DeletePartialMatch(filter) m.invalidNativeHistogram.DeletePartialMatch(filter) - m.samplesPerAttribution.DeletePartialMatch(filter) } func (m *discardedMetrics) DeleteLabelValues(userID string, group string) { diff --git a/pkg/util/costattribution/cost_attribution.go b/pkg/util/costattribution/cost_attribution.go index d972fdc7af2..ab47346a35d 100644 --- a/pkg/util/costattribution/cost_attribution.go +++ b/pkg/util/costattribution/cost_attribution.go @@ -260,6 +260,10 @@ func (s *CostAttributionCleanupService) GetUserAttributionLabel(userID string) s return "" } +func (s *CostAttributionCleanupService) EnabledForUser(userID string) bool { + return s.costAttribution.limits.CostAttributionLabel(userID) != "" +} + func (s *CostAttributionCleanupService) GetUserAttributionLimit(userID string) int { return s.costAttribution.limits.MaxCostAttributionPerUser(userID) } From 745d2c11ad5d1df8f23c2ad414ecafa85903d4a0 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Sat, 5 Oct 2024 21:25:59 +0200 Subject: [PATCH 07/13] Change to use unchecked expoter --- cmd/mimir/config-descriptor.json | 
80 +++-- cmd/mimir/help-all.txt.tmpl | 6 +- pkg/costattribution/caimpl/managerImpl.go | 143 +++++++++ .../caimpl/managerImpl_test.go | 194 ++++++++++++ pkg/costattribution/caimpl/tracker.go | 64 ++++ pkg/costattribution/caimpl/tracker_group.go | 200 ++++++++++++ .../caimpl/tracker_group_test.go | 97 ++++++ pkg/costattribution/caimpl/tracker_test.go | 55 ++++ pkg/costattribution/manager.go | 24 ++ pkg/distributor/distributor.go | 23 +- .../distributor_ingest_storage_test.go | 2 +- pkg/distributor/distributor_test.go | 1 + pkg/ingester/activeseries/active_series.go | 24 +- pkg/ingester/ingester.go | 40 +-- pkg/mimir/mimir.go | 14 +- pkg/mimir/modules.go | 16 +- pkg/util/costattribution/cost_attribution.go | 294 ------------------ pkg/util/validation/limits.go | 8 +- 18 files changed, 894 insertions(+), 391 deletions(-) create mode 100644 pkg/costattribution/caimpl/managerImpl.go create mode 100644 pkg/costattribution/caimpl/managerImpl_test.go create mode 100644 pkg/costattribution/caimpl/tracker.go create mode 100644 pkg/costattribution/caimpl/tracker_group.go create mode 100644 pkg/costattribution/caimpl/tracker_group_test.go create mode 100644 pkg/costattribution/caimpl/tracker_test.go create mode 100644 pkg/costattribution/manager.go delete mode 100644 pkg/util/costattribution/cost_attribution.go diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 79bd4bb05af..ae8d6f78b1a 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -57,17 +57,6 @@ "fieldType": "int", "fieldCategory": "experimental" }, - { - "kind": "field", - "name": "cost_attribution_eviction_interval", - "required": false, - "desc": "Interval at which to evict inactive cost attributions.", - "fieldValue": null, - "fieldDefaultValue": 600000000000, - "fieldFlag": "cost-attribution-eviction-interval", - "fieldType": "duration", - "fieldCategory": "experimental" - }, { "kind": "field", "name": "enable_go_runtime_metrics", @@ -3995,28 +3984,6 @@ "fieldType": "string", "fieldCategory": "experimental" }, - { - "kind": "field", - "name": "cost_attribution_label", - "required": false, - "desc": "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'attrib' cost attribution's label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total.", - "fieldValue": null, - "fieldDefaultValue": "", - "fieldFlag": "validation.cost-attribution-label", - "fieldType": "string", - "fieldCategory": "experimental" - }, - { - "kind": "field", - "name": "max_cost_attribution_per_user", - "required": false, - "desc": "The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution.", - "fieldValue": null, - "fieldDefaultValue": 0, - "fieldFlag": "validation.max-cost-attribution-per-user", - "fieldType": "int", - "fieldCategory": "experimental" - }, { "kind": "field", "name": "max_fetched_chunks_per_query", @@ -4325,6 +4292,28 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_label", + "required": false, + "desc": "Label used to define the cost attribution label. 
For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'attrib' cost attribution's label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "validation.cost-attribution-label", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_per_user", + "required": false, + "desc": "The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.max-cost-attribution-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "ruler_evaluation_delay_duration", @@ -18165,9 +18154,10 @@ "kind": "field", "name": "custom_registry_path", "required": false, - "desc": "", + "desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.", "fieldValue": null, "fieldDefaultValue": "", + "fieldFlag": "custom-registry-path", "fieldType": "string", "fieldCategory": "advanced" }, @@ -18181,6 +18171,28 @@ "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled", "fieldType": "boolean", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_eviction_interval", + "required": false, + "desc": "Interval at which to evict inactive cost attributions.", + "fieldValue": null, + "fieldDefaultValue": 1800000000000, + "fieldFlag": "cost-attribution-eviction-interval", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cool_down_duration", + "required": false, + "desc": "Duration for which to keep the cost attribution active after the last update.", + "fieldValue": null, + "fieldDefaultValue": 1200000000000, + "fieldFlag": "cost-attribution-cool-down-duration", + "fieldType": "duration", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 122fa61a989..7b7e466c26d 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1133,8 +1133,12 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. + -cost-attribution-cool-down-duration duration + [experimental] Duration for which to keep the cost attribution active after the last update. (default 20m0s) -cost-attribution-eviction-interval duration - [experimental] Interval at which to evict inactive cost attributions. (default 10m0s) + [experimental] Interval at which to evict inactive cost attributions. (default 30m0s) + -custom-registry-path string + Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. 
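For illustration only (not part of the generated help text), these experimental settings might be combined on the command line as, for example:

    ./cmd/mimir/mimir -validation.cost-attribution-label=team -validation.max-cost-attribution-per-user=100 -cost-attribution-eviction-interval=30m -cost-attribution-cool-down-duration=20m -custom-registry-path=/cost-attribution-metrics

Here "team", the limit of 100, and the registry path are example values; in practice the two validation.* limits are per-tenant settings and are typically set through runtime overrides, as the manager tests in this series do with mocked tenant limits.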
-debug.mutex-profile-fraction int diff --git a/pkg/costattribution/caimpl/managerImpl.go b/pkg/costattribution/caimpl/managerImpl.go new file mode 100644 index 00000000000..a7e905544e9 --- /dev/null +++ b/pkg/costattribution/caimpl/managerImpl.go @@ -0,0 +1,143 @@ +package caimpl + +import ( + "context" + "fmt" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/grafana/dskit/services" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + + "github.com/grafana/mimir/pkg/util/validation" +) + +type ManagerImpl struct { + services.Service + logger log.Logger + attributionTracker *AttributionTrackerGroup + inactiveTimeout time.Duration + invalidValue string +} + +// NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series. +// It will clean up inactive series and update the cost attribution of series every 3 minutes. +func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *ManagerImpl { + s := &ManagerImpl{ + attributionTracker: newAttributionTrackerGroup(limits, cooldownTimeout), + inactiveTimeout: inactiveTimeout, + logger: logger, + invalidValue: "__unaccounted__", + } + + s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution manager") + return s +} + +func (m *ManagerImpl) iteration(_ context.Context) error { + m.attributionTracker.purgeInactiveAttributions(m.inactiveTimeout) + return nil +} + +// EnabledForUser returns true if the cost attribution is enabled for the user +func (m *ManagerImpl) EnabledForUser(userID string) bool { + return m.attributionTracker.limits.CostAttributionLabel(userID) != "" +} + +// GetUserAttributionLabel returns the cost attribution label for the user, first it will try to get the label from the cache, +// If not found, it will get the label from the config +// If the user is not enabled for cost attribution, it would clean the cache and return empty string +func (m *ManagerImpl) GetUserAttributionLabel(userID string) string { + if m.EnabledForUser(userID) { + return m.attributionTracker.getUserAttributionLabelFromCache(userID) + } + m.attributionTracker.deleteUserTracerFromCache(userID) + return "" +} + +// GetUserAttributionLimit returns the cost attribution limit for the user, first it will try to get the limit from the cache, +// If not found, it will get the limit from the config +// If the user is not enabled for cost attribution, it would clean the cache and return 0 +func (m *ManagerImpl) GetUserAttributionLimit(userID string) int { + if m.EnabledForUser(userID) { + return m.attributionTracker.getUserAttributionLimitFromCache(userID) + } + m.attributionTracker.deleteUserTracerFromCache(userID) + return 0 +} + +func (m *ManagerImpl) UpdateAttributionTimestamp(user string, lbs labels.Labels, now time.Time) string { + // if cost attribution is not enabled for the user, return empty string + if !m.EnabledForUser(user) { + m.attributionTracker.deleteUserTracerFromCache(user) + return "" + } + + // when cost attribution is enabled, the label has to be set. 
the cache would be updated with the label
+	lb := m.attributionTracker.getUserAttributionLabelFromCache(user)
+	// this should not happen: if the user is enabled for cost attribution, the label has to be set
+	if lb == "" {
+		return ""
+	}
+	val := lbs.Get(lb)
+
+	if m.attributionTracker.attributionLimitExceeded(user, val, now) {
+		val = m.invalidValue
+		level.Error(m.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", m.invalidValue))
+	}
+	m.attributionTracker.updateAttributionCacheForUser(user, lb, val, now)
+	return val
+}
+
+// SetActiveSeries adjusts the input attribution and sets the active series gauge for the given user and attribution
+func (m *ManagerImpl) SetActiveSeries(userID, attribution string, value float64) {
+	attribution = m.adjustUserAttribution(userID, attribution)
+
+	m.attributionTracker.mu.Lock()
+	defer m.attributionTracker.mu.Unlock()
+	if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists {
+		tracker.activeSeriesPerUserAttribution.WithLabelValues(userID, attribution).Set(value)
+	}
+}
+
+// IncrementDiscardedSamples increments the discarded samples counter for a given user and attribution
+func (m *ManagerImpl) IncrementDiscardedSamples(userID, attribution string, value float64) {
+	attribution = m.adjustUserAttribution(userID, attribution)
+	m.attributionTracker.mu.RLock()
+	defer m.attributionTracker.mu.RUnlock()
+	if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists {
+		tracker.discardedSampleAttribution.WithLabelValues(userID, attribution).Add(value)
+	}
+}
+
+// IncrementReceivedSamples increments the received samples counter for a given user and attribution
+func (m *ManagerImpl) IncrementReceivedSamples(userID, attribution string, value float64) {
+	attribution = m.adjustUserAttribution(userID, attribution)
+	m.attributionTracker.mu.RLock()
+	defer m.attributionTracker.mu.RUnlock()
+	if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists {
+		tracker.receivedSamplesAttribution.WithLabelValues(userID, attribution).Add(value)
+	}
+}
+
+func (m *ManagerImpl) adjustUserAttribution(userID, attribution string) string {
+	if m.attributionTracker.attributionLimitExceeded(userID, attribution, time.Now()) {
+		return m.invalidValue
+	}
+	return attribution
+}
+
+func (m *ManagerImpl) Collect(out chan<- prometheus.Metric) {
+	m.attributionTracker.mu.RLock()
+	defer m.attributionTracker.mu.RUnlock()
+	for _, tracker := range m.attributionTracker.trackersByUserID {
+		tracker.Collect(out)
+	}
+}
+
+// Describe implements prometheus.Collector. 
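+// The method body below is intentionally empty: by sending no descriptors, the manager becomes an
+// "unchecked" collector and the registry skips its up-front descriptor consistency checks. That is
+// needed here because each tenant's tracker uses the tenant's configured cost attribution label as
+// a metric label name, so the full set of label names cannot be declared ahead of time.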
+func (m *ManagerImpl) Describe(chan<- *prometheus.Desc) { + // this is an unchecked collector +} diff --git a/pkg/costattribution/caimpl/managerImpl_test.go b/pkg/costattribution/caimpl/managerImpl_test.go new file mode 100644 index 00000000000..d22f50cde20 --- /dev/null +++ b/pkg/costattribution/caimpl/managerImpl_test.go @@ -0,0 +1,194 @@ +package caimpl + +import ( + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/grafana/mimir/pkg/util/validation" +) + +func newTestManager() *ManagerImpl { + logger := log.NewNopLogger() + limits, _ := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{ + "user1": { + MaxCostAttributionPerUser: 5, + CostAttributionLabel: "team", + }, + "user2": { + MaxCostAttributionPerUser: 2, + CostAttributionLabel: "", + }, + "user3": { + MaxCostAttributionPerUser: 2, + CostAttributionLabel: "department", + }, + })) + inactiveTimeout := 2 * time.Minute + cooldownTimeout := 1 * time.Minute + cleanupInterval := 1 * time.Minute + return NewManager(cleanupInterval, inactiveTimeout, cooldownTimeout, logger, limits) +} + +func Test_NewManager(t *testing.T) { + manager := newTestManager() + assert.NotNil(t, manager, "Expected manager to be initialized") + assert.NotNil(t, manager.attributionTracker, "Expected attribution tracker to be initialized") + assert.Equal(t, "__unaccounted__", manager.invalidValue, "Expected invalidValue to be initialized") +} + +func Test_EnabledForUser(t *testing.T) { + manager := newTestManager() + assert.True(t, manager.EnabledForUser("user1"), "Expected cost attribution to be enabled for user1") + assert.False(t, manager.EnabledForUser("user2"), "Expected cost attribution to be disabled for user2") + assert.False(t, manager.EnabledForUser("user4"), "Expected cost attribution to be disabled for user4") +} + +func Test_GetUserAttributionLabel(t *testing.T) { + manager := newTestManager() + assert.Equal(t, "team", manager.GetUserAttributionLabel("user1")) + assert.Equal(t, "", manager.GetUserAttributionLabel("user2")) + assert.Equal(t, "department", manager.GetUserAttributionLabel("user3")) + assert.Equal(t, 2, len(manager.attributionTracker.trackersByUserID)) + assert.Equal(t, "team", manager.attributionTracker.trackersByUserID["user1"].trackedLabel) + assert.Equal(t, "department", manager.attributionTracker.trackersByUserID["user3"].trackedLabel) +} + +func Test_GetUserAttributionLimit(t *testing.T) { + manager := newTestManager() + assert.Equal(t, 5, manager.GetUserAttributionLimit("user1")) + assert.Equal(t, 0, manager.GetUserAttributionLimit("user2")) + assert.Equal(t, 0, manager.GetUserAttributionLimit("user4")) +} + +func Test_UpdateAttributionTimestamp(t *testing.T) { + manager := newTestManager() + + lbls := labels.NewBuilder(labels.EmptyLabels()) + tm1, tm2, tm3 := "bar", "foo", "baz" + t.Run("Should update the timestamp when limit not reached for the user attribution", func(t *testing.T) { + lbls.Set("department", tm1) + result := manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(0, 0)) + assert.Equal(t, tm1, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") + assert.NotNil(t, 
manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm1]) + assert.Equal(t, int64(0), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm1].Load()) + + lbls.Set("department", tm2) + result = manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(1, 0)) + assert.Equal(t, tm2, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") + assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm2]) + assert.Equal(t, int64(1), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm2].Load()) + }) + + t.Run("Should only update the timestamp of invalide when limit reached for the user attribution", func(t *testing.T) { + lbls.Set("department", tm3) + result := manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(2, 0)) + assert.Equal(t, manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") + assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[manager.invalidValue]) + assert.Equal(t, int64(2), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[manager.invalidValue].Load()) + + lbls.Set("department", tm1) + result = manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(3, 0)) + assert.Equal(t, manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") + assert.Equal(t, int64(3), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[manager.invalidValue].Load()) + }) +} + +func Test_SetActiveSeries(t *testing.T) { + manager := newTestManager() + reg := prometheus.NewRegistry() + err := reg.Register(manager) + require.NoError(t, err) + userID := "user1" + + lbls := labels.NewBuilder(labels.EmptyLabels()) + + t.Run("Should set the active series gauge for the given user and attribution", func(t *testing.T) { + lbls.Set("team", "foo") + val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + manager.SetActiveSeries(userID, val, 1.0) + expectedMetrics := ` + # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. + # TYPE cortex_ingester_active_series_attribution gauge + cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 + ` + metricNames := []string{ + "cortex_ingester_active_series_attribution", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Should set the active series gauge for all users and attributions enabled and ignore disabled user", func(t *testing.T) { + userID = "user3" + lbls.Set("department", "bar") + val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + manager.SetActiveSeries(userID, val, 2.0) + + lbls.Set("department", "baz") + val = manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + manager.SetActiveSeries(userID, val, 3.0) + + expectedMetrics := ` + # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_active_series_attribution gauge + cortex_ingester_active_series_attribution{department="bar",user="user3"} 2 + cortex_ingester_active_series_attribution{department="baz",user="user3"} 3 + cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 + ` + metricNames := []string{ + "cortex_ingester_active_series_attribution", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Cleanup the active series gauge for the given user and attribution when cost attribution disabled", func(t *testing.T) { + limits := manager.attributionTracker.limits + defer func() { manager.attributionTracker.limits = limits }() + userID = "user3" + lbls.Set("department", "baz") + + overrides, _ := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{ + userID: { + MaxCostAttributionPerUser: 2, + CostAttributionLabel: "", + }, + })) + manager.attributionTracker.limits = overrides + val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(5, 0)) + manager.SetActiveSeries(userID, val, 3.0) + + expectedMetrics := ` + # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. + # TYPE cortex_ingester_active_series_attribution gauge + cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 + ` + metricNames := []string{ + "cortex_ingester_active_series_attribution", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Should ignore setting the active series gauge for disabled user", func(t *testing.T) { + userID = "user2" + lbls.Set("department", "bar") + val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + manager.SetActiveSeries(userID, val, 4.0) + + expectedMetrics := ` + # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_active_series_attribution gauge + cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 + ` + metricNames := []string{ + "cortex_ingester_active_series_attribution", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) +} diff --git a/pkg/costattribution/caimpl/tracker.go b/pkg/costattribution/caimpl/tracker.go new file mode 100644 index 00000000000..255b6413907 --- /dev/null +++ b/pkg/costattribution/caimpl/tracker.go @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package caimpl + +import ( + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/atomic" +) + +type Tracker struct { + trackedLabel string + attributionLimit int + activeSeriesPerUserAttribution *prometheus.GaugeVec + receivedSamplesAttribution *prometheus.CounterVec + discardedSampleAttribution *prometheus.CounterVec + attributionTimestamps map[string]*atomic.Int64 + coolDownDeadline *atomic.Int64 +} + +func (t *Tracker) cleanupTrackerAttribution(userID, attribution string) { + t.activeSeriesPerUserAttribution.DeleteLabelValues(userID, attribution) + t.receivedSamplesAttribution.DeleteLabelValues(userID, attribution) + t.discardedSampleAttribution.DeleteLabelValues(userID, attribution) +} + +func (t *Tracker) cleanupTracker(userID string) { + filter := prometheus.Labels{"user": userID} + t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) + t.receivedSamplesAttribution.DeletePartialMatch(filter) + t.discardedSampleAttribution.DeletePartialMatch(filter) +} + +func newTracker(trackedLabel string, limit int) (*Tracker, error) { + m := &Tracker{ + trackedLabel: trackedLabel, + attributionLimit: limit, + attributionTimestamps: map[string]*atomic.Int64{}, + coolDownDeadline: atomic.NewInt64(0), + discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_discarded_samples_attribution_total", + Help: "The total number of samples that were discarded per attribution.", + }, []string{"user", trackedLabel}), + receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_received_samples_attribution_total", + Help: "The total number of samples that were received per attribution.", + }, []string{"user", trackedLabel}), + activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_ingester_active_series_attribution", + Help: "The total number of active series per user and attribution.", + }, []string{"user", trackedLabel}), + } + return m, nil +} + +func (t *Tracker) Collect(out chan<- prometheus.Metric) { + t.activeSeriesPerUserAttribution.Collect(out) + t.receivedSamplesAttribution.Collect(out) + t.discardedSampleAttribution.Collect(out) +} + +// Describe implements prometheus.Collector. 
+func (t *Tracker) Describe(chan<- *prometheus.Desc) {
+ // this is an unchecked collector
+}
diff --git a/pkg/costattribution/caimpl/tracker_group.go b/pkg/costattribution/caimpl/tracker_group.go
new file mode 100644
index 00000000000..14214c7a876
--- /dev/null
+++ b/pkg/costattribution/caimpl/tracker_group.go
@@ -0,0 +1,200 @@
+package caimpl
+
+import (
+ "sync"
+ "time"
+
+ "go.uber.org/atomic"
+
+ "github.com/grafana/mimir/pkg/util/validation"
+)
+
+type AttributionTrackerGroup struct {
+ mu sync.RWMutex
+ trackersByUserID map[string]*Tracker
+ limits *validation.Overrides
+ cooldownTimeout time.Duration
+}
+
+func newAttributionTrackerGroup(limits *validation.Overrides, cooldownTimeout time.Duration) *AttributionTrackerGroup {
+ return &AttributionTrackerGroup{
+ trackersByUserID: make(map[string]*Tracker),
+ limits: limits,
+ mu: sync.RWMutex{},
+ cooldownTimeout: cooldownTimeout,
+ }
+}
+
+// getUserAttributionLabelFromCache returns the user's attribution label from the cache; if it is not cached yet, it is loaded from the limits config.
+func (atg *AttributionTrackerGroup) getUserAttributionLabelFromCache(userID string) string {
+ atg.mu.Lock()
+ defer atg.mu.Unlock()
+ // if the user is not enabled for cost attribution, we don't need to track the attribution
+ if atg.limits.CostAttributionLabel(userID) == "" {
+ return ""
+ }
+ if _, exists := atg.trackersByUserID[userID]; !exists {
+ atg.trackersByUserID[userID], _ = newTracker(atg.limits.CostAttributionLabel(userID), atg.limits.MaxCostAttributionPerUser(userID))
+ }
+ return atg.trackersByUserID[userID].trackedLabel
+}
+
+// getUserAttributionLimitFromCache returns the user's attribution limit from the cache; if it is not cached yet, it is loaded from the limits config.
+// It must only be called when the user is enabled for cost attribution.
+func (atg *AttributionTrackerGroup) getUserAttributionLimitFromCache(userID string) int {
+ atg.mu.Lock()
+ defer atg.mu.Unlock()
+ if _, exists := atg.trackersByUserID[userID]; !exists {
+ atg.trackersByUserID[userID], _ = newTracker(atg.limits.CostAttributionLabel(userID), atg.limits.MaxCostAttributionPerUser(userID))
+ }
+ return atg.trackersByUserID[userID].attributionLimit
+}
+
+// deleteUserTracerFromCache removes a user from the cache once the user is disabled for cost attribution.
+func (atg *AttributionTrackerGroup) deleteUserTracerFromCache(userID string) {
+ atg.mu.Lock()
+ defer atg.mu.Unlock()
+ if _, exists := atg.trackersByUserID[userID]; !exists {
+ return
+ }
+ // clean up tracker metrics and delete the tracker
+ atg.trackersByUserID[userID].cleanupTracker(userID)
+ delete(atg.trackersByUserID, userID)
+}
+
+// updateAttributionCacheForUser is guaranteed to update the label and limit for the user in the cache:
+// if the label has changed, we create a new tracker and don't update the timestamp;
+// if the label has not changed, we update the attribution timestamp;
+// if the limit is set to 0 or the label is empty, we skip the update.
+func (atg *AttributionTrackerGroup) updateAttributionCacheForUser(userID, label, attribution string, now time.Time) {
+ // If cost attribution is disabled or the limit is set to 0, we don't need to track the attribution; clean up the cache if it exists.
+ if atg.limits.CostAttributionLabel(userID) == "" || atg.limits.MaxCostAttributionPerUser(userID) <= 0 {
+ atg.deleteUserTracerFromCache(userID)
+ return
+ }
+ ts := now.Unix()
+
+ // if the user is not in the cache yet, create a new tracker
+ if atg.trackersByUserID[userID] == nil {
+ atg.trackersByUserID[userID], _ = newTracker(label, atg.limits.MaxCostAttributionPerUser(userID))
+ }
+
+ // if the label is not 
the one in the cache, we do nothing, since the label input is outdated
+ if label != atg.getUserAttributionLabelFromCache(userID) {
+ return
+ }
+
+ // update the attribution timestamp
+ if groupTs := atg.trackersByUserID[userID].attributionTimestamps[attribution]; groupTs != nil {
+ groupTs.Store(ts)
+ return
+ }
+
+ // if the attribution does not exist yet, add an attribution timestamp
+ atg.mu.Lock()
+ defer atg.mu.Unlock()
+ atg.trackersByUserID[userID].attributionTimestamps[attribution] = atomic.NewInt64(ts)
+}
+
+func (atg *AttributionTrackerGroup) purgeInactiveAttributionsForUser(userID string, deadline int64) []string {
+ atg.mu.RLock()
+ var inactiveAttributions []string
+ if atg.trackersByUserID[userID] == nil {
+ return nil
+ }
+ atg.mu.RUnlock()
+
+ atg.mu.Lock()
+ if atg.trackersByUserID[userID].trackedLabel != atg.limits.CostAttributionLabel(userID) {
+ // reset everything if the label has changed
+ atg.trackersByUserID[userID], _ = newTracker(atg.limits.CostAttributionLabel(userID), atg.limits.MaxCostAttributionPerUser(userID))
+ }
+ atg.mu.Unlock()
+
+ atg.mu.RLock()
+ attributionTimestamps := atg.trackersByUserID[userID].attributionTimestamps
+ if attributionTimestamps == nil {
+ return nil
+ }
+ for attr, ts := range attributionTimestamps {
+ if ts.Load() <= deadline {
+ inactiveAttributions = append(inactiveAttributions, attr)
+ }
+ }
+ atg.mu.RUnlock()
+ if len(inactiveAttributions) == 0 {
+ return nil
+ }
+
+ // Clean up inactive attributions
+ atg.mu.Lock()
+ defer atg.mu.Unlock()
+
+ for i := 0; i < len(inactiveAttributions); {
+ inactiveAttribution := inactiveAttributions[i]
+ groupTs := atg.trackersByUserID[userID].attributionTimestamps[inactiveAttribution]
+ if groupTs != nil && groupTs.Load() <= deadline {
+ delete(atg.trackersByUserID[userID].attributionTimestamps, inactiveAttribution)
+ i++
+ } else {
+ inactiveAttributions[i] = inactiveAttributions[len(inactiveAttributions)-1]
+ inactiveAttributions = inactiveAttributions[:len(inactiveAttributions)-1]
+ }
+ }
+
+ return inactiveAttributions
+}
+
+func (atg *AttributionTrackerGroup) purgeInactiveAttributions(inactiveTimeout time.Duration) {
+ atg.mu.RLock()
+ userIDs := make([]string, 0, len(atg.trackersByUserID))
+ for userID := range atg.trackersByUserID {
+ userIDs = append(userIDs, userID)
+ }
+ atg.mu.RUnlock()
+
+ currentTime := time.Now()
+ for _, userID := range userIDs {
+ if atg.limits.CostAttributionLabel(userID) == "" || atg.limits.MaxCostAttributionPerUser(userID) <= 0 {
+ atg.deleteUserTracerFromCache(userID)
+ continue
+ }
+ // purge inactive attributions
+ inactiveAttributions := atg.purgeInactiveAttributionsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano())
+ for _, attribution := range inactiveAttributions {
+ atg.trackersByUserID[userID].cleanupTrackerAttribution(userID, attribution)
+ }
+ }
+}
+
+func (atg *AttributionTrackerGroup) attributionLimitExceeded(userID, attribution string, now time.Time) bool {
+ // if we are still in the cooldown period, we consider the limit reached
+ atg.mu.RLock()
+ defer atg.mu.RUnlock()
+ // if the user does not exist, we don't need to check the limit
+ if atg.trackersByUserID[userID] == nil {
+ return false
+ }
+
+ if v := atg.trackersByUserID[userID].coolDownDeadline; v != nil && v.Load() > now.UnixNano() {
+ return true
+ }
+
+ // if the attribution already exists and we are not in the cooldown period, we don't need to check the limit
+ _, exists := atg.trackersByUserID[userID].attributionTimestamps[attribution]
+ if exists {
+ return 
false
+ }
+
+ // if the user has reached the limit, we set the cooldown period
+ maxReached := len(atg.trackersByUserID[userID].attributionTimestamps) >= atg.limits.MaxCostAttributionPerUser(userID)
+ if maxReached {
+ // if cooldownTimeout is set, we will set the cooldown period
+ if atg.cooldownTimeout != 0 {
+ atg.trackersByUserID[userID].coolDownDeadline.Store(now.Add(atg.cooldownTimeout).UnixNano())
+ }
+ return true
+ }
+
+ return maxReached
+}
diff --git a/pkg/costattribution/caimpl/tracker_group_test.go b/pkg/costattribution/caimpl/tracker_group_test.go
new file mode 100644
index 00000000000..5fbe148ae77
--- /dev/null
+++ b/pkg/costattribution/caimpl/tracker_group_test.go
@@ -0,0 +1,97 @@
+package caimpl
+
+import (
+ "fmt"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+
+ "github.com/grafana/mimir/pkg/util/validation"
+)
+
+func TestUpdateAttributionTimestampForUser(t *testing.T) {
+ cooldownTimeout := 10 * time.Second
+ t.Run("Should not update the timestamp for the user if the attribution label is not set", func(t *testing.T) {
+ // Create mock limits
+ limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "", MaxCostAttributionPerUser: 5}, nil)
+ assert.NoError(t, err)
+ trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout)
+ assert.NotNil(t, trackerGroup)
+
+ ts := time.Unix(1, 0)
+ trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "platformA", ts)
+ trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "teamB", ts)
+
+ assert.Equal(t, 0, len(trackerGroup.trackersByUserID))
+ })
+
+ t.Run("Should not update the timestamp for the user if max cost attribution per user is 0", func(t *testing.T) {
+ // Create mock limits
+ limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 0}, nil)
+ assert.NoError(t, err)
+
+ trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout)
+ assert.NotNil(t, trackerGroup)
+
+ ts := time.Unix(1, 0)
+ trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "platformA", ts)
+ trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "teamB", ts)
+
+ assert.Equal(t, 0, len(trackerGroup.trackersByUserID))
+ })
+
+ t.Run("Should update the timestamp for the user attribution", func(t *testing.T) {
+ // Create mock limits
+ limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil)
+ assert.NoError(t, err)
+
+ trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout)
+ assert.NotNil(t, trackerGroup)
+
+ ts := time.Unix(1, 0)
+ trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", ts)
+ trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "barA", ts)
+
+ assert.Equal(t, 2, len(trackerGroup.trackersByUserID))
+ fmt.Println(trackerGroup.trackersByUserID)
+ assert.NotNil(t, trackerGroup.trackersByUserID["tenantA"])
+ assert.NotNil(t, trackerGroup.trackersByUserID["tenantA"].attributionTimestamps["fooA"])
+ assert.Equal(t, int64(1), trackerGroup.trackersByUserID["tenantA"].attributionTimestamps["fooA"].Load())
+
+ trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "barA", ts.Add(time.Second))
+ assert.Equal(t, int64(2), trackerGroup.trackersByUserID["tenantB"].attributionTimestamps["barA"].Load())
+ })
+}
+
+func TestGetUserAttributionLabel(t *testing.T) {
+ cooldownTimeout := 10 * time.Second
+ t.Run("Should return the cost attribution 
label for the user", func(t *testing.T) { + // Create mock limits + limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) + assert.NoError(t, err) + + trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) + assert.NotNil(t, trackerGroup) + trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", time.Unix(0, 0)) + + assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) + }) + + t.Run("Should return the default cost attribution label for the user if it is in cache", func(t *testing.T) { + // Create mock limits + limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) + assert.NoError(t, err) + + trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) + assert.NotNil(t, trackerGroup) + + assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) + + // update the timestamp for the user, so cache is updated + trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", time.Unix(0, 0)) + + // still read the cost attribution label from cache until cache is updated by timed service + assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) + }) +} diff --git a/pkg/costattribution/caimpl/tracker_test.go b/pkg/costattribution/caimpl/tracker_test.go new file mode 100644 index 00000000000..7427e6a1540 --- /dev/null +++ b/pkg/costattribution/caimpl/tracker_test.go @@ -0,0 +1,55 @@ +package caimpl + +import ( + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func Test_NewTracker(t *testing.T) { + reg := prometheus.NewRegistry() + + // Initialize a new Tracker + trackedLabel := "platform" + tracker, err := newTracker(trackedLabel, 5) + require.NoError(t, err) + err = reg.Register(tracker) + require.NoError(t, err) + + // Simulate some values in the metrics + userID := "user1" + attribution := "foo" + tracker.activeSeriesPerUserAttribution.WithLabelValues(userID, attribution).Set(1.0) + tracker.receivedSamplesAttribution.WithLabelValues(userID, attribution).Add(5) + tracker.discardedSampleAttribution.WithLabelValues(userID, attribution).Add(2) + + expectedMetrics := ` + # HELP cortex_discarded_samples_attribution_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_samples_attribution_total counter + cortex_discarded_samples_attribution_total{platform="foo",user="user1"} 2 + # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. + # TYPE cortex_ingester_active_series_attribution gauge + cortex_ingester_active_series_attribution{platform="foo",user="user1"} 1 + # HELP cortex_received_samples_attribution_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_samples_attribution_total counter + cortex_received_samples_attribution_total{platform="foo",user="user1"} 5 + ` + + metricNames := []string{ + "cortex_discarded_samples_attribution_total", + "cortex_received_samples_attribution_total", + "cortex_ingester_active_series_attribution", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + + // Clean the tracker for the user attribution + tracker.cleanupTrackerAttribution(userID, attribution) + + // Verify that metrics have been cleaned + expectedMetrics = `` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) +} diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go new file mode 100644 index 00000000000..60c2b92517c --- /dev/null +++ b/pkg/costattribution/manager.go @@ -0,0 +1,24 @@ +package costattribution + +import ( + "time" + + "github.com/grafana/dskit/services" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" +) + +type Manager interface { + services.Service + + EnabledForUser(userID string) bool + GetUserAttributionLabel(userID string) string + GetUserAttributionLimit(userID string) int + UpdateAttributionTimestamp(user string, lbs labels.Labels, now time.Time) string + SetActiveSeries(userID, attribution string, value float64) + IncrementDiscardedSamples(userID, attribution string, value float64) + IncrementReceivedSamples(userID, attribution string, value float64) + + Collect(out chan<- prometheus.Metric) + Describe(chan<- *prometheus.Desc) +} diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index d127bfe00d1..7fdeb4a3622 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -48,12 +48,12 @@ import ( "golang.org/x/sync/errgroup" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" "github.com/grafana/mimir/pkg/storage/ingest" "github.com/grafana/mimir/pkg/util" - "github.com/grafana/mimir/pkg/util/costattribution" "github.com/grafana/mimir/pkg/util/globalerror" mimir_limiter "github.com/grafana/mimir/pkg/util/limiter" util_math "github.com/grafana/mimir/pkg/util/math" @@ -106,7 +106,7 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - costAttributionSvc *costattribution.CostAttributionCleanupService + costAttributionMng costattribution.Manager // For handling HA replicas. 
HATracker *haTracker @@ -307,7 +307,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionClenaupService *costattribution.CostAttributionCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -342,7 +342,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove healthyInstancesCount: atomic.NewUint32(0), limits: limits, HATracker: haTracker, - costAttributionSvc: costAttributionClenaupService, + costAttributionMng: costAttributionMng, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -1669,29 +1669,28 @@ func tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string, now time.Time) { var receivedSamples, receivedExemplars, receivedMetadata int - costAttributionSize := 0 - caEnabled := d.costAttributionSvc != nil && d.costAttributionSvc.EnabledForUser(userID) + costattributionLimit := 0 + caEnabled := d.costAttributionMng != nil && d.costAttributionMng.EnabledForUser(userID) if caEnabled { - costAttributionSize = d.costAttributionSvc.GetUserAttributionLimit(userID) + costattributionLimit = d.costAttributionMng.GetUserAttributionLimit(userID) } - costAttribution := make(map[string]int, costAttributionSize) + costAttribution := make(map[string]int, costattributionLimit) for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) if caEnabled { - attribution := d.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), now) + attribution := d.costAttributionMng.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), now) costAttribution[attribution]++ } } receivedMetadata = len(req.Metadata) if caEnabled { for lv, count := range costAttribution { - d.costAttributionSvc.IncrementReceivedSamples(userID, lv, float64(count)) + d.costAttributionMng.IncrementReceivedSamples(userID, lv, float64(count)) } - } else { - d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) } + d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) d.receivedExemplars.WithLabelValues(userID).Add(float64(receivedExemplars)) d.receivedMetadata.WithLabelValues(userID).Add(float64(receivedMetadata)) } diff --git a/pkg/distributor/distributor_ingest_storage_test.go b/pkg/distributor/distributor_ingest_storage_test.go index 77b22cc9967..e1c7e2be21f 100644 --- 
a/pkg/distributor/distributor_ingest_storage_test.go +++ b/pkg/distributor/distributor_ingest_storage_test.go @@ -209,7 +209,7 @@ func TestDistributor_Push_ShouldSupportIngestStorage(t *testing.T) { # HELP cortex_distributor_received_samples_total The total number of received samples, excluding rejected and deduped samples. # TYPE cortex_distributor_received_samples_total counter - cortex_distributor_received_samples_total{ user="user"} 5 + cortex_distributor_received_samples_total{user="user"} 5 # HELP cortex_distributor_metadata_in_total The total number of metadata the have come in to the distributor, including rejected. # TYPE cortex_distributor_metadata_in_total counter diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index a7159d91154..e826af69a67 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -6953,6 +6953,7 @@ func TestDistributor_MetricsWithRequestModifications(t *testing.T) { exemplarsIn: 10, metadataIn: 10, receivedRequests: 1, + receivedSamples: 0, receivedExemplars: 0, receivedMetadata: 10}) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 7cc2d39554b..56abe4f4527 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -11,13 +11,13 @@ import ( "sync" "time" - "github.com/grafana/mimir/pkg/util/costattribution" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -50,7 +50,7 @@ type ActiveSeries struct { matchers *asmodel.Matchers lastMatchersUpdate time.Time - costAttributionSvc *costattribution.CostAttributionCleanupService + costAttributionMng costattribution.Manager // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -68,7 +68,7 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - costAttributionSvc *costattribution.CostAttributionCleanupService + costAttributionMng costattribution.Manager mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -97,16 +97,16 @@ func NewActiveSeries( asm *asmodel.Matchers, timeout time.Duration, userID string, - costAttributionSvc *costattribution.CostAttributionCleanupService, + costAttributionMng costattribution.Manager, ) *ActiveSeries { c := &ActiveSeries{ matchers: asm, timeout: timeout, userID: userID, - costAttributionSvc: costAttributionSvc, + costAttributionMng: costAttributionMng, } // Stripes are pre-allocated so that we only read on them and no lock is required. 
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionSvc) + c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionMng) } return c @@ -123,7 +123,7 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { defer c.matchersMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionSvc) + c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionMng) } c.matchers = asm c.lastMatchersUpdate = now @@ -230,7 +230,7 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot } func (c *ActiveSeries) ActiveByAttributionValue() map[string]uint32 { - total := make(map[string]uint32, c.costAttributionSvc.GetUserAttributionLimit(c.userID)) + total := make(map[string]uint32, c.costAttributionMng.GetUserAttributionLimit(c.userID)) for s := 0; s < numStripes; s++ { c.stripes[s].mu.RLock() for k, v := range c.stripes[s].costAttributionValues { @@ -426,8 +426,8 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef // here if we have a cost attribution label, we can split the serie count based on the value of the label // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly - if s.costAttributionSvc != nil && s.costAttributionSvc.GetUserAttributionLabel(s.userID) != "" { - attributionValue := s.costAttributionSvc.UpdateAttributionTimestamp(s.userID, series, time.Unix(0, nowNanos)) + if s.costAttributionMng != nil && s.costAttributionMng.GetUserAttributionLabel(s.userID) != "" { + attributionValue := s.costAttributionMng.UpdateAttributionTimestamp(s.userID, series, time.Unix(0, nowNanos)) s.costAttributionValues[attributionValue]++ e.attributionValue = attributionValue } @@ -459,7 +459,7 @@ func (s *seriesStripe) reinitialize( asm *asmodel.Matchers, deleted *deletedSeries, userID string, - costAttributionSvc *costattribution.CostAttributionCleanupService, + costAttributionMng costattribution.Manager, ) { s.mu.Lock() defer s.mu.Unlock() @@ -475,7 +475,7 @@ func (s *seriesStripe) reinitialize( s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) - s.costAttributionSvc = costAttributionSvc + s.costAttributionMng = costAttributionMng } func (s *seriesStripe) purge(keepUntil time.Time) { diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 43bd42d4f0a..a93753fac7f 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -51,6 +51,7 @@ import ( "golang.org/x/exp/slices" "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester/activeseries" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" @@ -63,7 +64,6 @@ import ( "github.com/grafana/mimir/pkg/storage/tsdb/block" "github.com/grafana/mimir/pkg/usagestats" "github.com/grafana/mimir/pkg/util" - "github.com/grafana/mimir/pkg/util/costattribution" "github.com/grafana/mimir/pkg/util/globalerror" "github.com/grafana/mimir/pkg/util/limiter" util_log "github.com/grafana/mimir/pkg/util/log" @@ -311,7 +311,7 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService - 
costAttributionSvc *costattribution.CostAttributionCleanupService + costAttributionMng costattribution.Manager tsdbMetrics *tsdbMetrics @@ -380,7 +380,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. -func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionCleanupService *costattribution.CostAttributionCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -388,7 +388,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.ingestionRate = util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval) i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService - i.costAttributionSvc = costAttributionCleanupService + i.costAttributionMng = costAttributionMng // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -788,15 +788,13 @@ func (i *Ingester) updateActiveSeries(now time.Time) { allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := userDB.activeSeries.ActiveWithMatchers() i.metrics.activeSeriesLoading.DeleteLabelValues(userID) if allActive > 0 { - caEnabled := i.costAttributionSvc != nil && i.costAttributionSvc.EnabledForUser(userID) - if caEnabled { + if i.isCostAttributionEnabledForUser(userID) { labelAttributions := userDB.activeSeries.ActiveByAttributionValue() for label, count := range labelAttributions { - i.costAttributionSvc.SetActiveSeries(userID, label, float64(count)) + i.costAttributionMng.SetActiveSeries(userID, label, float64(count)) } - } else { - i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(allActive)) } + i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(allActive)) } else { i.metrics.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) } @@ -1283,13 +1281,17 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats db.ingestedAPISamples.Add(int64(stats.succeededSamplesCount)) } } - if i.costAttributionSvc != nil && i.costAttributionSvc.EnabledForUser(userID) { + if i.isCostAttributionEnabledForUser(userID) { for label, count := range stats.failedSamplesAttribution { - i.costAttributionSvc.IncrementDiscardedSamples(userID, label, float64(count)) + i.costAttributionMng.IncrementDiscardedSamples(userID, label, float64(count)) } } } +func (i *Ingester) isCostAttributionEnabledForUser(userID string) bool { + return i.costAttributionMng != nil && i.costAttributionMng.EnabledForUser(userID) +} + // pushSamplesToAppender appends samples and exemplars to the appender. 
Most errors are handled via updateFirstPartial function, // but in case of unhandled errors, appender is rolled back and such error is returned. Errors handled by updateFirstPartial // must be of type softError. @@ -1298,12 +1300,12 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { // Return true if handled as soft error, and we can ingest more series. // get the cost attribution value for the series - costLabel := i.limits.CostAttributionLabel(userID) + caEnabled := i.isCostAttributionEnabledForUser(userID) handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { - if costLabel != "" { + if caEnabled { // get the label value and update the timestamp, // if the cordianlity is reached or we are currently in cooldown period, function would returned __unaccounted__ - costAttrib := i.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(labels), startAppend) + costAttrib := i.costAttributionMng.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(labels), startAppend) stats.failedSamplesAttribution[costAttrib]++ } @@ -1413,8 +1415,8 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre for _, ts := range timeseries { var costAttrib string // when cost attribution label is set - if costLabel != "" { - costAttrib = i.costAttributionSvc.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), startAppend) + if caEnabled { + costAttrib = i.costAttributionMng.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), startAppend) } // The labels must be sorted (in our case, it's guaranteed a write request @@ -1432,7 +1434,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleOutOfBoundsCount += len(ts.Samples) + len(ts.Histograms) - if costLabel != "" { + if caEnabled { stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) + len(ts.Histograms) } var firstTimestamp int64 @@ -1455,7 +1457,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) stats.sampleOutOfBoundsCount += len(ts.Samples) - if costLabel != "" { + if caEnabled { stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) } firstTimestamp := ts.Samples[0].TimestampMs @@ -2671,7 +2673,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, userID, - i.costAttributionSvc, + i.costAttributionMng, ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 3efd678b1a8..c7459d1c171 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -51,6 +51,7 @@ import ( "github.com/grafana/mimir/pkg/blockbuilder" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -74,7 +75,6 @@ import ( "github.com/grafana/mimir/pkg/usagestats" "github.com/grafana/mimir/pkg/util" 
"github.com/grafana/mimir/pkg/util/activitytracker" - "github.com/grafana/mimir/pkg/util/costattribution" util_log "github.com/grafana/mimir/pkg/util/log" "github.com/grafana/mimir/pkg/util/noauth" "github.com/grafana/mimir/pkg/util/process" @@ -110,7 +110,6 @@ type Config struct { NoAuthTenant string `yaml:"no_auth_tenant" category:"advanced"` ShutdownDelay time.Duration `yaml:"shutdown_delay" category:"advanced"` MaxSeparateMetricsGroupsPerUser int `yaml:"max_separate_metrics_groups_per_user" category:"experimental"` - CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` EnableGoRuntimeMetrics bool `yaml:"enable_go_runtime_metrics" category:"advanced"` PrintConfig bool `yaml:"-"` ApplicationName string `yaml:"-"` @@ -148,7 +147,9 @@ type Config struct { Common CommonConfig `yaml:"common"` CustomRegistryPath string `yaml:"custom_registry_path" category:"advanced"` - TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` + CostAttributionCoolDownDuration time.Duration `yaml:"cost_attribution_cool_down_duration" category:"experimental"` } // RegisterFlags registers flags. @@ -171,11 +172,12 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.StringVar(&c.NoAuthTenant, "auth.no-auth-tenant", "anonymous", "Tenant ID to use when multitenancy is disabled.") f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.") f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. After receiving SIGTERM, Mimir will report not-ready status via /ready endpoint.") - f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 10*time.Minute, "Interval at which to evict inactive cost attributions.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 30*time.Minute, "Interval at which to evict inactive cost attributions.") + f.DurationVar(&c.CostAttributionCoolDownDuration, "cost-attribution-cool-down-duration", 20*time.Minute, "Duration for which to keep the cost attribution active after the last update.") f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") - + f.StringVar(&c.CustomRegistryPath, "custom-registry-path", "", "Defines a custom path for the registry. 
When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.")
 c.API.RegisterFlags(f)
 c.registerServerFlagsWithChangedDefaultValues(f)
 c.Distributor.RegisterFlags(f, logger)
@@ -714,7 +716,7 @@ type Mimir struct {
 TenantLimits validation.TenantLimits
 Overrides *validation.Overrides
 ActiveGroupsCleanup *util.ActiveGroupsCleanupService
- CostAttributionCleanup *costattribution.CostAttributionCleanupService
+ CostAttributionManager costattribution.Manager
 Distributor *distributor.Distributor
 Ingester *ingester.Ingester
diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go
index 8da436becf1..a0e82862160 100644
--- a/pkg/mimir/modules.go
+++ b/pkg/mimir/modules.go
@@ -29,6 +29,7 @@ import (
 "github.com/prometheus/alertmanager/featurecontrol"
 "github.com/prometheus/alertmanager/matchers/compat"
 "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/client_golang/prometheus/promhttp"
 "github.com/prometheus/common/config"
 "github.com/prometheus/prometheus/model/labels"
 "github.com/prometheus/prometheus/rules"
@@ -60,13 +61,11 @@ import (
 "github.com/grafana/mimir/pkg/usagestats"
 "github.com/grafana/mimir/pkg/util"
 "github.com/grafana/mimir/pkg/util/activitytracker"
- "github.com/grafana/mimir/pkg/util/costattribution"
 util_log "github.com/grafana/mimir/pkg/util/log"
 "github.com/grafana/mimir/pkg/util/validation"
 "github.com/grafana/mimir/pkg/util/validation/exporter"
 "github.com/grafana/mimir/pkg/util/version"
 "github.com/grafana/mimir/pkg/vault"
- "github.com/prometheus/client_golang/prometheus/promhttp"
 )
 // The various modules that make up Mimir.
@@ -464,7 +463,7 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) {
 t.Cfg.Distributor.IngestStorageConfig = t.Cfg.IngestStorage
 t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides,
- t.ActiveGroupsCleanup, t.CostAttributionCleanup, t.IngesterRing, t.IngesterPartitionInstanceRing,
+ t.ActiveGroupsCleanup, t.CostAttributionManager, t.IngesterRing, t.IngesterPartitionInstanceRing,
 canJoinDistributorsRing, t.Registerer, util_log.Logger)
 if err != nil {
 return
@@ -648,17 +647,18 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) {
 }
 func (t *Mimir) initCostAttributionService() (services.Service, error) {
+ // The cost attribution service is only initialized if the custom registry path is provided.
 if t.Cfg.CustomRegistryPath != "" {
+ // If a custom registry path is provided, create a custom registry and use it for the cost attribution service.
 customRegistry := prometheus.NewRegistry()
 // Register the custom registry with the provided URL.
 // This allows users to expose custom metrics on a separate endpoint.
 // This is useful when users want to expose metrics that are not part of the default Mimir metrics. 
http.Handle(t.Cfg.CustomRegistryPath, promhttp.HandlerFor(customRegistry, promhttp.HandlerOpts{Registry: customRegistry})) - t.CostAttributionCleanup = costattribution.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, customRegistry) - return t.CostAttributionCleanup, nil + err := customRegistry.Register(t.CostAttributionManager) + return t.CostAttributionManager, err } - t.CostAttributionCleanup = costattribution.NewCostAttributionCleanupService(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, t.Registerer) - return t.CostAttributionCleanup, nil + return nil, nil } func (t *Mimir) tsdbIngesterConfig() { @@ -672,7 +672,7 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.IngestStorageConfig = t.Cfg.IngestStorage t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionCleanup, t.Registerer, util_log.Logger) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionManager, t.Registerer, util_log.Logger) if err != nil { return } diff --git a/pkg/util/costattribution/cost_attribution.go b/pkg/util/costattribution/cost_attribution.go deleted file mode 100644 index ab47346a35d..00000000000 --- a/pkg/util/costattribution/cost_attribution.go +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package costattribution - -import ( - "context" - "fmt" - "sync" - "time" - - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/grafana/dskit/services" - "github.com/grafana/mimir/pkg/util/validation" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/prometheus/model/labels" - "go.uber.org/atomic" -) - -type Tracker struct { - trackedLabel string - activeSeriesPerUserAttribution *prometheus.GaugeVec - receivedSamplesAttribution *prometheus.CounterVec - discardedSampleAttribution *prometheus.CounterVec - attributionTimestamps map[string]*atomic.Int64 - coolDownDeadline *atomic.Int64 -} - -func (m *Tracker) RemoveAttributionMetricsForUser(userID, attribution string) { - m.activeSeriesPerUserAttribution.DeleteLabelValues(userID, attribution) - m.receivedSamplesAttribution.DeleteLabelValues(userID, attribution) - m.discardedSampleAttribution.DeleteLabelValues(userID, attribution) -} - -func NewCostAttributionTracker(reg prometheus.Registerer, trackedLabel string) *Tracker { - m := &Tracker{ - trackedLabel: trackedLabel, - attributionTimestamps: map[string]*atomic.Int64{}, - coolDownDeadline: atomic.NewInt64(0), - discardedSampleAttribution: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_discarded_samples_attribution_total", - Help: "The total number of samples that were discarded per attribution.", - }, []string{"user", trackedLabel}), - receivedSamplesAttribution: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_received_samples_attribution_total", - Help: "The total number of samples that were received per attribution.", - }, []string{"user", trackedLabel}), - activeSeriesPerUserAttribution: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_ingester_active_series_attribution", - Help: "The total number of active series per user and attribution.", - }, 
[]string{"user", trackedLabel}), - } - return m -} - -type CostAttribution struct { - mu sync.RWMutex - trackers map[string]*Tracker - limits *validation.Overrides - reg prometheus.Registerer -} - -func NewCostAttribution(limits *validation.Overrides, reg prometheus.Registerer) *CostAttribution { - return &CostAttribution{ - trackers: make(map[string]*Tracker), - limits: limits, - reg: reg, - mu: sync.RWMutex{}, - } -} - -// UpdateAttributionTimestampForUser function is only guaranteed to update to the -// timestamp provided even if it is smaller than the existing value -func (ca *CostAttribution) UpdateAttributionTimestampForUser(userID, attribution string, now time.Time) { - // If the limit is set to 0, we don't need to track the attribution - if ca.limits.MaxCostAttributionPerUser(userID) <= 0 { - return - } - - ts := now.UnixNano() - ca.mu.Lock() - // create new tracker if not exists - if _, exists := ca.trackers[userID]; !exists { - // the attribution label and values should be managed by cache - ca.trackers[userID] = NewCostAttributionTracker(ca.reg, ca.limits.CostAttributionLabel(userID)) - } - ca.mu.Unlock() - ca.mu.RLock() - if groupTs := ca.trackers[userID].attributionTimestamps[attribution]; groupTs != nil { - groupTs.Store(ts) - return - } - ca.mu.RUnlock() - ca.mu.Lock() - defer ca.mu.Unlock() - ca.trackers[userID].attributionTimestamps[attribution] = atomic.NewInt64(ts) -} - -func (ca *CostAttribution) purgeInactiveAttributionsForUser(userID string, deadline int64) []string { - ca.mu.RLock() - var inactiveAttributions []string - if ca.trackers[userID] == nil || ca.trackers[userID].attributionTimestamps == nil { - return nil - } - - attributionTimestamps := ca.trackers[userID].attributionTimestamps - for attr, ts := range attributionTimestamps { - if ts.Load() <= deadline { - inactiveAttributions = append(inactiveAttributions, attr) - } - } - ca.mu.RUnlock() - - if len(inactiveAttributions) == 0 { - return nil - } - - // Cleanup inactive groups - ca.mu.Lock() - defer ca.mu.Unlock() - - for i := 0; i < len(inactiveAttributions); { - inactiveAttribution := inactiveAttributions[i] - groupTs := ca.trackers[userID].attributionTimestamps[inactiveAttribution] - if groupTs != nil && groupTs.Load() <= deadline { - delete(ca.trackers[userID].attributionTimestamps, inactiveAttribution) - i++ - } else { - inactiveAttributions[i] = inactiveAttributions[len(inactiveAttributions)-1] - inactiveAttributions = inactiveAttributions[:len(inactiveAttributions)-1] - } - } - - return inactiveAttributions -} - -func (ca *CostAttribution) purgeInactiveAttributions(inactiveTimeout time.Duration) { - ca.mu.RLock() - userIDs := make([]string, 0, len(ca.trackers)) - for userID := range ca.trackers { - userIDs = append(userIDs, userID) - } - ca.mu.RUnlock() - - currentTime := time.Now() - for _, userID := range userIDs { - inactiveAttributions := ca.purgeInactiveAttributionsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano()) - for _, attribution := range inactiveAttributions { - ca.trackers[userID].RemoveAttributionMetricsForUser(userID, attribution) - } - } -} - -func (ca *CostAttribution) attributionLimitExceeded(userID, attribution string) bool { - // if we are still at the cooldown period, we will consider the limit reached - ca.mu.RLock() - defer ca.mu.RUnlock() - // if the user is not exist, we don't need to check the limit - if ca.trackers[userID] == nil { - return false - } - - now := time.Now() - if v := ca.trackers[userID].coolDownDeadline; v != nil && v.Load() > now.UnixNano() { - 
return true - } - - // if the user attribution is already exist and we are not in the cooldown period, we don't need to check the limit - _, exists := ca.trackers[userID].attributionTimestamps[attribution] - if exists { - return false - } - - // if the user has reached the limit, we will set the cooldown period which is 20 minutes - maxReached := len(ca.trackers[userID].attributionTimestamps) >= ca.limits.MaxCostAttributionPerUser(userID) - if maxReached { - ca.mu.Lock() - ca.trackers[userID].coolDownDeadline.Store(now.Add(20 * time.Minute).UnixNano()) - ca.mu.Unlock() - return true - } - - return maxReached -} - -type CostAttributionCleanupService struct { - services.Service - logger log.Logger - costAttribution *CostAttribution - inactiveTimeout time.Duration - invalidValue string -} - -type CostAttributionMetricsCleaner interface { - RemoveAttributionMetricsForUser(userID, attribution string) -} - -func NewCostAttributionCleanupService(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg prometheus.Registerer) *CostAttributionCleanupService { - s := &CostAttributionCleanupService{ - costAttribution: NewCostAttribution(limits, reg), - inactiveTimeout: inactiveTimeout, - logger: logger, - invalidValue: "__unaccounted__", - } - - s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution cleanup") - return s -} - -// IncrementReceivedSamples increments the received samples counter for a given user and attribution -func (s *CostAttributionCleanupService) IncrementReceivedSamples(userID, attribution string, value float64) { - attribution = s.GetUserAttribution(userID, attribution) - s.costAttribution.mu.RLock() - defer s.costAttribution.mu.RUnlock() - if tracker, exists := s.costAttribution.trackers[userID]; exists { - tracker.receivedSamplesAttribution.WithLabelValues(userID, attribution).Add(value) - } -} - -// IncrementDiscardedSamples increments the discarded samples counter for a given user and attribution -func (s *CostAttributionCleanupService) IncrementDiscardedSamples(userID, attribution string, value float64) { - attribution = s.GetUserAttribution(userID, attribution) - s.costAttribution.mu.RLock() - defer s.costAttribution.mu.RUnlock() - if tracker, exists := s.costAttribution.trackers[userID]; exists { - tracker.discardedSampleAttribution.WithLabelValues(userID, attribution).Add(value) - } -} - -// SetActiveSeries sets the active series gauge for a given user and attribution -func (s *CostAttributionCleanupService) SetActiveSeries(userID, attribution string, value float64) { - attribution = s.GetUserAttribution(userID, attribution) - s.costAttribution.mu.RLock() - defer s.costAttribution.mu.RUnlock() - if tracker, exists := s.costAttribution.trackers[userID]; exists { - tracker.activeSeriesPerUserAttribution.WithLabelValues(userID, attribution).Set(value) - } -} - -func (s *CostAttributionCleanupService) GetUserAttribution(userID, attribution string) string { - // not tracking cost attribution for this user, this shouldn't happen - if s.costAttribution.limits.MaxCostAttributionPerUser(userID) <= 0 { - return attribution - } - if s.costAttribution.attributionLimitExceeded(userID, attribution) { - return s.invalidValue - } - return attribution -} - -func (s *CostAttributionCleanupService) GetUserAttributionLabel(userID string) string { - s.costAttribution.mu.RLock() - defer s.costAttribution.mu.RUnlock() - if s.costAttribution != nil { - if val, exists := s.costAttribution.trackers[userID]; 
exists { - return val.trackedLabel - } - } - return "" -} - -func (s *CostAttributionCleanupService) EnabledForUser(userID string) bool { - return s.costAttribution.limits.CostAttributionLabel(userID) != "" -} - -func (s *CostAttributionCleanupService) GetUserAttributionLimit(userID string) int { - return s.costAttribution.limits.MaxCostAttributionPerUser(userID) -} - -func (s *CostAttributionCleanupService) UpdateAttributionTimestamp(user string, lbs labels.Labels, now time.Time) string { - if s.costAttribution.trackers[user] == nil || s.costAttribution.trackers[user].trackedLabel == "" { - return "" - } - attribution := lbs.Get(s.costAttribution.trackers[user].trackedLabel) - // empty label is not normal, if user set attribution label, the metrics send has to include the label - if attribution == "" { - level.Error(s.logger).Log("msg", "set attribution label to \"\" since missing cost attribution label in metrics") - return attribution - } - - if s.costAttribution.attributionLimitExceeded(user, attribution) { - attribution = s.invalidValue - level.Error(s.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", s.invalidValue)) - } - - s.costAttribution.UpdateAttributionTimestampForUser(user, attribution, now) - return attribution -} - -func (s *CostAttributionCleanupService) iteration(_ context.Context) error { - s.costAttribution.purgeInactiveAttributions(s.inactiveTimeout) - return nil -} diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 1d14d03baca..b2e44a8074c 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -143,10 +143,6 @@ type Limits struct { // User defined label to give the option of subdividing specific metrics by another label SeparateMetricsGroupLabel string `yaml:"separate_metrics_group_label" json:"separate_metrics_group_label" category:"experimental"` - // User defined label to give the cost distribution by values of the label - CostAttributionLabel string `yaml:"cost_attribution_label" json:"cost_attribution_label" category:"experimental"` - MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" json:"max_cost_attribution_per_user" category:"experimental"` - // Querier enforced limits. MaxChunksPerQuery int `yaml:"max_fetched_chunks_per_query" json:"max_fetched_chunks_per_query"` MaxEstimatedChunksPerQueryMultiplier float64 `yaml:"max_estimated_fetched_chunks_per_query_multiplier" json:"max_estimated_fetched_chunks_per_query_multiplier" category:"experimental"` @@ -183,6 +179,10 @@ type Limits struct { LabelValuesMaxCardinalityLabelNamesPerRequest int `yaml:"label_values_max_cardinality_label_names_per_request" json:"label_values_max_cardinality_label_names_per_request"` ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` + // Cost attribution and limit. + CostAttributionLabel string `yaml:"cost_attribution_label" json:"cost_attribution_label" category:"experimental"` + MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" json:"max_cost_attribution_per_user" category:"experimental"` + // Ruler defaults and limits. 
RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` From b733e445866f5bcde2fd7beeebbde6d5b7efc654 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 7 Oct 2024 18:17:26 +0200 Subject: [PATCH 08/13] add cache clean up process --- cmd/mimir/config-descriptor.json | 8 ++-- cmd/mimir/help-all.txt.tmpl | 8 ++-- pkg/costattribution/caimpl/managerImpl.go | 10 +++-- .../caimpl/managerImpl_test.go | 27 ++++++++---- pkg/costattribution/manager.go | 2 +- pkg/distributor/distributor.go | 16 ++++--- pkg/ingester/activeseries/active_series.go | 24 +++++++++-- pkg/ingester/ingester.go | 43 +++++++++++-------- pkg/mimir/mimir.go | 4 +- pkg/util/validation/limits.go | 4 +- 10 files changed, 94 insertions(+), 52 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index ae8d6f78b1a..55a76dcf1a6 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4296,7 +4296,7 @@ "kind": "field", "name": "cost_attribution_label", "required": false, - "desc": "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'attrib' cost attribution's label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total.", + "desc": "Label used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.", "fieldValue": null, "fieldDefaultValue": "", "fieldFlag": "validation.cost-attribution-label", @@ -4307,7 +4307,7 @@ "kind": "field", "name": "max_cost_attribution_per_user", "required": false, - "desc": "The maximum number of cost attribution labels per user, across the cluster. 
0 to disable cost attribution.", + "desc": "Maximum number of cost attribution labels allowed per user.", "fieldValue": null, "fieldDefaultValue": 0, "fieldFlag": "validation.max-cost-attribution-per-user", @@ -18176,7 +18176,7 @@ "kind": "field", "name": "cost_attribution_eviction_interval", "required": false, - "desc": "Interval at which to evict inactive cost attributions.", + "desc": "Time interval at which inactive cost attributions will be evicted from the cache.", "fieldValue": null, "fieldDefaultValue": 1800000000000, "fieldFlag": "cost-attribution-eviction-interval", @@ -18187,7 +18187,7 @@ "kind": "field", "name": "cost_attribution_cool_down_duration", "required": false, - "desc": "Duration for which to keep the cost attribution active after the last update.", + "desc": "Duration during which any cost attribution for a user will be marked as __unaccounted__ after exceeding the specified limit, prior to resetting the cache.", "fieldValue": null, "fieldDefaultValue": 1200000000000, "fieldFlag": "cost-attribution-cool-down-duration", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 7b7e466c26d..a59304f4b68 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1134,9 +1134,9 @@ Usage of ./cmd/mimir/mimir: -config.file value Configuration file to load. -cost-attribution-cool-down-duration duration - [experimental] Duration for which to keep the cost attribution active after the last update. (default 20m0s) + [experimental] Duration during which any cost attribution for a user will be marked as __unaccounted__ after exceeding the specified limit, prior to resetting the cache. (default 20m0s) -cost-attribution-eviction-interval duration - [experimental] Interval at which to evict inactive cost attributions. (default 30m0s) + [experimental] Time interval at which inactive cost attributions will be evicted from the cache. (default 30m0s) -custom-registry-path string Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int @@ -3066,13 +3066,13 @@ Usage of ./cmd/mimir/mimir: -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") -validation.cost-attribution-label string - [experimental] Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'attrib' cost attribution's label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. + [experimental] Label used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. 
Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) -validation.max-cost-attribution-per-user int - [experimental] The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution. + [experimental] Maximum number of cost attribution labels allowed per user. -validation.max-label-names-per-series int Maximum number of label names per series. (default 30) -validation.max-length-label-name int diff --git a/pkg/costattribution/caimpl/managerImpl.go b/pkg/costattribution/caimpl/managerImpl.go index a7e905544e9..d22a22a6ae6 100644 --- a/pkg/costattribution/caimpl/managerImpl.go +++ b/pkg/costattribution/caimpl/managerImpl.go @@ -68,18 +68,20 @@ func (m *ManagerImpl) GetUserAttributionLimit(userID string) int { return 0 } -func (m *ManagerImpl) UpdateAttributionTimestamp(user string, lbs labels.Labels, now time.Time) string { +func (m *ManagerImpl) UpdateAttributionTimestamp(user string, calb string, lbs labels.Labels, now time.Time) (bool, string) { // if cost attribution is not enabled for the user, return empty string if !m.EnabledForUser(user) { m.attributionTracker.deleteUserTracerFromCache(user) - return "" + return false, "" } // when cost attribution is enabled, the label has to be set. the cache would be updated with the label lb := m.attributionTracker.getUserAttributionLabelFromCache(user) // this should not happened, if user is enabled for cost attribution, the label has to be set + + isUpdated := calb != lb if lb == "" { - return "" + return isUpdated, "" } val := lbs.Get(lb) @@ -88,7 +90,7 @@ func (m *ManagerImpl) UpdateAttributionTimestamp(user string, lbs labels.Labels, level.Error(m.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", m.invalidValue)) } m.attributionTracker.updateAttributionCacheForUser(user, lb, val, now) - return val + return isUpdated, val } // SetActiveSeries adjust the input attribution and sets the active series gauge for the given user and attribution diff --git a/pkg/costattribution/caimpl/managerImpl_test.go b/pkg/costattribution/caimpl/managerImpl_test.go index d22f50cde20..0250ff4f0b5 100644 --- a/pkg/costattribution/caimpl/managerImpl_test.go +++ b/pkg/costattribution/caimpl/managerImpl_test.go @@ -75,13 +75,15 @@ func Test_UpdateAttributionTimestamp(t *testing.T) { tm1, tm2, tm3 := "bar", "foo", "baz" t.Run("Should update the timestamp when limit not reached for the user attribution", func(t *testing.T) { lbls.Set("department", tm1) - result := manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(0, 0)) + isOutdated, result := manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(0, 0)) + assert.False(t, isOutdated, "Expected label to be the same as the one in the cache") assert.Equal(t, tm1, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm1]) assert.Equal(t, int64(0), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm1].Load()) lbls.Set("department", tm2) - result = manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(1, 0)) + isOutdated, result = 
manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(1, 0)) + assert.False(t, isOutdated) assert.Equal(t, tm2, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm2]) assert.Equal(t, int64(1), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[tm2].Load()) @@ -89,13 +91,15 @@ func Test_UpdateAttributionTimestamp(t *testing.T) { t.Run("Should only update the timestamp of invalide when limit reached for the user attribution", func(t *testing.T) { lbls.Set("department", tm3) - result := manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(2, 0)) + isOutdated, result := manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(2, 0)) + assert.False(t, isOutdated) assert.Equal(t, manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[manager.invalidValue]) assert.Equal(t, int64(2), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[manager.invalidValue].Load()) lbls.Set("department", tm1) - result = manager.UpdateAttributionTimestamp("user3", lbls.Labels(), time.Unix(3, 0)) + isOutdated, result = manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(3, 0)) + assert.False(t, isOutdated) assert.Equal(t, manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") assert.Equal(t, int64(3), manager.attributionTracker.trackersByUserID["user3"].attributionTimestamps[manager.invalidValue].Load()) }) @@ -112,7 +116,8 @@ func Test_SetActiveSeries(t *testing.T) { t.Run("Should set the active series gauge for the given user and attribution", func(t *testing.T) { lbls.Set("team", "foo") - val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + isOutdated, val := manager.UpdateAttributionTimestamp(userID, "team", lbls.Labels(), time.Unix(0, 0)) + assert.False(t, isOutdated) manager.SetActiveSeries(userID, val, 1.0) expectedMetrics := ` # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. 
@@ -128,11 +133,13 @@ func Test_SetActiveSeries(t *testing.T) { t.Run("Should set the active series gauge for all users and attributions enabled and ignore disabled user", func(t *testing.T) { userID = "user3" lbls.Set("department", "bar") - val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) + assert.False(t, isOutdated) manager.SetActiveSeries(userID, val, 2.0) lbls.Set("department", "baz") - val = manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + isOutdated, val = manager.UpdateAttributionTimestamp(userID, "team", lbls.Labels(), time.Unix(0, 0)) + assert.True(t, isOutdated) manager.SetActiveSeries(userID, val, 3.0) expectedMetrics := ` @@ -161,7 +168,8 @@ func Test_SetActiveSeries(t *testing.T) { }, })) manager.attributionTracker.limits = overrides - val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(5, 0)) + isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(5, 0)) + assert.False(t, isOutdated) manager.SetActiveSeries(userID, val, 3.0) expectedMetrics := ` @@ -178,7 +186,8 @@ func Test_SetActiveSeries(t *testing.T) { t.Run("Should ignore setting the active series gauge for disabled user", func(t *testing.T) { userID = "user2" lbls.Set("department", "bar") - val := manager.UpdateAttributionTimestamp(userID, lbls.Labels(), time.Unix(0, 0)) + isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) + assert.False(t, isOutdated) manager.SetActiveSeries(userID, val, 4.0) expectedMetrics := ` diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 60c2b92517c..11b681ec791 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -14,7 +14,7 @@ type Manager interface { EnabledForUser(userID string) bool GetUserAttributionLabel(userID string) string GetUserAttributionLimit(userID string) int - UpdateAttributionTimestamp(user string, lbs labels.Labels, now time.Time) string + UpdateAttributionTimestamp(user string, calb string, lbs labels.Labels, now time.Time) (bool, string) SetActiveSeries(userID, attribution string, value float64) IncrementDiscardedSamples(userID, attribution string, value float64) IncrementReceivedSamples(userID, attribution string, value float64) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 7fdeb4a3622..a7df37a57ab 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1434,9 +1434,7 @@ func (d *Distributor) push(ctx context.Context, pushReq *Request) error { return err } - now := mtime.Now() - - d.updateReceivedMetrics(req, userID, now) + d.updateReceivedMetrics(req, userID) if len(req.Timeseries) == 0 && len(req.Metadata) == 0 { return nil @@ -1667,12 +1665,15 @@ func tokenForMetadata(userID string, metricName string) uint32 { return mimirpb.ShardByMetricName(userID, metricName) } -func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string, now time.Time) { +func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { + now := mtime.Now() var receivedSamples, receivedExemplars, receivedMetadata int costattributionLimit := 0 caEnabled := d.costAttributionMng != nil && d.costAttributionMng.EnabledForUser(userID) + caLabel := "" if caEnabled { costattributionLimit = 
d.costAttributionMng.GetUserAttributionLimit(userID) + caLabel = d.costAttributionMng.GetUserAttributionLabel(userID) } costAttribution := make(map[string]int, costattributionLimit) @@ -1680,7 +1681,12 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) if caEnabled { - attribution := d.costAttributionMng.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), now) + isKeyOutdated, attribution := d.costAttributionMng.UpdateAttributionTimestamp(userID, caLabel, mimirpb.FromLabelAdaptersToLabels(ts.Labels), now) + if isKeyOutdated { + // If the key is outdated, we need to reset cost attribution cache and update cost attribution label + costAttribution = make(map[string]int, costattributionLimit) + caLabel = d.costAttributionMng.GetUserAttributionLabel(userID) + } costAttribution[attribution]++ } } diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 56abe4f4527..79f726bcd02 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -81,6 +81,7 @@ type seriesStripe struct { // here the attribution values map, it maps the attribute value to its index, so we can increment the counter directly, // so in each entry, we keep the index of the value only, instead of keeping the string value costAttributionValues map[string]uint32 + caLabel string } // seriesEntry holds a timestamp for single series. @@ -90,6 +91,7 @@ type seriesEntry struct { numNativeHistogramBuckets int // Number of buckets in native histogram series, -1 if not a native histogram. // keep the value corresponding the label configured in serieStripe deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. 
+ calabel string attributionValue string } @@ -241,6 +243,14 @@ func (c *ActiveSeries) ActiveByAttributionValue() map[string]uint32 { return total } +func (c *ActiveSeries) ResetAttribution() { + for s := 0; s < numStripes; s++ { + c.stripes[s].mu.Lock() + c.stripes[s].costAttributionValues = map[string]uint32{} + c.stripes[s].mu.Unlock() + } +} + func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { stripeID := storage.SeriesRef(ref) % numStripes c.stripes[stripeID].remove(storage.SeriesRef(ref)) @@ -426,9 +436,15 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef // here if we have a cost attribution label, we can split the serie count based on the value of the label // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly - if s.costAttributionMng != nil && s.costAttributionMng.GetUserAttributionLabel(s.userID) != "" { - attributionValue := s.costAttributionMng.UpdateAttributionTimestamp(s.userID, series, time.Unix(0, nowNanos)) + if s.costAttributionMng != nil && s.costAttributionMng.EnabledForUser(s.userID) { + isOutDated, attributionValue := s.costAttributionMng.UpdateAttributionTimestamp(s.userID, s.caLabel, series, time.Unix(0, nowNanos)) + if isOutDated { + // if the label is outdated, we need to remove the reference to the old value + s.costAttributionValues = map[string]uint32{} + s.caLabel = s.costAttributionMng.GetUserAttributionLabel(s.userID) + } s.costAttributionValues[attributionValue]++ + e.attributionValue = s.caLabel e.attributionValue = attributionValue } @@ -515,7 +531,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeNativeHistogramBuckets += uint32(entry.numNativeHistogramBuckets) } // restore the cost attribution by attribution value - if entry.attributionValue != "" { + if entry.attributionValue != "" && entry.calabel == s.caLabel { s.costAttributionValues[entry.attributionValue]++ } ml := entry.matches.Len() @@ -556,7 +572,7 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { } s.active-- - if entry.attributionValue != "" { + if entry.attributionValue != "" && s.caLabel == entry.calabel { s.costAttributionValues[entry.attributionValue]-- } if entry.numNativeHistogramBuckets >= 0 { diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index a93753fac7f..35a856383ee 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -331,6 +331,10 @@ type Ingester struct { usersMetadataMtx sync.RWMutex usersMetadata map[string]*userMetricsMetadata + // For storing tenant current cost attribution labels. + costAttributionMtx sync.RWMutex + costAttributionlbs map[string]string + // Rate of pushed samples. Used to limit global samples push rate. 
ingestionRate *util_math.EwmaRate inflightPushRequests atomic.Int64 @@ -367,8 +371,9 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus limits: limits, logger: logger, - tsdbs: make(map[string]*userTSDB), - usersMetadata: make(map[string]*userMetricsMetadata), + tsdbs: make(map[string]*userTSDB), + usersMetadata: make(map[string]*userMetricsMetadata), + bucket: bucketClient, tsdbMetrics: newTSDBMetrics(registerer, logger), shipperMetrics: newShipperMetrics(registerer), @@ -953,6 +958,7 @@ type pushStats struct { succeededSamplesCount int failedSamplesCount int failedSamplesAttribution map[string]int + attributionLabel string succeededExemplarsCount int failedExemplarsCount int sampleOutOfBoundsCount int @@ -1301,12 +1307,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // Return true if handled as soft error, and we can ingest more series. // get the cost attribution value for the series caEnabled := i.isCostAttributionEnabledForUser(userID) - handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { + handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter, caValue string) bool { if caEnabled { - // get the label value and update the timestamp, - // if the cordianlity is reached or we are currently in cooldown period, function would returned __unaccounted__ - costAttrib := i.costAttributionMng.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(labels), startAppend) - stats.failedSamplesAttribution[costAttrib]++ + stats.failedSamplesAttribution[caValue]++ } stats.failedSamplesCount++ @@ -1412,11 +1415,17 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels + isOutDated := false for _, ts := range timeseries { - var costAttrib string + var caValue string // when cost attribution label is set if caEnabled { - costAttrib = i.costAttributionMng.UpdateAttributionTimestamp(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), startAppend) + isOutDated, caValue = i.costAttributionMng.UpdateAttributionTimestamp(userID, stats.attributionLabel, mimirpb.FromLabelAdaptersToLabels(ts.Labels), startAppend) + // if the cost attribution label is outdated, we need to reset the attribution counter + if isOutDated { + stats.attributionLabel = i.costAttributionMng.GetUserAttributionLabel(userID) + stats.failedSamplesAttribution = make(map[string]int, i.limits.MaxCostAttributionPerUser(userID)) + } } // The labels must be sorted (in our case, it's guaranteed a write request @@ -1435,7 +1444,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleOutOfBoundsCount += len(ts.Samples) + len(ts.Histograms) if caEnabled { - stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) + len(ts.Histograms) + stats.failedSamplesAttribution[caValue] += len(ts.Samples) + len(ts.Histograms) } var firstTimestamp int64 if len(ts.Samples) > 0 { @@ -1458,7 +1467,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) stats.sampleOutOfBoundsCount += len(ts.Samples) if caEnabled { - stats.failedSamplesAttribution[costAttrib] += len(ts.Samples) + stats.failedSamplesAttribution[caValue] += len(ts.Samples) } firstTimestamp := ts.Samples[0].TimestampMs @@ -1484,10 +1493,10 @@ func (i *Ingester) 
pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // Ensure the sample is not too far in the future. if s.TimestampMs > maxTimestampMs { - handleAppendError(globalerror.SampleTooFarInFuture, s.TimestampMs, ts.Labels) + handleAppendError(globalerror.SampleTooFarInFuture, s.TimestampMs, ts.Labels, caValue) continue } else if s.TimestampMs < minTimestampMs { - handleAppendError(globalerror.SampleTooFarInPast, s.TimestampMs, ts.Labels) + handleAppendError(globalerror.SampleTooFarInPast, s.TimestampMs, ts.Labels, caValue) continue } @@ -1509,7 +1518,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } // If it's a soft error it will be returned back to the distributor later as a 400. - if handleAppendError(err, s.TimestampMs, ts.Labels) { + if handleAppendError(err, s.TimestampMs, ts.Labels, caValue) { continue } @@ -1527,10 +1536,10 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre ) if h.Timestamp > maxTimestampMs { - handleAppendError(globalerror.SampleTooFarInFuture, h.Timestamp, ts.Labels) + handleAppendError(globalerror.SampleTooFarInFuture, h.Timestamp, ts.Labels, caValue) continue } else if h.Timestamp < minTimestampMs { - handleAppendError(globalerror.SampleTooFarInPast, h.Timestamp, ts.Labels) + handleAppendError(globalerror.SampleTooFarInPast, h.Timestamp, ts.Labels, caValue) continue } @@ -1557,7 +1566,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } } - if handleAppendError(err, h.Timestamp, ts.Labels) { + if handleAppendError(err, h.Timestamp, ts.Labels, caValue) { continue } diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index c7459d1c171..79b472e1b92 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -172,8 +172,8 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.StringVar(&c.NoAuthTenant, "auth.no-auth-tenant", "anonymous", "Tenant ID to use when multitenancy is disabled.") f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.") f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. 
After receiving SIGTERM, Mimir will report not-ready status via /ready endpoint.") - f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 30*time.Minute, "Interval at which to evict inactive cost attributions.") - f.DurationVar(&c.CostAttributionCoolDownDuration, "cost-attribution-cool-down-duration", 20*time.Minute, "Duration for which to keep the cost attribution active after the last update.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 30*time.Minute, "Time interval at which inactive cost attributions will be evicted from the cache.") + f.DurationVar(&c.CostAttributionCoolDownDuration, "cost-attribution-cool-down-duration", 20*time.Minute, "Duration during which any cost attribution for a user will be marked as __unaccounted__ after exceeding the specified limit, prior to resetting the cache.") f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index b2e44a8074c..339c637736e 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -286,8 +286,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&l.OutOfOrderBlocksExternalLabelEnabled, "ingester.out-of-order-blocks-external-label-enabled", false, "Whether the shipper should label out-of-order blocks with an external label before uploading them. Setting this label will compact out-of-order blocks separately from non-out-of-order blocks") f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") - f.StringVar(&l.CostAttributionLabel, "validation.cost-attribution-label", "", "Label used to define the cost attribution label. For each write request, the cost attribution is obtained from the first non-empty cost attribution label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'attrib' cost attribution's label's value. Applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total.") - f.IntVar(&l.MaxCostAttributionPerUser, "validation.max-cost-attribution-per-user", 0, "The maximum number of cost attribution labels per user, across the cluster. 0 to disable cost attribution.") + f.StringVar(&l.CostAttributionLabel, "validation.cost-attribution-label", "", "Label used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. 
The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.") + f.IntVar(&l.MaxCostAttributionPerUser, "validation.max-cost-attribution-per-user", 0, "Maximum number of cost attribution labels allowed per user.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") From 5207d8d24f28ae227945e8f582c67cc5a221cb97 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 7 Oct 2024 19:21:08 +0200 Subject: [PATCH 09/13] add label checks --- pkg/costattribution/caimpl/managerImpl.go | 19 ++++++++++++++++--- .../caimpl/managerImpl_test.go | 14 +++++++------- pkg/costattribution/caimpl/tracker.go | 3 +++ pkg/costattribution/caimpl/tracker_test.go | 4 ---- pkg/costattribution/manager.go | 6 +++--- pkg/distributor/distributor.go | 4 ++-- pkg/ingester/activeseries/active_series.go | 16 +++++----------- pkg/ingester/ingester.go | 17 +++++++---------- 8 files changed, 43 insertions(+), 40 deletions(-) diff --git a/pkg/costattribution/caimpl/managerImpl.go b/pkg/costattribution/caimpl/managerImpl.go index d22a22a6ae6..038fd01f04f 100644 --- a/pkg/costattribution/caimpl/managerImpl.go +++ b/pkg/costattribution/caimpl/managerImpl.go @@ -94,7 +94,11 @@ func (m *ManagerImpl) UpdateAttributionTimestamp(user string, calb string, lbs l } // SetActiveSeries adjust the input attribution and sets the active series gauge for the given user and attribution -func (m *ManagerImpl) SetActiveSeries(userID, attribution string, value float64) { +func (m *ManagerImpl) SetActiveSeries(userID, calb, attribution string, value float64) { + // if the input label is outdated, we skip the update + if calb != m.GetUserAttributionLabel(userID) { + return + } attribution = m.adjustUserAttribution(userID, attribution) m.attributionTracker.mu.Lock() @@ -105,7 +109,12 @@ func (m *ManagerImpl) SetActiveSeries(userID, attribution string, value float64) } // IncrementDiscardedSamples increments the discarded samples counter for a given user and attribution -func (m *ManagerImpl) IncrementDiscardedSamples(userID, attribution string, value float64) { +func (m *ManagerImpl) IncrementDiscardedSamples(userID, calb, attribution string, value float64) { + // if the input label is outdated, we skip the update + if calb != m.GetUserAttributionLabel(userID) { + return + } + attribution = m.adjustUserAttribution(userID, attribution) m.attributionTracker.mu.RLock() defer m.attributionTracker.mu.RUnlock() @@ -115,7 +124,11 @@ func (m *ManagerImpl) IncrementDiscardedSamples(userID, attribution string, valu } // IncrementReceivedSamples increments the received samples counter for a given user and attribution -func (m *ManagerImpl) 
IncrementReceivedSamples(userID, attribution string, value float64) { +func (m *ManagerImpl) IncrementReceivedSamples(userID, calb, attribution string, value float64) { + // if the input label is outdated, we skip the update + if calb != m.GetUserAttributionLabel(userID) { + return + } attribution = m.adjustUserAttribution(userID, attribution) m.attributionTracker.mu.RLock() defer m.attributionTracker.mu.RUnlock() diff --git a/pkg/costattribution/caimpl/managerImpl_test.go b/pkg/costattribution/caimpl/managerImpl_test.go index 0250ff4f0b5..ef8a7ca835b 100644 --- a/pkg/costattribution/caimpl/managerImpl_test.go +++ b/pkg/costattribution/caimpl/managerImpl_test.go @@ -118,7 +118,7 @@ func Test_SetActiveSeries(t *testing.T) { lbls.Set("team", "foo") isOutdated, val := manager.UpdateAttributionTimestamp(userID, "team", lbls.Labels(), time.Unix(0, 0)) assert.False(t, isOutdated) - manager.SetActiveSeries(userID, val, 1.0) + manager.SetActiveSeries(userID, "team", val, 1.0) expectedMetrics := ` # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. # TYPE cortex_ingester_active_series_attribution gauge @@ -135,12 +135,12 @@ func Test_SetActiveSeries(t *testing.T) { lbls.Set("department", "bar") isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) assert.False(t, isOutdated) - manager.SetActiveSeries(userID, val, 2.0) + manager.SetActiveSeries(userID, "department", val, 2.0) lbls.Set("department", "baz") - isOutdated, val = manager.UpdateAttributionTimestamp(userID, "team", lbls.Labels(), time.Unix(0, 0)) - assert.True(t, isOutdated) - manager.SetActiveSeries(userID, val, 3.0) + isOutdated, val = manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(1, 0)) + assert.False(t, isOutdated) + manager.SetActiveSeries(userID, "department", val, 3.0) expectedMetrics := ` # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. @@ -170,7 +170,7 @@ func Test_SetActiveSeries(t *testing.T) { manager.attributionTracker.limits = overrides isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(5, 0)) assert.False(t, isOutdated) - manager.SetActiveSeries(userID, val, 3.0) + manager.SetActiveSeries(userID, val, "department", 3.0) expectedMetrics := ` # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. @@ -188,7 +188,7 @@ func Test_SetActiveSeries(t *testing.T) { lbls.Set("department", "bar") isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) assert.False(t, isOutdated) - manager.SetActiveSeries(userID, val, 4.0) + manager.SetActiveSeries(userID, val, "department", 4.0) expectedMetrics := ` # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. 
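[Editor's note, not part of the patch] The //nolint:faillint markers in the tracker.go hunk that follows exist because these counter and gauge vectors are intentionally not registered on the default registry: the manager acts as an unchecked prometheus.Collector, and, per the -custom-registry-path flag described earlier in this series, its series are only exposed when a dedicated registry path is configured. A minimal sketch of that wiring, assuming the function name and mount path (neither appears in this patch):

package example // illustrative sketch only

import (
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// exposeCostAttribution registers the cost attribution manager (an unchecked
// collector implementing Collect/Describe) on its own registry and returns an
// HTTP handler that could be mounted on the configured custom registry path.
func exposeCostAttribution(manager prometheus.Collector) http.Handler {
    reg := prometheus.NewRegistry()
    reg.MustRegister(manager)
    return promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
}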
diff --git a/pkg/costattribution/caimpl/tracker.go b/pkg/costattribution/caimpl/tracker.go index 255b6413907..423e9df5a5e 100644 --- a/pkg/costattribution/caimpl/tracker.go +++ b/pkg/costattribution/caimpl/tracker.go @@ -36,14 +36,17 @@ func newTracker(trackedLabel string, limit int) (*Tracker, error) { attributionLimit: limit, attributionTimestamps: map[string]*atomic.Int64{}, coolDownDeadline: atomic.NewInt64(0), + //nolint:faillint // the metrics are registered in the mimir package discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_discarded_samples_attribution_total", Help: "The total number of samples that were discarded per attribution.", }, []string{"user", trackedLabel}), + //nolint:faillint receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_received_samples_attribution_total", Help: "The total number of samples that were received per attribution.", }, []string{"user", trackedLabel}), + //nolint:faillint activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_ingester_active_series_attribution", Help: "The total number of active series per user and attribution.", diff --git a/pkg/costattribution/caimpl/tracker_test.go b/pkg/costattribution/caimpl/tracker_test.go index 7427e6a1540..0af76642cd3 100644 --- a/pkg/costattribution/caimpl/tracker_test.go +++ b/pkg/costattribution/caimpl/tracker_test.go @@ -48,8 +48,4 @@ func Test_NewTracker(t *testing.T) { // Clean the tracker for the user attribution tracker.cleanupTrackerAttribution(userID, attribution) - - // Verify that metrics have been cleaned - expectedMetrics = `` - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) } diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 11b681ec791..18261327a38 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -15,9 +15,9 @@ type Manager interface { GetUserAttributionLabel(userID string) string GetUserAttributionLimit(userID string) int UpdateAttributionTimestamp(user string, calb string, lbs labels.Labels, now time.Time) (bool, string) - SetActiveSeries(userID, attribution string, value float64) - IncrementDiscardedSamples(userID, attribution string, value float64) - IncrementReceivedSamples(userID, attribution string, value float64) + SetActiveSeries(userID, calb string, attribution string, value float64) + IncrementDiscardedSamples(userID, calb string, attribution string, value float64) + IncrementReceivedSamples(userID, calb string, attribution string, value float64) Collect(out chan<- prometheus.Metric) Describe(chan<- *prometheus.Desc) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index a7df37a57ab..e0e86c0facb 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1692,8 +1692,8 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st } receivedMetadata = len(req.Metadata) if caEnabled { - for lv, count := range costAttribution { - d.costAttributionMng.IncrementReceivedSamples(userID, lv, float64(count)) + for value, count := range costAttribution { + d.costAttributionMng.IncrementReceivedSamples(userID, caLabel, value, float64(count)) } } d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 79f726bcd02..17574661f52 100644 --- 
a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -231,26 +231,20 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } -func (c *ActiveSeries) ActiveByAttributionValue() map[string]uint32 { +func (c *ActiveSeries) ActiveByAttributionValue(calb string) map[string]uint32 { total := make(map[string]uint32, c.costAttributionMng.GetUserAttributionLimit(c.userID)) for s := 0; s < numStripes; s++ { c.stripes[s].mu.RLock() - for k, v := range c.stripes[s].costAttributionValues { - total[k] += v + if c.stripes[s].caLabel == calb { + for k, v := range c.stripes[s].costAttributionValues { + total[k] += v + } } c.stripes[s].mu.RUnlock() } return total } -func (c *ActiveSeries) ResetAttribution() { - for s := 0; s < numStripes; s++ { - c.stripes[s].mu.Lock() - c.stripes[s].costAttributionValues = map[string]uint32{} - c.stripes[s].mu.Unlock() - } -} - func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { stripeID := storage.SeriesRef(ref) % numStripes c.stripes[stripeID].remove(storage.SeriesRef(ref)) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 35a856383ee..8e95d396355 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -331,10 +331,6 @@ type Ingester struct { usersMetadataMtx sync.RWMutex usersMetadata map[string]*userMetricsMetadata - // For storing tenant current cost attribution labels. - costAttributionMtx sync.RWMutex - costAttributionlbs map[string]string - // Rate of pushed samples. Used to limit global samples push rate. ingestionRate *util_math.EwmaRate inflightPushRequests atomic.Int64 @@ -794,14 +790,15 @@ func (i *Ingester) updateActiveSeries(now time.Time) { i.metrics.activeSeriesLoading.DeleteLabelValues(userID) if allActive > 0 { if i.isCostAttributionEnabledForUser(userID) { - labelAttributions := userDB.activeSeries.ActiveByAttributionValue() - for label, count := range labelAttributions { - i.costAttributionMng.SetActiveSeries(userID, label, float64(count)) + calb := i.costAttributionMng.GetUserAttributionLabel(userID) + labelAttributions := userDB.activeSeries.ActiveByAttributionValue(calb) + for value, count := range labelAttributions { + i.costAttributionMng.SetActiveSeries(userID, calb, value, float64(count)) } } i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(allActive)) } else { - i.metrics.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) + i.metrics.activeSeriesPerUser.DeleteLabelValues(userID) } if allActiveHistograms > 0 { i.metrics.activeSeriesPerUserNativeHistograms.WithLabelValues(userID).Set(float64(allActiveHistograms)) @@ -1288,8 +1285,8 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats } } if i.isCostAttributionEnabledForUser(userID) { - for label, count := range stats.failedSamplesAttribution { - i.costAttributionMng.IncrementDiscardedSamples(userID, label, float64(count)) + for value, count := range stats.failedSamplesAttribution { + i.costAttributionMng.IncrementDiscardedSamples(userID, stats.attributionLabel, value, float64(count)) } } } From 828f70ef7bd04857df7b6ff4c9456d1c4a923c33 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 9 Oct 2024 14:58:41 +0200 Subject: [PATCH 10/13] fix assigned label --- pkg/ingester/activeseries/active_series.go | 5 ++--- pkg/ingester/metrics.go | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 
17574661f52..8330c44b055 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -78,8 +78,7 @@ type seriesStripe struct { activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. userID string - // here the attribution values map, it maps the attribute value to its index, so we can increment the counter directly, - // so in each entry, we keep the index of the value only, instead of keeping the string value + // here we keep the counter of cost attribution for seriesStripe costAttributionValues map[string]uint32 caLabel string } @@ -438,7 +437,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef s.caLabel = s.costAttributionMng.GetUserAttributionLabel(s.userID) } s.costAttributionValues[attributionValue]++ - e.attributionValue = s.caLabel + e.calabel = s.caLabel e.attributionValue = attributionValue } diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index e960cf8e206..833a802fb04 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -409,8 +409,7 @@ func (m *ingesterMetrics) deletePerGroupMetricsForUser(userID, group string) { func (m *ingesterMetrics) deletePerUserCustomTrackerMetrics(userID string, customTrackerMetrics []string) { m.activeSeriesLoading.DeleteLabelValues(userID) - m.activeSeriesPerUser.DeletePartialMatch(prometheus.Labels{"user": userID}) - + m.activeSeriesPerUser.DeleteLabelValues(userID) m.activeSeriesPerUserNativeHistograms.DeleteLabelValues(userID) m.activeNativeHistogramBucketsPerUser.DeleteLabelValues(userID) for _, name := range customTrackerMetrics { From 0db7db7a87ba706f1cc70b4e8acec16f6efd3e7c Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 9 Oct 2024 16:33:22 +0200 Subject: [PATCH 11/13] remove subpackages --- pkg/costattribution/caimpl/managerImpl.go | 158 ------------------ pkg/costattribution/manager.go | 154 +++++++++++++++-- .../managerImpl_test.go => manager_test.go} | 4 +- pkg/costattribution/{caimpl => }/tracker.go | 2 +- .../{caimpl => }/tracker_group.go | 2 +- .../{caimpl => }/tracker_group_test.go | 2 +- .../{caimpl => }/tracker_test.go | 2 +- pkg/distributor/distributor.go | 4 +- pkg/ingester/activeseries/active_series.go | 9 +- pkg/ingester/ingester.go | 4 +- pkg/mimir/mimir.go | 2 +- 11 files changed, 160 insertions(+), 183 deletions(-) delete mode 100644 pkg/costattribution/caimpl/managerImpl.go rename pkg/costattribution/{caimpl/managerImpl_test.go => manager_test.go} (99%) rename pkg/costattribution/{caimpl => }/tracker.go (99%) rename pkg/costattribution/{caimpl => }/tracker_group.go (99%) rename pkg/costattribution/{caimpl => }/tracker_group_test.go (99%) rename pkg/costattribution/{caimpl => }/tracker_test.go (98%) diff --git a/pkg/costattribution/caimpl/managerImpl.go b/pkg/costattribution/caimpl/managerImpl.go deleted file mode 100644 index 038fd01f04f..00000000000 --- a/pkg/costattribution/caimpl/managerImpl.go +++ /dev/null @@ -1,158 +0,0 @@ -package caimpl - -import ( - "context" - "fmt" - "time" - - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/grafana/dskit/services" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/prometheus/model/labels" - - "github.com/grafana/mimir/pkg/util/validation" -) - -type ManagerImpl 
struct { - services.Service - logger log.Logger - attributionTracker *AttributionTrackerGroup - inactiveTimeout time.Duration - invalidValue string -} - -// NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series. -// It will clean up inactive series and update the cost attribution of series every 3 minutes. -func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *ManagerImpl { - s := &ManagerImpl{ - attributionTracker: newAttributionTrackerGroup(limits, cooldownTimeout), - inactiveTimeout: inactiveTimeout, - logger: logger, - invalidValue: "__unaccounted__", - } - - s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution manager") - return s -} - -func (m *ManagerImpl) iteration(_ context.Context) error { - m.attributionTracker.purgeInactiveAttributions(m.inactiveTimeout) - return nil -} - -// EnabledForUser returns true if the cost attribution is enabled for the user -func (m *ManagerImpl) EnabledForUser(userID string) bool { - return m.attributionTracker.limits.CostAttributionLabel(userID) != "" -} - -// GetUserAttributionLabel returns the cost attribution label for the user, first it will try to get the label from the cache, -// If not found, it will get the label from the config -// If the user is not enabled for cost attribution, it would clean the cache and return empty string -func (m *ManagerImpl) GetUserAttributionLabel(userID string) string { - if m.EnabledForUser(userID) { - return m.attributionTracker.getUserAttributionLabelFromCache(userID) - } - m.attributionTracker.deleteUserTracerFromCache(userID) - return "" -} - -// GetUserAttributionLimit returns the cost attribution limit for the user, first it will try to get the limit from the cache, -// If not found, it will get the limit from the config -// If the user is not enabled for cost attribution, it would clean the cache and return 0 -func (m *ManagerImpl) GetUserAttributionLimit(userID string) int { - if m.EnabledForUser(userID) { - return m.attributionTracker.getUserAttributionLimitFromCache(userID) - } - m.attributionTracker.deleteUserTracerFromCache(userID) - return 0 -} - -func (m *ManagerImpl) UpdateAttributionTimestamp(user string, calb string, lbs labels.Labels, now time.Time) (bool, string) { - // if cost attribution is not enabled for the user, return empty string - if !m.EnabledForUser(user) { - m.attributionTracker.deleteUserTracerFromCache(user) - return false, "" - } - - // when cost attribution is enabled, the label has to be set. 
the cache would be updated with the label - lb := m.attributionTracker.getUserAttributionLabelFromCache(user) - // this should not happened, if user is enabled for cost attribution, the label has to be set - - isUpdated := calb != lb - if lb == "" { - return isUpdated, "" - } - val := lbs.Get(lb) - - if m.attributionTracker.attributionLimitExceeded(user, val, now) { - val = m.invalidValue - level.Error(m.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", m.invalidValue)) - } - m.attributionTracker.updateAttributionCacheForUser(user, lb, val, now) - return isUpdated, val -} - -// SetActiveSeries adjust the input attribution and sets the active series gauge for the given user and attribution -func (m *ManagerImpl) SetActiveSeries(userID, calb, attribution string, value float64) { - // if the input label is outdated, we skip the update - if calb != m.GetUserAttributionLabel(userID) { - return - } - attribution = m.adjustUserAttribution(userID, attribution) - - m.attributionTracker.mu.Lock() - defer m.attributionTracker.mu.Unlock() - if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { - tracker.activeSeriesPerUserAttribution.WithLabelValues(userID, attribution).Set(value) - } -} - -// IncrementDiscardedSamples increments the discarded samples counter for a given user and attribution -func (m *ManagerImpl) IncrementDiscardedSamples(userID, calb, attribution string, value float64) { - // if the input label is outdated, we skip the update - if calb != m.GetUserAttributionLabel(userID) { - return - } - - attribution = m.adjustUserAttribution(userID, attribution) - m.attributionTracker.mu.RLock() - defer m.attributionTracker.mu.RUnlock() - if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { - tracker.discardedSampleAttribution.WithLabelValues(userID, attribution).Add(value) - } -} - -// IncrementReceivedSamples increments the received samples counter for a given user and attribution -func (m *ManagerImpl) IncrementReceivedSamples(userID, calb, attribution string, value float64) { - // if the input label is outdated, we skip the update - if calb != m.GetUserAttributionLabel(userID) { - return - } - attribution = m.adjustUserAttribution(userID, attribution) - m.attributionTracker.mu.RLock() - defer m.attributionTracker.mu.RUnlock() - if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { - tracker.receivedSamplesAttribution.WithLabelValues(userID, attribution).Add(value) - } -} - -func (m *ManagerImpl) adjustUserAttribution(userID, attribution string) string { - if m.attributionTracker.attributionLimitExceeded(userID, attribution, time.Now()) { - return m.invalidValue - } - return attribution -} - -func (m *ManagerImpl) Collect(out chan<- prometheus.Metric) { - m.attributionTracker.mu.RLock() - defer m.attributionTracker.mu.RUnlock() - for _, tracker := range m.attributionTracker.trackersByUserID { - tracker.Collect(out) - } -} - -// Describe implements prometheus.Collector. 
-func (m *ManagerImpl) Describe(chan<- *prometheus.Desc) { - // this is an unchecked collector -} diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 18261327a38..7f19858b424 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -1,24 +1,158 @@ package costattribution import ( + "context" + "fmt" "time" + "github.com/go-kit/log" + "github.com/go-kit/log/level" "github.com/grafana/dskit/services" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/model/labels" + + "github.com/grafana/mimir/pkg/util/validation" ) -type Manager interface { +type Manager struct { services.Service + logger log.Logger + attributionTracker *AttributionTrackerGroup + inactiveTimeout time.Duration + invalidValue string +} + +// NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series. +// It will clean up inactive series and update the cost attribution of series every 3 minutes. +func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager { + s := &Manager{ + attributionTracker: newAttributionTrackerGroup(limits, cooldownTimeout), + inactiveTimeout: inactiveTimeout, + logger: logger, + invalidValue: "__unaccounted__", + } + + s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution manager") + return s +} + +func (m *Manager) iteration(_ context.Context) error { + m.attributionTracker.purgeInactiveAttributions(m.inactiveTimeout) + return nil +} + +// EnabledForUser returns true if the cost attribution is enabled for the user +func (m *Manager) EnabledForUser(userID string) bool { + return m.attributionTracker.limits.CostAttributionLabel(userID) != "" +} + +// GetUserAttributionLabel returns the cost attribution label for the user, first it will try to get the label from the cache, +// If not found, it will get the label from the config +// If the user is not enabled for cost attribution, it would clean the cache and return empty string +func (m *Manager) GetUserAttributionLabel(userID string) string { + if m.EnabledForUser(userID) { + return m.attributionTracker.getUserAttributionLabelFromCache(userID) + } + m.attributionTracker.deleteUserTracerFromCache(userID) + return "" +} + +// GetUserAttributionLimit returns the cost attribution limit for the user, first it will try to get the limit from the cache, +// If not found, it will get the limit from the config +// If the user is not enabled for cost attribution, it would clean the cache and return 0 +func (m *Manager) GetUserAttributionLimit(userID string) int { + if m.EnabledForUser(userID) { + return m.attributionTracker.getUserAttributionLimitFromCache(userID) + } + m.attributionTracker.deleteUserTracerFromCache(userID) + return 0 +} + +func (m *Manager) UpdateAttributionTimestamp(user string, calb string, lbs labels.Labels, now time.Time) (bool, string) { + // if cost attribution is not enabled for the user, return empty string + if !m.EnabledForUser(user) { + m.attributionTracker.deleteUserTracerFromCache(user) + return false, "" + } + + // when cost attribution is enabled, the label has to be set. 
the cache would be updated with the label + lb := m.attributionTracker.getUserAttributionLabelFromCache(user) + // this should not happen: if the user is enabled for cost attribution, the label has to be set - EnabledForUser(userID string) bool - GetUserAttributionLabel(userID string) string - GetUserAttributionLimit(userID string) int - UpdateAttributionTimestamp(user string, calb string, lbs labels.Labels, now time.Time) (bool, string) - SetActiveSeries(userID, calb string, attribution string, value float64) - IncrementDiscardedSamples(userID, calb string, attribution string, value float64) - IncrementReceivedSamples(userID, calb string, attribution string, value float64) + isUpdated := calb != lb + if lb == "" { + return isUpdated, "" + } + val := lbs.Get(lb) + + if m.attributionTracker.attributionLimitExceeded(user, val, now) { + val = m.invalidValue + level.Error(m.logger).Log("msg", fmt.Sprintf("set attribution label to \"%s\" since user has reached the limit of cost attribution labels", m.invalidValue)) + } + m.attributionTracker.updateAttributionCacheForUser(user, lb, val, now) + return isUpdated, val +} + +// SetActiveSeries adjusts the input attribution and sets the active series gauge for the given user and attribution +func (m *Manager) SetActiveSeries(userID, calb, attribution string, value float64) { + // if the input label is outdated, we skip the update + if calb != m.GetUserAttributionLabel(userID) { + return + } + attribution = m.adjustUserAttribution(userID, attribution) + + m.attributionTracker.mu.Lock() + defer m.attributionTracker.mu.Unlock() + if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { + tracker.activeSeriesPerUserAttribution.WithLabelValues(userID, attribution).Set(value) + } +} + +// IncrementDiscardedSamples increments the discarded samples counter for a given user and attribution +func (m *Manager) IncrementDiscardedSamples(userID, calb, attribution string, value float64) { + // if the input label is outdated, we skip the update + if calb != m.GetUserAttributionLabel(userID) { + return + } + + attribution = m.adjustUserAttribution(userID, attribution) + m.attributionTracker.mu.RLock() + defer m.attributionTracker.mu.RUnlock() + if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { + tracker.discardedSampleAttribution.WithLabelValues(userID, attribution).Add(value) + } +} + +// IncrementReceivedSamples increments the received samples counter for a given user and attribution +func (m *Manager) IncrementReceivedSamples(userID, calb, attribution string, value float64) { + // if the input label is outdated, we skip the update + if calb != m.GetUserAttributionLabel(userID) { + return + } + attribution = m.adjustUserAttribution(userID, attribution) + m.attributionTracker.mu.RLock() + defer m.attributionTracker.mu.RUnlock() + if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { + tracker.receivedSamplesAttribution.WithLabelValues(userID, attribution).Add(value) + } +} + +func (m *Manager) adjustUserAttribution(userID, attribution string) string { + if m.attributionTracker.attributionLimitExceeded(userID, attribution, time.Now()) { + return m.invalidValue + } + return attribution +} + +func (m *Manager) Collect(out chan<- prometheus.Metric) { + m.attributionTracker.mu.RLock() + defer m.attributionTracker.mu.RUnlock() + for _, tracker := range m.attributionTracker.trackersByUserID { + tracker.Collect(out) + } +} - Collect(out chan<- prometheus.Metric) - Describe(chan<- *prometheus.Desc) +// 
Describe implements prometheus.Collector. +func (m *Manager) Describe(chan<- *prometheus.Desc) { + // this is an unchecked collector } diff --git a/pkg/costattribution/caimpl/managerImpl_test.go b/pkg/costattribution/manager_test.go similarity index 99% rename from pkg/costattribution/caimpl/managerImpl_test.go rename to pkg/costattribution/manager_test.go index ef8a7ca835b..9108c467921 100644 --- a/pkg/costattribution/caimpl/managerImpl_test.go +++ b/pkg/costattribution/manager_test.go @@ -1,4 +1,4 @@ -package caimpl +package costattribution import ( "strings" @@ -15,7 +15,7 @@ import ( "github.com/grafana/mimir/pkg/util/validation" ) -func newTestManager() *ManagerImpl { +func newTestManager() *Manager { logger := log.NewNopLogger() limits, _ := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{ "user1": { diff --git a/pkg/costattribution/caimpl/tracker.go b/pkg/costattribution/tracker.go similarity index 99% rename from pkg/costattribution/caimpl/tracker.go rename to pkg/costattribution/tracker.go index 423e9df5a5e..892a8ef8837 100644 --- a/pkg/costattribution/caimpl/tracker.go +++ b/pkg/costattribution/tracker.go @@ -1,6 +1,6 @@ // SPDX-License-Identifier: AGPL-3.0-only -package caimpl +package costattribution import ( "github.com/prometheus/client_golang/prometheus" diff --git a/pkg/costattribution/caimpl/tracker_group.go b/pkg/costattribution/tracker_group.go similarity index 99% rename from pkg/costattribution/caimpl/tracker_group.go rename to pkg/costattribution/tracker_group.go index 14214c7a876..ca845c00b01 100644 --- a/pkg/costattribution/caimpl/tracker_group.go +++ b/pkg/costattribution/tracker_group.go @@ -1,4 +1,4 @@ -package caimpl +package costattribution import ( "sync" diff --git a/pkg/costattribution/caimpl/tracker_group_test.go b/pkg/costattribution/tracker_group_test.go similarity index 99% rename from pkg/costattribution/caimpl/tracker_group_test.go rename to pkg/costattribution/tracker_group_test.go index 5fbe148ae77..9c7f8330097 100644 --- a/pkg/costattribution/caimpl/tracker_group_test.go +++ b/pkg/costattribution/tracker_group_test.go @@ -1,4 +1,4 @@ -package caimpl +package costattribution import ( "fmt" diff --git a/pkg/costattribution/caimpl/tracker_test.go b/pkg/costattribution/tracker_test.go similarity index 98% rename from pkg/costattribution/caimpl/tracker_test.go rename to pkg/costattribution/tracker_test.go index 0af76642cd3..c44a314ec79 100644 --- a/pkg/costattribution/caimpl/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -1,4 +1,4 @@ -package caimpl +package costattribution import ( "strings" diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index e0e86c0facb..7259e1d21a6 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -106,7 +106,7 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - costAttributionMng costattribution.Manager + costAttributionMng *costattribution.Manager // For handling HA replicas. 
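With the interface gone, callers hold the concrete *costattribution.Manager directly. The following is a minimal sketch of the expected lifecycle: construct, register as an (unchecked) collector, start the timer service, then feed per-request updates. The wireManager name, the intervals, the tenant ID and the "team-a" value are illustrative assumptions, not taken from this patch:

package example // illustrative sketch only

import (
	"context"
	"time"

	"github.com/go-kit/log"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/model/labels"

	"github.com/grafana/mimir/pkg/costattribution"
	"github.com/grafana/mimir/pkg/util/validation"
)

func wireManager(limits *validation.Overrides, reg prometheus.Registerer, logger log.Logger) (*costattribution.Manager, error) {
	// Intervals are placeholders: cleanup every 3m, attributions considered inactive
	// after 10m, 20s cooldown once the attribution limit is hit.
	mgr := costattribution.NewManager(3*time.Minute, 10*time.Minute, 20*time.Second, logger, limits)

	// The manager is an unchecked collector: Describe is a no-op and Collect fans out
	// to the per-tenant trackers.
	if err := reg.Register(mgr); err != nil {
		return nil, err
	}

	// Starting the embedded timer service enables the periodic purge of inactive attributions.
	if err := services.StartAndAwaitRunning(context.Background(), mgr); err != nil {
		return nil, err
	}

	if mgr.EnabledForUser("user1") {
		calb := mgr.GetUserAttributionLabel("user1")
		_, attribution := mgr.UpdateAttributionTimestamp("user1", calb, labels.FromStrings(calb, "team-a"), time.Now())
		mgr.IncrementReceivedSamples("user1", calb, attribution, 1)
	}
	return mgr, nil
}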
HATracker *haTracker @@ -307,7 +307,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 8330c44b055..b998b45611d 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -50,7 +50,7 @@ type ActiveSeries struct { matchers *asmodel.Matchers lastMatchersUpdate time.Time - costAttributionMng costattribution.Manager + costAttributionMng *costattribution.Manager // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -68,7 +68,7 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - costAttributionMng costattribution.Manager + costAttributionMng *costattribution.Manager mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -98,7 +98,7 @@ func NewActiveSeries( asm *asmodel.Matchers, timeout time.Duration, userID string, - costAttributionMng costattribution.Manager, + costAttributionMng *costattribution.Manager, ) *ActiveSeries { c := &ActiveSeries{ matchers: asm, timeout: timeout, userID: userID, @@ -148,6 +148,7 @@ func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, c.stripes[deletedStripeID].remove(deleted.ref) } } + } // PostDeletion should be called when series are deleted from the head. @@ -468,7 +469,7 @@ func (s *seriesStripe) reinitialize( asm *asmodel.Matchers, deleted *deletedSeries, userID string, - costAttributionMng costattribution.Manager, + costAttributionMng *costattribution.Manager, ) { s.mu.Lock() defer s.mu.Unlock() diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 8e95d396355..7a645440a8c 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -311,7 +311,7 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService - costAttributionMng costattribution.Manager + costAttributionMng *costattribution.Manager tsdbMetrics *tsdbMetrics @@ -381,7 +381,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. 
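NewActiveSeries now threads the cost attribution manager down to every stripe. A short sketch of the construction, assuming a hypothetical newActiveSeriesForTenant helper and a 10m idle timeout (the real value comes from the ingester's active series configuration); import paths are inferred from the file layout in this patch:

package example // illustrative sketch only

import (
	"time"

	"github.com/grafana/mimir/pkg/costattribution"
	"github.com/grafana/mimir/pkg/ingester/activeseries"
	asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model"
)

// newActiveSeriesForTenant shows the new constructor shape: the manager is the last
// argument, and passing nil keeps cost attribution disabled for this tenant, since the
// stripes only consult a non-nil manager whose tenant has an attribution label configured.
func newActiveSeriesForTenant(userID string, asm *asmodel.Matchers, mgr *costattribution.Manager) *activeseries.ActiveSeries {
	return activeseries.NewActiveSeries(asm, 10*time.Minute, userID, mgr)
}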
-func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 79b472e1b92..7b27aeace15 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -716,7 +716,7 @@ type Mimir struct { TenantLimits validation.TenantLimits Overrides *validation.Overrides ActiveGroupsCleanup *util.ActiveGroupsCleanupService - CostAttributionManager costattribution.Manager + CostAttributionManager *costattribution.Manager Distributor *distributor.Distributor Ingester *ingester.Ingester From e55f83cabc5cbed3389bfd3ac6943cb06fc46efa Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 9 Oct 2024 17:29:34 +0200 Subject: [PATCH 12/13] make tracker and tracker group private --- pkg/costattribution/manager.go | 2 +- pkg/costattribution/tracker.go | 14 +++++++------- pkg/costattribution/tracker_group.go | 24 ++++++++++++------------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 7f19858b424..0aeae09fcd8 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -17,7 +17,7 @@ import ( type Manager struct { services.Service logger log.Logger - attributionTracker *AttributionTrackerGroup + attributionTracker *attributionTrackerGroup inactiveTimeout time.Duration invalidValue string } diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 892a8ef8837..44350018003 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -7,7 +7,7 @@ import ( "go.uber.org/atomic" ) -type Tracker struct { +type tracker struct { trackedLabel string attributionLimit int activeSeriesPerUserAttribution *prometheus.GaugeVec @@ -17,21 +17,21 @@ type Tracker struct { coolDownDeadline *atomic.Int64 } -func (t *Tracker) cleanupTrackerAttribution(userID, attribution string) { +func (t *tracker) cleanupTrackerAttribution(userID, attribution string) { t.activeSeriesPerUserAttribution.DeleteLabelValues(userID, attribution) t.receivedSamplesAttribution.DeleteLabelValues(userID, attribution) t.discardedSampleAttribution.DeleteLabelValues(userID, attribution) } -func (t *Tracker) cleanupTracker(userID string) { +func (t *tracker) cleanupTracker(userID string) { filter := prometheus.Labels{"user": userID} t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) t.receivedSamplesAttribution.DeletePartialMatch(filter) t.discardedSampleAttribution.DeletePartialMatch(filter) } -func newTracker(trackedLabel string, limit int) (*Tracker, error) { - m := &Tracker{ +func newTracker(trackedLabel string, limit int) (*tracker, error) { + m := &tracker{ trackedLabel: trackedLabel, attributionLimit: limit, attributionTimestamps: map[string]*atomic.Int64{}, @@ -55,13 +55,13 @@ func newTracker(trackedLabel string, limit int) (*Tracker, error) { return m, nil } -func (t *Tracker) Collect(out chan<- prometheus.Metric) 
{ +func (t *tracker) Collect(out chan<- prometheus.Metric) { t.activeSeriesPerUserAttribution.Collect(out) t.receivedSamplesAttribution.Collect(out) t.discardedSampleAttribution.Collect(out) } // Describe implements prometheus.Collector. -func (t *Tracker) Describe(chan<- *prometheus.Desc) { +func (t *tracker) Describe(chan<- *prometheus.Desc) { // this is an unchecked collector } diff --git a/pkg/costattribution/tracker_group.go b/pkg/costattribution/tracker_group.go index ca845c00b01..95499ed58eb 100644 --- a/pkg/costattribution/tracker_group.go +++ b/pkg/costattribution/tracker_group.go @@ -9,16 +9,16 @@ import ( "github.com/grafana/mimir/pkg/util/validation" ) -type AttributionTrackerGroup struct { +type attributionTrackerGroup struct { mu sync.RWMutex - trackersByUserID map[string]*Tracker + trackersByUserID map[string]*tracker limits *validation.Overrides cooldownTimeout time.Duration } -func newAttributionTrackerGroup(limits *validation.Overrides, cooldownTimeout time.Duration) *AttributionTrackerGroup { - return &AttributionTrackerGroup{ - trackersByUserID: make(map[string]*Tracker), +func newAttributionTrackerGroup(limits *validation.Overrides, cooldownTimeout time.Duration) *attributionTrackerGroup { + return &attributionTrackerGroup{ + trackersByUserID: make(map[string]*tracker), limits: limits, mu: sync.RWMutex{}, cooldownTimeout: cooldownTimeout, @@ -26,7 +26,7 @@ func newAttributionTrackerGroup(limits *validation.Overrides, cooldownTimeout ti } // getUserAttributionLabelFromCache is read user attribution label through cache, if not found, get from config -func (atg *AttributionTrackerGroup) getUserAttributionLabelFromCache(userID string) string { +func (atg *attributionTrackerGroup) getUserAttributionLabelFromCache(userID string) string { atg.mu.RLock() defer atg.mu.RUnlock() // if the user is not enabled for cost attribution, we don't need to track the attribution @@ -41,7 +41,7 @@ func (atg *AttributionTrackerGroup) getUserAttributionLabelFromCache(userID stri // getUserAttributionLimitFromCache is read per user attribution limit through cache, if not found, get from config // always call only when the user is enabled for cost attribution -func (atg *AttributionTrackerGroup) getUserAttributionLimitFromCache(userID string) int { +func (atg *attributionTrackerGroup) getUserAttributionLimitFromCache(userID string) int { atg.mu.Lock() defer atg.mu.Unlock() if _, exists := atg.trackersByUserID[userID]; !exists { @@ -51,7 +51,7 @@ func (atg *AttributionTrackerGroup) getUserAttributionLimitFromCache(userID stri } // deleteUserTracerFromCache is delete user from cache since the user is disabled for cost attribution -func (atg *AttributionTrackerGroup) deleteUserTracerFromCache(userID string) { +func (atg *attributionTrackerGroup) deleteUserTracerFromCache(userID string) { atg.mu.Lock() defer atg.mu.Unlock() if _, exists := atg.trackersByUserID[userID]; !exists { @@ -66,7 +66,7 @@ func (atg *AttributionTrackerGroup) deleteUserTracerFromCache(userID string) { // if the label has changed, we will create a new tracker, and won't update the timestamp // if the label has not changed, we will update the attribution timestamp // if the limit is set to 0 or label is empty, we skip the update -func (atg *AttributionTrackerGroup) updateAttributionCacheForUser(userID, label, attribution string, now time.Time) { +func (atg *attributionTrackerGroup) updateAttributionCacheForUser(userID, label, attribution string, now time.Time) { // If the limit is set to 0, we don't need to track the 
attribution, clean the cache if exists if atg.limits.CostAttributionLabel(userID) == "" || atg.limits.MaxCostAttributionPerUser(userID) <= 0 { atg.deleteUserTracerFromCache(userID) @@ -96,7 +96,7 @@ func (atg *AttributionTrackerGroup) updateAttributionCacheForUser(userID, label, atg.trackersByUserID[userID].attributionTimestamps[attribution] = atomic.NewInt64(ts) } -func (atg *AttributionTrackerGroup) purgeInactiveAttributionsForUser(userID string, deadline int64) []string { +func (atg *attributionTrackerGroup) purgeInactiveAttributionsForUser(userID string, deadline int64) []string { atg.mu.RLock() var inactiveAttributions []string if atg.trackersByUserID[userID] == nil { @@ -145,7 +145,7 @@ func (atg *AttributionTrackerGroup) purgeInactiveAttributionsForUser(userID stri return inactiveAttributions } -func (atg *AttributionTrackerGroup) purgeInactiveAttributions(inactiveTimeout time.Duration) { +func (atg *attributionTrackerGroup) purgeInactiveAttributions(inactiveTimeout time.Duration) { atg.mu.RLock() userIDs := make([]string, 0, len(atg.trackersByUserID)) for userID := range atg.trackersByUserID { @@ -167,7 +167,7 @@ func (atg *AttributionTrackerGroup) purgeInactiveAttributions(inactiveTimeout ti } } -func (atg *AttributionTrackerGroup) attributionLimitExceeded(userID, attribution string, now time.Time) bool { +func (atg *attributionTrackerGroup) attributionLimitExceeded(userID, attribution string, now time.Time) bool { // if we are still at the cooldown period, we will consider the limit reached atg.mu.RLock() defer atg.mu.RUnlock() From b9508e0a8ed657c84ef48d22fef772478a8d86a7 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 16 Oct 2024 17:39:08 +0200 Subject: [PATCH 13/13] renaming and simplify distributor logic --- pkg/costattribution/manager.go | 26 ++++++------ pkg/costattribution/manager_test.go | 16 ++++---- pkg/costattribution/tracker_group_test.go | 2 +- pkg/distributor/distributor.go | 48 +++++++++++----------- pkg/ingester/activeseries/active_series.go | 26 ++++++------ pkg/ingester/ingester.go | 20 ++++----- 6 files changed, 69 insertions(+), 69 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 0aeae09fcd8..ba904e3a981 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -46,10 +46,10 @@ func (m *Manager) EnabledForUser(userID string) bool { return m.attributionTracker.limits.CostAttributionLabel(userID) != "" } -// GetUserAttributionLabel returns the cost attribution label for the user, first it will try to get the label from the cache, +// UserAttributionLabel returns the cost attribution label for the user, first it will try to get the label from the cache, // If not found, it will get the label from the config // If the user is not enabled for cost attribution, it would clean the cache and return empty string -func (m *Manager) GetUserAttributionLabel(userID string) string { +func (m *Manager) UserAttributionLabel(userID string) string { if m.EnabledForUser(userID) { return m.attributionTracker.getUserAttributionLabelFromCache(userID) } @@ -57,10 +57,10 @@ func (m *Manager) GetUserAttributionLabel(userID string) string { return "" } -// GetUserAttributionLimit returns the cost attribution limit for the user, first it will try to get the limit from the cache, +// UserAttributionLimit returns the cost attribution limit for the user, first it will try to get the limit from the cache, // If not found, it will get the limit from the config // If the user is not enabled for cost attribution, it 
would clean the cache and return 0 -func (m *Manager) GetUserAttributionLimit(userID string) int { +func (m *Manager) UserAttributionLimit(userID string) int { if m.EnabledForUser(userID) { return m.attributionTracker.getUserAttributionLimitFromCache(userID) } @@ -96,7 +96,7 @@ func (m *Manager) UpdateAttributionTimestamp(user string, calb string, lbs label // SetActiveSeries adjust the input attribution and sets the active series gauge for the given user and attribution func (m *Manager) SetActiveSeries(userID, calb, attribution string, value float64) { // if the input label is outdated, we skip the update - if calb != m.GetUserAttributionLabel(userID) { + if calb != m.UserAttributionLabel(userID) { return } attribution = m.adjustUserAttribution(userID, attribution) @@ -109,13 +109,13 @@ func (m *Manager) SetActiveSeries(userID, calb, attribution string, value float6 } // IncrementDiscardedSamples increments the discarded samples counter for a given user and attribution -func (m *Manager) IncrementDiscardedSamples(userID, calb, attribution string, value float64) { - // if the input label is outdated, we skip the update - if calb != m.GetUserAttributionLabel(userID) { +func (m *Manager) IncrementDiscardedSamples(userID string, lbs labels.Labels, value float64, now time.Time) { + if !m.EnabledForUser(userID) { return } + calb := m.UserAttributionLabel(userID) + _, attribution := m.UpdateAttributionTimestamp(userID, calb, lbs, now) - attribution = m.adjustUserAttribution(userID, attribution) m.attributionTracker.mu.RLock() defer m.attributionTracker.mu.RUnlock() if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { @@ -124,12 +124,12 @@ func (m *Manager) IncrementDiscardedSamples(userID, calb, attribution string, va } // IncrementReceivedSamples increments the received samples counter for a given user and attribution -func (m *Manager) IncrementReceivedSamples(userID, calb, attribution string, value float64) { - // if the input label is outdated, we skip the update - if calb != m.GetUserAttributionLabel(userID) { +func (m *Manager) IncrementReceivedSamples(userID string, lbs labels.Labels, value float64, now time.Time) { + if !m.EnabledForUser(userID) { return } - attribution = m.adjustUserAttribution(userID, attribution) + calb := m.UserAttributionLabel(userID) + _, attribution := m.UpdateAttributionTimestamp(userID, calb, lbs, now) m.attributionTracker.mu.RLock() defer m.attributionTracker.mu.RUnlock() if tracker, exists := m.attributionTracker.trackersByUserID[userID]; exists { diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 9108c467921..b24411615ee 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -51,21 +51,21 @@ func Test_EnabledForUser(t *testing.T) { assert.False(t, manager.EnabledForUser("user4"), "Expected cost attribution to be disabled for user4") } -func Test_GetUserAttributionLabel(t *testing.T) { +func Test_UserAttributionLabel(t *testing.T) { manager := newTestManager() - assert.Equal(t, "team", manager.GetUserAttributionLabel("user1")) - assert.Equal(t, "", manager.GetUserAttributionLabel("user2")) - assert.Equal(t, "department", manager.GetUserAttributionLabel("user3")) + assert.Equal(t, "team", manager.UserAttributionLabel("user1")) + assert.Equal(t, "", manager.UserAttributionLabel("user2")) + assert.Equal(t, "department", manager.UserAttributionLabel("user3")) assert.Equal(t, 2, len(manager.attributionTracker.trackersByUserID)) assert.Equal(t, "team", 
manager.attributionTracker.trackersByUserID["user1"].trackedLabel) assert.Equal(t, "department", manager.attributionTracker.trackersByUserID["user3"].trackedLabel) } -func Test_GetUserAttributionLimit(t *testing.T) { +func Test_UserAttributionLimit(t *testing.T) { manager := newTestManager() - assert.Equal(t, 5, manager.GetUserAttributionLimit("user1")) - assert.Equal(t, 0, manager.GetUserAttributionLimit("user2")) - assert.Equal(t, 0, manager.GetUserAttributionLimit("user4")) + assert.Equal(t, 5, manager.UserAttributionLimit("user1")) + assert.Equal(t, 0, manager.UserAttributionLimit("user2")) + assert.Equal(t, 0, manager.UserAttributionLimit("user4")) } func Test_UpdateAttributionTimestamp(t *testing.T) { diff --git a/pkg/costattribution/tracker_group_test.go b/pkg/costattribution/tracker_group_test.go index 9c7f8330097..787f17e1b04 100644 --- a/pkg/costattribution/tracker_group_test.go +++ b/pkg/costattribution/tracker_group_test.go @@ -64,7 +64,7 @@ func TestUpdateAttributionTimestampForUser(t *testing.T) { }) } -func TestGetUserAttributionLabel(t *testing.T) { +func TestUserAttributionLabel(t *testing.T) { cooldownTimeout := 10 * time.Second t.Run("Should return the cost attribution label for the user", func(t *testing.T) { // Create mock limits diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 7259e1d21a6..300dbb086d9 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -106,7 +106,7 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - costAttributionMng *costattribution.Manager + costAttributionMgr *costattribution.Manager // For handling HA replicas. HATracker *haTracker @@ -307,7 +307,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -342,7 +342,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove healthyInstancesCount: atomic.NewUint32(0), limits: limits, HATracker: haTracker, - costAttributionMng: costAttributionMng, + costAttributionMgr: costAttributionMgr, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -856,6 +856,10 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) + // here if it is a technical error, we don't want to increment 
the discarded samples counter + if d.costAttributionMgr != nil { + d.costAttributionMgr.IncrementDiscardedSamples(userID, mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), time.Now()) + } } return err @@ -1059,6 +1063,11 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { // Errors in validation are considered non-fatal, as one series in a request may contain // invalid data but all the remaining series could be perfectly valid. if validationErr != nil { + // if the validation failed, we need to increment the discarded samples metric + if d.costAttributionMgr != nil { + d.costAttributionMgr.IncrementDiscardedSamples(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), now) + } + if firstPartialErr == nil { // The series are never retained by validationErr. This is guaranteed by the way the latter is built. firstPartialErr = newValidationError(validationErr) @@ -1109,6 +1118,15 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { + if d.costAttributionMgr != nil { + skipLabelNameValidation := d.cfg.SkipLabelNameValidation || req.GetSkipLabelNameValidation() + for tsIdx, ts := range req.Timeseries { + if validationErr := d.validateSeries(now, &req.Timeseries[tsIdx], userID, group, skipLabelNameValidation, minExemplarTS, maxExemplarTS); validationErr != nil { + continue + } + d.costAttributionMgr.IncrementDiscardedSamples(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), now) + } + } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) @@ -1668,34 +1686,16 @@ func tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { now := mtime.Now() var receivedSamples, receivedExemplars, receivedMetadata int - costattributionLimit := 0 - caEnabled := d.costAttributionMng != nil && d.costAttributionMng.EnabledForUser(userID) - caLabel := "" - if caEnabled { - costattributionLimit = d.costAttributionMng.GetUserAttributionLimit(userID) - caLabel = d.costAttributionMng.GetUserAttributionLabel(userID) - } - costAttribution := make(map[string]int, costattributionLimit) for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - if caEnabled { - isKeyOutdated, attribution := d.costAttributionMng.UpdateAttributionTimestamp(userID, caLabel, mimirpb.FromLabelAdaptersToLabels(ts.Labels), now) - if isKeyOutdated { - // If the key is outdated, we need to reset cost attribution cache and update cost attribution label - costAttribution = make(map[string]int, costattributionLimit) - caLabel = d.costAttributionMng.GetUserAttributionLabel(userID) - } - costAttribution[attribution]++ + if d.costAttributionMgr != nil { + d.costAttributionMgr.IncrementReceivedSamples(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), now) } } receivedMetadata = len(req.Metadata) - if caEnabled { - for value, count := range costAttribution { - d.costAttributionMng.IncrementReceivedSamples(userID, caLabel, 
value, float64(count)) - } - } + d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) d.receivedExemplars.WithLabelValues(userID).Add(float64(receivedExemplars)) d.receivedMetadata.WithLabelValues(userID).Add(float64(receivedMetadata)) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index b998b45611d..938793bac98 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -50,7 +50,7 @@ type ActiveSeries struct { matchers *asmodel.Matchers lastMatchersUpdate time.Time - costAttributionMng *costattribution.Manager + costAttributionMgr *costattribution.Manager // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -68,7 +68,7 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - costAttributionMng *costattribution.Manager + costAttributionMgr *costattribution.Manager mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -98,16 +98,16 @@ func NewActiveSeries( asm *asmodel.Matchers, timeout time.Duration, userID string, - costAttributionMng *costattribution.Manager, + costAttributionMgr *costattribution.Manager, ) *ActiveSeries { c := &ActiveSeries{ matchers: asm, timeout: timeout, userID: userID, - costAttributionMng: costAttributionMng, + costAttributionMgr: costAttributionMgr, } // Stripes are pre-allocated so that we only read on them and no lock is required. for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionMng) + c.stripes[i].reinitialize(asm, &c.deleted, userID, costAttributionMgr) } return c @@ -124,7 +124,7 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { defer c.matchersMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionMng) + c.stripes[i].reinitialize(asm, &c.deleted, c.userID, c.costAttributionMgr) } c.matchers = asm c.lastMatchersUpdate = now @@ -232,7 +232,7 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot } func (c *ActiveSeries) ActiveByAttributionValue(calb string) map[string]uint32 { - total := make(map[string]uint32, c.costAttributionMng.GetUserAttributionLimit(c.userID)) + total := make(map[string]uint32, c.costAttributionMgr.UserAttributionLimit(c.userID)) for s := 0; s < numStripes; s++ { c.stripes[s].mu.RLock() if c.stripes[s].caLabel == calb { @@ -430,12 +430,12 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef // here if we have a cost attribution label, we can split the serie count based on the value of the label // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly - if s.costAttributionMng != nil && s.costAttributionMng.EnabledForUser(s.userID) { - isOutDated, attributionValue := s.costAttributionMng.UpdateAttributionTimestamp(s.userID, s.caLabel, series, time.Unix(0, nowNanos)) + if s.costAttributionMgr != nil && s.costAttributionMgr.EnabledForUser(s.userID) { + isOutDated, attributionValue := s.costAttributionMgr.UpdateAttributionTimestamp(s.userID, s.caLabel, series, time.Unix(0, nowNanos)) if 
isOutDated { // if the label is outdated, we need to remove the reference to the old value s.costAttributionValues = map[string]uint32{} - s.caLabel = s.costAttributionMng.GetUserAttributionLabel(s.userID) + s.caLabel = s.costAttributionMgr.UserAttributionLabel(s.userID) } s.costAttributionValues[attributionValue]++ e.calabel = s.caLabel @@ -469,7 +469,7 @@ func (s *seriesStripe) reinitialize( asm *asmodel.Matchers, deleted *deletedSeries, userID string, - costAttributionMng *costattribution.Manager, + costAttributionMgr *costattribution.Manager, ) { s.mu.Lock() defer s.mu.Unlock() @@ -485,7 +485,7 @@ func (s *seriesStripe) reinitialize( s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) - s.costAttributionMng = costAttributionMng + s.costAttributionMgr = costAttributionMgr } func (s *seriesStripe) purge(keepUntil time.Time) { @@ -514,7 +514,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.deleted.purge(ref) } delete(s.refs, ref) - // here need to find what is deleted and decrement counters + // TODO: here need to find what is deleted and decrement counters continue } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 7a645440a8c..8b51e68d869 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -311,7 +311,7 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService - costAttributionMng *costattribution.Manager + costAttributionMgr *costattribution.Manager tsdbMetrics *tsdbMetrics @@ -381,7 +381,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. -func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMng *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -389,7 +389,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.ingestionRate = util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval) i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService - i.costAttributionMng = costAttributionMng + i.costAttributionMgr = costAttributionMgr // We create a circuit breaker, which will be activated on a successful completion of starting. 
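The interplay between UpdateAttributionTimestamp and the per-tenant limit is the core of the cardinality protection: once a tenant tracks more distinct attribution values than allowed, and is still within the cooldown window, new values collapse into the manager's invalidValue. A rough sketch, assuming a manager whose tenant is configured with a small limit; the loop bound and the team-%d values are made up:

package example // illustrative sketch only

import (
	"fmt"
	"time"

	"github.com/prometheus/prometheus/model/labels"

	"github.com/grafana/mimir/pkg/costattribution"
)

func printAttributionOverflow(mgr *costattribution.Manager, userID string) {
	if !mgr.EnabledForUser(userID) {
		return
	}
	calb := mgr.UserAttributionLabel(userID)
	for i := 0; i < 10; i++ {
		lbs := labels.FromStrings(calb, fmt.Sprintf("team-%d", i))
		_, attribution := mgr.UpdateAttributionTimestamp(userID, calb, lbs, time.Now())
		// While the tenant is under its limit this prints the real label value; once
		// the limit is exceeded it prints "__unaccounted__" until the cooldown expires
		// and inactive attributions are purged.
		fmt.Println(attribution)
	}
}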
i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -790,10 +790,10 @@ func (i *Ingester) updateActiveSeries(now time.Time) { i.metrics.activeSeriesLoading.DeleteLabelValues(userID) if allActive > 0 { if i.isCostAttributionEnabledForUser(userID) { - calb := i.costAttributionMng.GetUserAttributionLabel(userID) + calb := i.costAttributionMgr.UserAttributionLabel(userID) labelAttributions := userDB.activeSeries.ActiveByAttributionValue(calb) for value, count := range labelAttributions { - i.costAttributionMng.SetActiveSeries(userID, calb, value, float64(count)) + i.costAttributionMgr.SetActiveSeries(userID, calb, value, float64(count)) } } i.metrics.activeSeriesPerUser.WithLabelValues(userID).Set(float64(allActive)) @@ -1286,13 +1286,13 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats } if i.isCostAttributionEnabledForUser(userID) { for value, count := range stats.failedSamplesAttribution { - i.costAttributionMng.IncrementDiscardedSamples(userID, stats.attributionLabel, value, float64(count)) + i.costAttributionMgr.IncrementDiscardedSamples(userID, stats.attributionLabel, value, float64(count)) } } } func (i *Ingester) isCostAttributionEnabledForUser(userID string) bool { - return i.costAttributionMng != nil && i.costAttributionMng.EnabledForUser(userID) + return i.costAttributionMgr != nil && i.costAttributionMgr.EnabledForUser(userID) } // pushSamplesToAppender appends samples and exemplars to the appender. Most errors are handled via updateFirstPartial function, @@ -1417,10 +1417,10 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var caValue string // when cost attribution label is set if caEnabled { - isOutDated, caValue = i.costAttributionMng.UpdateAttributionTimestamp(userID, stats.attributionLabel, mimirpb.FromLabelAdaptersToLabels(ts.Labels), startAppend) + isOutDated, caValue = i.costAttributionMgr.UpdateAttributionTimestamp(userID, stats.attributionLabel, mimirpb.FromLabelAdaptersToLabels(ts.Labels), startAppend) // if the cost attribution label is outdated, we need to reset the attribution counter if isOutDated { - stats.attributionLabel = i.costAttributionMng.GetUserAttributionLabel(userID) + stats.attributionLabel = i.costAttributionMgr.UserAttributionLabel(userID) stats.failedSamplesAttribution = make(map[string]int, i.limits.MaxCostAttributionPerUser(userID)) } } @@ -2679,7 +2679,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, userID, - i.costAttributionMng, + i.costAttributionMgr, ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod),
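After this final refactor the distributor and ingester no longer resolve attribution values themselves; they hand the series labels plus a timestamp to the manager, which looks up the tenant's attribution label, enforces the limit, and bumps its counters, while a nil manager keeps the feature off entirely. A minimal caller sketch; recordReceived is a hypothetical helper, not part of the patch:

package example // illustrative sketch only

import (
	"time"

	"github.com/grafana/mimir/pkg/costattribution"
	"github.com/grafana/mimir/pkg/mimirpb"
)

func recordReceived(mgr *costattribution.Manager, userID string, ts mimirpb.PreallocTimeseries, now time.Time) {
	// A nil manager means cost attribution is disabled, mirroring the
	// `if d.costAttributionMgr != nil` guards used in the distributor.
	if mgr == nil {
		return
	}
	samples := float64(len(ts.Samples) + len(ts.Histograms))
	mgr.IncrementReceivedSamples(userID, mimirpb.FromLabelAdaptersToLabels(ts.Labels), samples, now)
}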