Skip to content

Commit 1cac24f

Browse files
authored
Merge pull request #152214 from dt/backport24.3-151950
release-24.3: backup: only flush per-node progress every 15s
2 parents 91cddbe + 7bd07d7 commit 1cac24f

File tree

1 file changed

+23
-8
lines changed

1 file changed

+23
-8
lines changed

pkg/ccl/backupccl/backup_job.go

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -269,17 +269,35 @@ func backup(
269269
}
270270
}
271271

272-
// Create a channel that is large enough that it does not block.
273-
perNodeProgressCh := make(chan map[execinfrapb.ComponentID]float32, numTotalSpans)
272+
// Create a channel with a little buffering, but plan on dropping if blocked.
273+
perNodeProgressCh := make(chan map[execinfrapb.ComponentID]float32, len(backupSpecs))
274274
storePerNodeProgressLoop := func(ctx context.Context) error {
275+
// track the last progress per component, periodically flushing those that
276+
// have changed to info rows.
277+
current, persisted := make(map[execinfrapb.ComponentID]float32), make(map[execinfrapb.ComponentID]float32)
278+
lastSaved := timeutil.Now()
279+
275280
for {
276281
select {
277282
case prog, ok := <-perNodeProgressCh:
278283
if !ok {
279284
return nil
280285
}
281-
jobsprofiler.StorePerNodeProcessorProgressFraction(
282-
ctx, execCtx.ExecCfg().InternalDB, job.ID(), prog)
286+
for k, v := range prog {
287+
current[k] = v
288+
}
289+
if timeutil.Since(lastSaved) > time.Second*15 {
290+
lastSaved = timeutil.Now()
291+
updates := make(map[execinfrapb.ComponentID]float32)
292+
for k := range current {
293+
if current[k] != persisted[k] {
294+
persisted[k] = current[k]
295+
updates[k] = current[k]
296+
}
297+
}
298+
jobsprofiler.StorePerNodeProcessorProgressFraction(
299+
ctx, execCtx.ExecCfg().InternalDB, job.ID(), updates)
300+
}
283301
case <-ctx.Done():
284302
return ctx.Err()
285303
}
@@ -328,11 +346,8 @@ func backup(
328346
perComponentProgress[component] = fraction
329347
}
330348
select {
331-
// This send to a buffered channel should never block, but in case it does
332-
// we will fall through to the default case.
333349
case perNodeProgressCh <- perComponentProgress:
334-
default:
335-
log.Warningf(ctx, "skipping persisting per component progress as buffered channel was full")
350+
default: // discard the update if the flusher is backed up.
336351
}
337352

338353
// Check if we should persist a checkpoint backup manifest.

0 commit comments

Comments
 (0)