Skip to content

Commit 66c7611

Browse files
committed
Optimize insertion of rechecks.
This minimizes GC overhead, especially from BSON marshaling.
1 parent c79cdb8 commit 66c7611

File tree

2 files changed

+78
-54
lines changed

2 files changed

+78
-54
lines changed

internal/verifier/recheck.go

Lines changed: 68 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ import (
1010
"github.com/10gen/migration-verifier/internal/types"
1111
"github.com/10gen/migration-verifier/internal/util"
1212
"github.com/pkg/errors"
13+
"github.com/samber/lo"
1314
"go.mongodb.org/mongo-driver/bson"
1415
"go.mongodb.org/mongo-driver/bson/bsontype"
1516
"go.mongodb.org/mongo-driver/mongo"
1617
"go.mongodb.org/mongo-driver/mongo/options"
18+
"go.mongodb.org/mongo-driver/x/bsonx/bsoncore"
1719
)
1820

1921
const (
@@ -31,9 +33,22 @@ const (
3133
// sorting by _id will guarantee that all rechecks for a given
3234
// namespace appear consecutively.
3335
type RecheckPrimaryKey struct {
34-
SrcDatabaseName string `bson:"db"`
35-
SrcCollectionName string `bson:"coll"`
36-
DocumentID any `bson:"docID"`
36+
SrcDatabaseName string `bson:"db"`
37+
SrcCollectionName string `bson:"coll"`
38+
DocumentID bson.RawValue `bson:"docID"`
39+
}
40+
41+
var _ bson.Marshaler = &RecheckPrimaryKey{}
42+
43+
func (rk *RecheckPrimaryKey) MarshalBSON() ([]byte, error) {
44+
return bsoncore.NewDocumentBuilder().
45+
AppendString("db", rk.SrcDatabaseName).
46+
AppendString("coll", rk.SrcCollectionName).
47+
AppendValue("docID", bsoncore.Value{
48+
Type: rk.DocumentID.Type,
49+
Data: rk.DocumentID.Value,
50+
}).
51+
Build(), nil
3752
}
3853

3954
// RecheckDoc stores the necessary information to know which documents must be rechecked.
@@ -46,6 +61,15 @@ type RecheckDoc struct {
4661
DataSize int `bson:"dataSize"`
4762
}
4863

64+
var _ bson.Marshaler = &RecheckDoc{}
65+
66+
func (rd *RecheckDoc) MarshalBSON() ([]byte, error) {
67+
return bsoncore.NewDocumentBuilder().
68+
AppendDocument("_id", lo.Must(bson.Marshal(rd.PrimaryKey))).
69+
AppendInt64("dataSize", int64(rd.DataSize)).
70+
Build(), nil
71+
}
72+
4973
// InsertFailedCompareRecheckDocs is for inserting RecheckDocs based on failures during Check.
5074
func (verifier *Verifier) InsertFailedCompareRecheckDocs(
5175
ctx context.Context,
@@ -95,50 +119,16 @@ func (verifier *Verifier) insertRecheckDocs(
95119

96120
genCollection := verifier.getRecheckQueueCollection(generation)
97121

98-
var recheckBatches [][]mongo.WriteModel
99-
var curRechecks []mongo.WriteModel
100-
curBatchSize := 0
101-
for i, dbName := range dbNames {
102-
recheckDoc := RecheckDoc{
103-
PrimaryKey: RecheckPrimaryKey{
104-
SrcDatabaseName: dbName,
105-
SrcCollectionName: collNames[i],
106-
DocumentID: rawDocIDs[i],
107-
},
108-
DataSize: dataSizes[i],
109-
}
110-
111-
recheckRaw, err := bson.Marshal(recheckDoc)
112-
if err != nil {
113-
return errors.Wrapf(err, "marshaling recheck for %#q", dbName+"."+collNames[i])
114-
}
115-
116-
curRechecks = append(
117-
curRechecks,
118-
mongo.NewInsertOneModel().SetDocument(recheckDoc),
119-
)
120-
curBatchSize += len(recheckRaw)
121-
if curBatchSize > recheckBatchByteLimit || len(curRechecks) >= recheckBatchCountLimit {
122-
recheckBatches = append(recheckBatches, curRechecks)
123-
curRechecks = nil
124-
curBatchSize = 0
125-
}
126-
}
127-
128-
if len(curRechecks) > 0 {
129-
recheckBatches = append(recheckBatches, curRechecks)
130-
}
131-
132-
for _, models := range recheckBatches {
122+
sendRechecks := func(rechecks []bson.Raw) {
133123
eg.Go(func() error {
134124

135125
retryer := retry.New()
136126
err := retryer.WithCallback(
137127
func(retryCtx context.Context, _ *retry.FuncInfo) error {
138-
_, err := genCollection.BulkWrite(
128+
_, err := genCollection.InsertMany(
139129
retryCtx,
140-
models,
141-
options.BulkWrite().SetOrdered(false),
130+
lo.ToAnySlice(rechecks),
131+
options.InsertMany().SetOrdered(false),
142132
)
143133

144134
// We expect duplicate-key errors from the above because:
@@ -157,20 +147,54 @@ func (verifier *Verifier) insertRecheckDocs(
157147
// and document sizes probably remain stable(-ish) across updates.
158148
err = util.TolerateSimpleDuplicateKeyInBulk(
159149
verifier.logger,
160-
len(models),
150+
len(rechecks),
161151
err,
162152
)
163153

164154
return err
165155
},
166156
"persisting %d recheck(s)",
167-
len(models),
157+
len(rechecks),
168158
).Run(groupCtx, verifier.logger)
169159

170-
return errors.Wrapf(err, "batch of %d rechecks", len(models))
160+
return errors.Wrapf(err, "batch of %d rechecks", len(rechecks))
171161
})
172162
}
173163

164+
curRechecks := make([]bson.Raw, 0, recheckBatchCountLimit)
165+
curBatchBytes := 0
166+
for i, dbName := range dbNames {
167+
recheckDoc := RecheckDoc{
168+
PrimaryKey: RecheckPrimaryKey{
169+
SrcDatabaseName: dbName,
170+
SrcCollectionName: collNames[i],
171+
DocumentID: rawDocIDs[i],
172+
},
173+
DataSize: dataSizes[i],
174+
}
175+
176+
recheckRaw, err := bson.Marshal(recheckDoc)
177+
if err != nil {
178+
return errors.Wrapf(err, "marshaling recheck for %#q", dbName+"."+collNames[i])
179+
}
180+
181+
curRechecks = append(
182+
curRechecks,
183+
bson.Raw(recheckRaw),
184+
)
185+
186+
curBatchBytes += len(recheckRaw)
187+
if curBatchBytes > recheckBatchByteLimit || len(curRechecks) >= recheckBatchCountLimit {
188+
sendRechecks(curRechecks)
189+
curRechecks = make([]bson.Raw, 0, recheckBatchCountLimit)
190+
curBatchBytes = 0
191+
}
192+
}
193+
194+
if len(curRechecks) > 0 {
195+
sendRechecks(curRechecks)
196+
}
197+
174198
if err := eg.Wait(); err != nil {
175199
return errors.Wrapf(
176200
err,

internal/verifier/recheck_test.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ func (suite *IntegrationTestSuite) TestFailedCompareThenReplace() {
3939
PrimaryKey: RecheckPrimaryKey{
4040
SrcDatabaseName: "the",
4141
SrcCollectionName: "namespace",
42-
DocumentID: "theDocID",
42+
DocumentID: mbson.ToRawValue("theDocID"),
4343
},
4444
},
4545
},
@@ -74,7 +74,7 @@ func (suite *IntegrationTestSuite) TestFailedCompareThenReplace() {
7474
PrimaryKey: RecheckPrimaryKey{
7575
SrcDatabaseName: "the",
7676
SrcCollectionName: "namespace",
77-
DocumentID: "theDocID",
77+
DocumentID: mbson.ToRawValue("theDocID"),
7878
},
7979
},
8080
},
@@ -282,13 +282,13 @@ func (suite *IntegrationTestSuite) TestLargeIDInsertions() {
282282
PrimaryKey: RecheckPrimaryKey{
283283
SrcDatabaseName: "testDB",
284284
SrcCollectionName: "testColl",
285-
DocumentID: id1,
285+
DocumentID: mbson.ToRawValue(id1),
286286
},
287287
}
288288
d2 := d1
289-
d2.PrimaryKey.DocumentID = id2
289+
d2.PrimaryKey.DocumentID = mbson.ToRawValue(id2)
290290
d3 := d1
291-
d3.PrimaryKey.DocumentID = id3
291+
d3.PrimaryKey.DocumentID = mbson.ToRawValue(id3)
292292

293293
results := suite.fetchRecheckDocs(ctx, verifier)
294294
suite.ElementsMatch([]any{d1, d2, d3}, results)
@@ -342,13 +342,13 @@ func (suite *IntegrationTestSuite) TestLargeDataInsertions() {
342342
PrimaryKey: RecheckPrimaryKey{
343343
SrcDatabaseName: "testDB",
344344
SrcCollectionName: "testColl",
345-
DocumentID: id1,
345+
DocumentID: mbson.ToRawValue(id1),
346346
},
347347
}
348348
d2 := d1
349-
d2.PrimaryKey.DocumentID = id2
349+
d2.PrimaryKey.DocumentID = mbson.ToRawValue(id2)
350350
d3 := d1
351-
d3.PrimaryKey.DocumentID = id3
351+
d3.PrimaryKey.DocumentID = mbson.ToRawValue(id3)
352352

353353
results := suite.fetchRecheckDocs(ctx, verifier)
354354
suite.ElementsMatch([]any{d1, d2, d3}, results)
@@ -451,11 +451,11 @@ func (suite *IntegrationTestSuite) TestGenerationalClear() {
451451
PrimaryKey: RecheckPrimaryKey{
452452
SrcDatabaseName: "testDB",
453453
SrcCollectionName: "testColl",
454-
DocumentID: id1,
454+
DocumentID: mbson.ToRawValue(id1),
455455
},
456456
}
457457
d2 := d1
458-
d2.PrimaryKey.DocumentID = id2
458+
d2.PrimaryKey.DocumentID = mbson.ToRawValue(id2)
459459

460460
results := suite.fetchRecheckDocs(ctx, verifier)
461461
suite.Assert().ElementsMatch([]any{d1, d2}, results)

0 commit comments

Comments
 (0)