diff --git a/atomic_counter.go b/atomic_counter.go deleted file mode 100644 index 7c7e196..0000000 --- a/atomic_counter.go +++ /dev/null @@ -1,167 +0,0 @@ -package chotki - -import ( - "context" - "fmt" - "sync" - "sync/atomic" - "time" - - "github.com/drpcorg/chotki/protocol" - "github.com/drpcorg/chotki/rdx" -) - -var ErrNotCounter error = fmt.Errorf("not a counter") -var ErrCounterNotLoaded error = fmt.Errorf("counter not loaded") -var ErrDecrementN error = fmt.Errorf("decrementing natural counter") - -type AtomicCounter struct { - data atomic.Value - db *Chotki - rid rdx.ID - offset uint64 - lock sync.RWMutex - expiration time.Time - updatePeriod time.Duration -} - -type atomicNcounter struct { - theirs uint64 - total atomic.Uint64 -} - -type zpart struct { - total int64 - revision int64 -} - -type atomicZCounter struct { - theirs int64 - part atomic.Pointer[zpart] -} - -// creates counter that has two properties -// - its atomic as long as you use single instance to do all increments, creating multiple instances will break this guarantee -// - it can ease CPU load if updatePeiod > 0, in that case it will not read from db backend -// current value of the counter -// -// Because we use LSM backend writes are cheap, reads are expensive. 
You can trade off up to date value of counter -// for less CPU cycles -func NewAtomicCounter(db *Chotki, rid rdx.ID, offset uint64, updatePeriod time.Duration) *AtomicCounter { - return &AtomicCounter{ - db: db, - rid: rid, - offset: offset, - updatePeriod: updatePeriod, - } -} - -func (a *AtomicCounter) load() (any, error) { - now := time.Now() - if a.data.Load() != nil && now.Sub(a.expiration) < 0 { - return a.data.Load(), nil - } - - a.lock.RUnlock() - a.lock.Lock() - defer func() { - a.lock.Unlock() - a.lock.RLock() - }() - - if a.data.Load() != nil && now.Sub(a.expiration) < 0 { - return a.data.Load(), nil - } - - rdt, tlv, err := a.db.ObjectFieldTLV(a.rid.ToOff(a.offset)) - if err != nil { - return nil, err - } - var data any - switch rdt { - case rdx.ZCounter: - total, mine, rev := rdx.Znative3(tlv, a.db.clock.Src()) - part := zpart{total: total, revision: rev} - c := atomicZCounter{ - theirs: total - mine, - part: atomic.Pointer[zpart]{}, - } - c.part.Store(&part) - data = &c - case rdx.Natural: - total, mine := rdx.Nnative2(tlv, a.db.clock.Src()) - c := atomicNcounter{ - theirs: total - mine, - total: atomic.Uint64{}, - } - c.total.Add(total) - data = &c - default: - return nil, ErrNotCounter - } - a.data.Store(data) - a.expiration = now.Add(a.updatePeriod) - return data, nil -} - -func (a *AtomicCounter) Get(ctx context.Context) (int64, error) { - a.lock.RLock() - defer a.lock.RUnlock() - data, err := a.load() - if err != nil { - return 0, err - } - switch c := data.(type) { - case *atomicNcounter: - return int64(c.total.Load()), nil - case *atomicZCounter: - return c.part.Load().total, nil - default: - return 0, ErrCounterNotLoaded - } -} - -// Loads (if needed) and increments counter -func (a *AtomicCounter) Increment(ctx context.Context, val int64) (int64, error) { - a.lock.RLock() - defer a.lock.RUnlock() - data, err := a.load() - if err != nil { - return 0, err - } - var dtlv []byte - var result int64 - var rdt byte - switch c := data.(type) { - case 
*atomicNcounter: - if val < 0 { - return 0, ErrDecrementN - } - nw := c.total.Add(uint64(val)) - dtlv = rdx.Ntlvt(nw-c.theirs, a.db.clock.Src()) - result = int64(nw) - rdt = rdx.Natural - case *atomicZCounter: - for { - current := c.part.Load() - nw := zpart{ - total: current.total + val, - revision: current.revision + 1, - } - ok := c.part.CompareAndSwap(current, &nw) - if ok { - dtlv = rdx.Ztlvt(nw.total-c.theirs, a.db.clock.Src(), nw.revision) - result = nw.total - rdt = rdx.ZCounter - break - } - } - default: - return 0, ErrCounterNotLoaded - } - changes := make(protocol.Records, 0) - changes = append(changes, protocol.Record('F', rdx.ZipUint64(uint64(a.offset)))) - changes = append(changes, protocol.Record(rdt, dtlv)) - a.db.CommitPacket(ctx, 'E', a.rid.ZeroOff(), changes) - return result, nil -} diff --git a/chotki.go b/chotki.go index f543ad9..351bd6e 100644 --- a/chotki.go +++ b/chotki.go @@ -14,8 +14,15 @@ import ( "github.com/cockroachdb/pebble" "github.com/cockroachdb/pebble/vfs" + "github.com/drpcorg/chotki/chotki_errors" + "github.com/drpcorg/chotki/classes" + "github.com/drpcorg/chotki/counters" + "github.com/drpcorg/chotki/host" + "github.com/drpcorg/chotki/indexes" + "github.com/drpcorg/chotki/network" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" + "github.com/drpcorg/chotki/replication" "github.com/drpcorg/chotki/utils" "github.com/prometheus/client_golang/prometheus" "github.com/puzpuzpuz/xsync/v3" @@ -28,7 +35,6 @@ var ( ErrHookNotFound = errors.New("chotki: hook not found") ErrBadIRecord = errors.New("chotki: bad id-ref record") ErrBadORecord = errors.New("chotki: bad id-ref record") - ErrBadHPacket = errors.New("chotki: bad handshake packet") ErrBadEPacket = errors.New("chotki: bad E packet") ErrBadVPacket = errors.New("chotki: bad V packet") ErrBadYPacket = errors.New("chotki: bad Y packet") @@ -38,20 +44,13 @@ var ( ErrSrcUnknown = errors.New("chotki: source unknown") ErrSyncUnknown = errors.New("chotki: sync session 
unknown") ErrBadRRecord = errors.New("chotki: bad ref record") - ErrClosed = errors.New("chotki: no replica open") ErrBadTypeDescription = errors.New("chotki: bad type description") - ErrObjectUnknown = errors.New("chotki: unknown object") - ErrTypeUnknown = errors.New("chotki: unknown object type") ErrUnknownFieldInAType = errors.New("chotki: unknown field for the type") ErrBadClass = errors.New("chotki: bad class description") ErrOutOfOrder = errors.New("chotki: order fail: sequence gap") ErrCausalityBroken = errors.New("chotki: order fail: refs an unknown op") - - ErrFullscanIndexField = errors.New("chotki: field can't have fullscan index") - ErrHashIndexFieldNotFirst = errors.New("chotki: field can't have hash index if type is not FIRST") - ErrHashIndexUinqueConstraintViolation = errors.New("chotki: hash index unique constraint violation") ) var EventsMetric = prometheus.NewCounter(prometheus.CounterOpts{ @@ -146,7 +145,7 @@ func (o *Options) SetDefaults() { var rdt byte switch key[0] { case 'O': - _, rdt = OKeyIdRdt(key) + _, rdt = host.OKeyIdRdt(key) case 'V': rdt = 'V' case 'I': @@ -188,17 +187,17 @@ type Chotki struct { lock sync.RWMutex commitMutex sync.Mutex db *pebble.DB - net *protocol.Net + net *network.Net dir string opts Options log utils.Logger counterCache sync.Map - indexManager *IndexManager + indexManager *indexes.IndexManager outq *xsync.MapOf[string, protocol.DrainCloser] // queues to broadcast all new packets syncs *xsync.MapOf[rdx.ID, *syncPoint] hooks *xsync.MapOf[rdx.ID, []Hook] - types *xsync.MapOf[rdx.ID, Fields] + types *xsync.MapOf[rdx.ID, classes.Fields] } func Exists(dirname string) (bool, error) { @@ -272,12 +271,12 @@ func Open(dirname string, opts Options) (*Chotki, error) { outq: xsync.NewMapOf[string, protocol.DrainCloser](), syncs: xsync.NewMapOf[rdx.ID, *syncPoint](), hooks: xsync.NewMapOf[rdx.ID, []Hook](), - types: xsync.NewMapOf[rdx.ID, Fields](), + types: xsync.NewMapOf[rdx.ID, classes.Fields](), cancelCtx: cancel, 
waitGroup: &wg, } - cho.net = protocol.NewNet(cho.log, + cho.net = network.NewNet(cho.log, func(name string) protocol.FeedDrainCloserTraced { // new connection queue := utils.NewFDQueue[protocol.Records](cho.opts.BroadcastQueueMaxSize, cho.opts.BroadcastQueueTimeLimit, cho.opts.BroadcastQueueMinBatchSize) @@ -288,15 +287,15 @@ func Open(dirname string, opts Options) (*Chotki, error) { } } - return &Syncer{ + return &replication.Syncer{ Src: cho.src, Host: &cho, - Mode: SyncRWLive, + Mode: replication.SyncRWLive, PingPeriod: cho.opts.PingPeriod, PingWait: cho.opts.PingWait, Name: name, - log: cho.log, - oqueue: queue, + Log: cho.log, + Oqueue: queue, } }, func(name string, p protocol.Traced) { // destroy connection @@ -308,16 +307,16 @@ func Open(dirname string, opts Options) (*Chotki, error) { cho.log.Warn(fmt.Sprintf("closed the old conn to %s", name), "trace_id", p.GetTraceId()) } }, - &protocol.NetTlsConfigOpt{Config: opts.TlsConfig}, - &protocol.NetReadBatchOpt{ + &network.NetTlsConfigOpt{Config: opts.TlsConfig}, + &network.NetReadBatchOpt{ ReadAccumTimeLimit: cho.opts.ReadAccumTimeLimit, BufferMaxSize: cho.opts.ReadMaxBufferSize, BufferMinToProcess: cho.opts.ReadMinBufferSizeToProcess, }, - &protocol.TcpBufferSizeOpt{Read: cho.opts.TcpReadBufferSize, Write: cho.opts.TcpWriteBufferSize}, - &protocol.NetWriteTimeoutOpt{Timeout: cho.opts.WriteTimeout}, + &network.TcpBufferSizeOpt{Read: cho.opts.TcpReadBufferSize, Write: cho.opts.TcpWriteBufferSize}, + &network.NetWriteTimeoutOpt{Timeout: cho.opts.WriteTimeout}, ) - cho.indexManager = newIndexManager(&cho) + cho.indexManager = indexes.NewIndexManager(&cho) wg.Add(1) go func() { defer wg.Done() @@ -394,48 +393,9 @@ func (cho *Chotki) Close() error { return nil } -func (cho *Chotki) Counter(rid rdx.ID, offset uint64, updatePeriod time.Duration) *AtomicCounter { - counter, _ := cho.counterCache.LoadOrStore(rid.ToOff(offset), NewAtomicCounter(cho, rid, offset, updatePeriod)) - return counter.(*AtomicCounter) -} - 
-func (cho *Chotki) KeepAliveLoop() { - var err error - for err == nil { - time.Sleep(time.Second * 30) - err = cho.KeepAlive() - } - if err != ErrClosed { - cho.log.Error(err.Error()) - cho.log.Error("keep alives stop") - } -} - -func (cho *Chotki) KeepAlive() error { - oid := rdx.IDfromSrcPro(cho.src, 0) - oldtlv, err := cho.ObjectRDTFieldTLV(oid.ToOff(YAckOff), 'V') - if err != nil { - return err - } - mysrc := cho.src - newvv, err := cho.VersionVector() - if err != nil { - return err - } - oldvv := make(rdx.VV) - _ = oldvv.PutTLV(oldtlv) - delete(oldvv, mysrc) - delete(newvv, mysrc) - tlv_delta := rdx.VVdelta(oldvv, newvv) - if len(tlv_delta) == 0 { - return nil - } - d := protocol.Records{ - protocol.Record('F', rdx.ZipUint64(2)), - protocol.Record('V', tlv_delta), - } - _, err = cho.CommitPacket(context.Background(), 'E', oid, d) - return err +func (cho *Chotki) Counter(rid rdx.ID, offset uint64, updatePeriod time.Duration) *counters.AtomicCounter { + counter, _ := cho.counterCache.LoadOrStore(rid.ToOff(offset), counters.NewAtomicCounter(cho, rid, offset, updatePeriod)) + return counter.(*counters.AtomicCounter) } // ToyKV convention key, lit O, then O00000-00000000-000 id @@ -451,6 +411,14 @@ func (cho *Chotki) Last() rdx.ID { return cho.last } +func (cho *Chotki) WriteOptions() *pebble.WriteOptions { + return cho.opts.PebbleWriteOptions +} + +func (cho *Chotki) Logger() utils.Logger { + return cho.log +} + func (cho *Chotki) Snapshot() pebble.Reader { return cho.db.NewSnapshot() } @@ -505,7 +473,7 @@ func (cho *Chotki) Disconnect(addr string) error { } func (cho *Chotki) VersionVector() (vv rdx.VV, err error) { - val, clo, err := cho.db.Get(VKey0) + val, clo, err := cho.db.Get(host.VKey0) if err == nil { vv = make(rdx.VV) err = vv.PutTLV(val) @@ -579,7 +547,7 @@ func (cho *Chotki) CommitPacket(ctx context.Context, lit byte, ref rdx.ID, body DrainTime.WithLabelValues("commit lock").Observe(float64(time.Since(now)) / float64(time.Millisecond)) if cho.db == 
nil { - return rdx.BadId, ErrClosed + return rdx.BadId, chotki_errors.ErrClosed } id = cho.last.IncPro(1).ZeroOff() i := protocol.Record('I', id.ZipBytes()) @@ -593,12 +561,12 @@ func (cho *Chotki) CommitPacket(ctx context.Context, lit byte, ref rdx.ID, body } type NetCollector struct { - net *protocol.Net + net *network.Net read_buffers_size *prometheus.Desc write_batch_size *prometheus.Desc } -func NewNetCollector(net *protocol.Net) *NetCollector { +func NewNetCollector(net *network.Net) *NetCollector { return &NetCollector{ net: net, read_buffers_size: prometheus.NewDesc("chotki_net_read_buffer_size", "", []string{"peer"}, prometheus.Labels{}), @@ -672,15 +640,15 @@ func (cho *Chotki) Metrics() []prometheus.Collector { EventsBatchSize, NewPebbleCollector(cho.db), NewChotkiCollector(cho), - OpenedIterators, - OpenedSnapshots, - SessionsStates, + replication.OpenedIterators, + replication.OpenedSnapshots, + replication.SessionsStates, DrainTime, - ReindexTaskCount, - ReindexResults, - ReindexDuration, - ReindexCount, - ReindexTaskStates, + indexes.ReindexTaskCount, + indexes.ReindexResults, + indexes.ReindexDuration, + indexes.ReindexCount, + indexes.ReindexTaskStates, } } @@ -692,7 +660,7 @@ func (cho *Chotki) drain(ctx context.Context, recs protocol.Records) (err error) break } - lit, id, ref, body, parseErr := ParsePacket(packet) + lit, id, ref, body, parseErr := replication.ParsePacket(packet) if parseErr != nil { cho.log.WarnCtx(ctx, "bad packet", "err", parseErr) return parseErr @@ -797,15 +765,15 @@ func (cho *Chotki) Drain(ctx context.Context, recs protocol.Records) (err error) cho.lock.RLock() defer cho.lock.RUnlock() if cho.db == nil { - return ErrClosed + return chotki_errors.ErrClosed } EventsBatchSize.Observe(float64(len(recs))) return cho.drain(ctx, recs) } func dumpKVString(key, value []byte) (str string) { - if len(key) == LidLKeyLen { - id, rdt := OKeyIdRdt(key) + if len(key) == host.LidLKeyLen { + id, rdt := host.OKeyIdRdt(key) str = 
fmt.Sprintf("%s.%c:\t%s", id, rdt, rdx.Xstring(rdt, value)) } return @@ -830,7 +798,7 @@ func (cho *Chotki) DumpVV(writer io.Writer) { } i := cho.db.NewIter(&io) defer i.Close() - for i.SeekGE(VKey0); i.Valid(); i.Next() { + for i.SeekGE(host.VKey0); i.Valid(); i.Next() { id := rdx.IDFromBytes(i.Key()[1:]) vv := make(rdx.VV) _ = vv.PutTLV(i.Value()) diff --git a/chotki_errors/errors.go b/chotki_errors/errors.go new file mode 100644 index 0000000..34151fe --- /dev/null +++ b/chotki_errors/errors.go @@ -0,0 +1,14 @@ +package chotki_errors + +import "errors" + +var ( + ErrObjectUnknown = errors.New("chotki: unknown object") + ErrTypeUnknown = errors.New("chotki: unknown object type") + + ErrFullscanIndexField = errors.New("chotki: field can't have fullscan index") + ErrHashIndexFieldNotFirst = errors.New("chotki: field can't have hash index if type is not FIRST") + ErrHashIndexUinqueConstraintViolation = errors.New("chotki: hash index unique constraint violation") + ErrBadHPacket = errors.New("chotki: bad handshake packet") + ErrClosed = errors.New("chotki: no replica open") +) diff --git a/chotki_index_test.go b/chotki_index_test.go index 9a84a7a..47092fd 100644 --- a/chotki_index_test.go +++ b/chotki_index_test.go @@ -6,13 +6,16 @@ import ( "testing" "time" + "github.com/drpcorg/chotki/chotki_errors" + "github.com/drpcorg/chotki/classes" "github.com/drpcorg/chotki/rdx" + testutils "github.com/drpcorg/chotki/test_utils" "github.com/drpcorg/chotki/utils" "github.com/stretchr/testify/assert" ) -var SchemaIndex = []Field{ - {Name: "test", RdxType: rdx.String, Index: HashIndex}, +var SchemaIndex = []classes.Field{ + {Name: "test", RdxType: rdx.String, Index: classes.HashIndex}, } func TestFullScanIndexSync(t *testing.T) { @@ -52,8 +55,7 @@ func TestFullScanIndexSync(t *testing.T) { data = append(data, *item) } assert.Equal(t, []Test{{Test: "test1"}}, data, "index in sync check after local update") - - syncData(a, b) + testutils.SyncData(a, b) borm := b.ObjectMapper() 
defer borm.Close() @@ -112,7 +114,7 @@ func TestHashIndexSyncCreateObject(t *testing.T) { assert.NoError(t, err) // sync data before creating object - syncData(a, b) + testutils.SyncData(a, b) aorm := a.ObjectMapper() defer aorm.Close() @@ -128,7 +130,7 @@ func TestHashIndexSyncCreateObject(t *testing.T) { assert.NoError(t, err) assert.Equal(t, &Test{Test: "test1"}, test1data, "index in sync check after local update") - syncData(a, b) + testutils.SyncData(a, b) borm := b.ObjectMapper() defer borm.Close() @@ -193,7 +195,7 @@ func TestHashIndexSyncEditObject(t *testing.T) { aorm.UpdateAll() // sync data before creating object - syncData(a, b) + testutils.SyncData(a, b) test1data, err := GetByHash[*Test](aorm, cid, 1, []byte("test1")) assert.NoError(t, err) @@ -208,9 +210,9 @@ func TestHashIndexSyncEditObject(t *testing.T) { assert.NoError(t, err) assert.Equal(t, &Test{Test: "test10"}, test1data, "index in sync after local object edit") _, err = GetByHash[*Test](aorm, cid, 1, []byte("test1")) - assert.Error(t, ErrObjectUnknown, err) + assert.Error(t, chotki_errors.ErrObjectUnknown, err) - syncData(a, b) + testutils.SyncData(a, b) borm := b.ObjectMapper() defer borm.Close() @@ -273,7 +275,7 @@ func TestHashIndexRepairIndex(t *testing.T) { aorm.UpdateAll() - syncData(a, b) + testutils.SyncData(a, b) time.Sleep(time.Second * 1) borm := b.ObjectMapper() @@ -312,7 +314,7 @@ func TestHashIndexUniqueConstraint(t *testing.T) { ob2 := Test{Test: "test1"} err = aorm.New(context.Background(), cid, &ob2) assert.Error(t, err, "should fail when creating object with duplicate indexed field value") - assert.ErrorIs(t, err, ErrHashIndexUinqueConstraintViolation) + assert.ErrorIs(t, err, chotki_errors.ErrHashIndexUinqueConstraintViolation) // Verify only one object exists data := make([]Test, 0) diff --git a/chotki_test.go b/chotki_test.go index 093ed60..1b0cc99 100644 --- a/chotki_test.go +++ b/chotki_test.go @@ -11,8 +11,12 @@ import ( "time" "github.com/cockroachdb/pebble" + 
"github.com/drpcorg/chotki/classes" + "github.com/drpcorg/chotki/host" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" + "github.com/drpcorg/chotki/replication" + testutils "github.com/drpcorg/chotki/test_utils" "github.com/drpcorg/chotki/utils" "github.com/stretchr/testify/assert" ) @@ -46,12 +50,12 @@ func testdirs(origs ...uint64) ([]string, func()) { func TestChotki_Debug(t *testing.T) { oid := rdx.IDFromSrcSeqOff(0x1e, 0x1ab, 0) - key := OKey(oid.ToOff(1), 'I') + key := host.OKey(oid.ToOff(1), 'I') value := rdx.Itlv(-13) str := dumpKVString(key, value) assert.Equal(t, "1e-1ab-1.I:\t-13", string(str)) - skey := OKey(oid.ToOff(2), 'S') + skey := host.OKey(oid.ToOff(2), 'S') svalue := rdx.Stlv("funny\tstring\n") sstr := dumpKVString(skey, svalue) assert.Equal(t, "1e-1ab-2.S:\t\"funny\\tstring\\n\"", string(sstr)) @@ -85,7 +89,7 @@ func TestChotki_Sync(t *testing.T) { b, err := Open(dirs[1], Options{Src: 0xb, Name: "test replica B"}) assert.Nil(t, err) - syncData(a, b) + testutils.SyncData(a, b) bvv, err := b.VersionVector() assert.Nil(t, err) @@ -115,14 +119,14 @@ func TestChotki_SyncEdit(t *testing.T) { assert.NoError(t, err) objectId := orm.FindID(obj) orm.Close() - syncData(a, b) + testutils.SyncData(a, b) orm = a.ObjectMapper() resa, err := orm.Load(objectId, &Test{}) assert.NoError(t, err) resa.(*Test).Test = "edited text" assert.NoError(t, orm.Save(context.Background(), resa)) - syncData(a, b) + testutils.SyncData(a, b) borm := b.ObjectMapper() res, err := borm.Load(objectId, &Test{}) @@ -144,44 +148,44 @@ func TestChotki_SyncLivePingsOk(t *testing.T) { b, err := Open(dirs[1], Options{Src: 0xb, Name: "test replica B", Logger: utils.NewDefaultLogger(slog.LevelInfo)}) assert.Nil(t, err) - synca := Syncer{ + synca := replication.Syncer{ Host: a, PingPeriod: 100 * time.Millisecond, PingWait: 200 * time.Millisecond, - Mode: SyncRWLive, Name: "a", + Mode: replication.SyncRWLive, Name: "a", Src: a.src, - log: 
utils.NewDefaultLogger(slog.LevelDebug), - oqueue: &FeedCloserTest{}, + Log: utils.NewDefaultLogger(slog.LevelDebug), + Oqueue: &FeedCloserTest{}, } - syncb := Syncer{ + syncb := replication.Syncer{ Host: b, PingPeriod: 100 * time.Second, - Mode: SyncRWLive, + Mode: replication.SyncRWLive, PingWait: 3 * time.Second, Name: "b", Src: b.src, - log: utils.NewDefaultLogger(slog.LevelDebug), - oqueue: &FeedCloserTest{}, + Log: utils.NewDefaultLogger(slog.LevelDebug), + Oqueue: &FeedCloserTest{}, } ctx, cancel := context.WithCancel(context.Background()) go protocol.PumpCtxCallback(ctx, &synca, &syncb, func() bool { - return synca.GetFeedState() != SendPing + return synca.GetFeedState() != replication.SendPing }) go protocol.PumpCtx(ctx, &syncb, &synca) time.Sleep(time.Millisecond * 10) - assert.Equal(t, SendLive, synca.GetFeedState()) - assert.Equal(t, SendLive, syncb.GetFeedState()) - assert.Equal(t, SendDiff, synca.GetDrainState()) - assert.Equal(t, SendDiff, syncb.GetDrainState()) + assert.Equal(t, replication.SendLive, synca.GetFeedState()) + assert.Equal(t, replication.SendLive, syncb.GetFeedState()) + assert.Equal(t, replication.SendDiff, synca.GetDrainState()) + assert.Equal(t, replication.SendDiff, syncb.GetDrainState()) time.Sleep(time.Millisecond * 110) - assert.Equal(t, SendPing, synca.GetFeedState()) + assert.Equal(t, replication.SendPing, synca.GetFeedState()) go protocol.PumpCtx(ctx, &synca, &syncb) time.Sleep(time.Millisecond * 90) - assert.Equal(t, SendLive, synca.GetFeedState()) - assert.Equal(t, SendLive, syncb.GetFeedState()) + assert.Equal(t, replication.SendLive, synca.GetFeedState()) + assert.Equal(t, replication.SendLive, syncb.GetFeedState()) cancel() // wait until everything stopped time.Sleep(time.Millisecond * 100) @@ -201,48 +205,48 @@ func TestChotki_SyncLivePingsFail(t *testing.T) { b, err := Open(dirs[1], Options{Src: 0xb, Name: "test replica B", Logger: utils.NewDefaultLogger(slog.LevelInfo)}) assert.Nil(t, err) - synca := Syncer{ + synca 
:= replication.Syncer{ Host: a, PingPeriod: 100 * time.Millisecond, PingWait: 100 * time.Millisecond, - Mode: SyncRWLive, Name: "a", + Mode: replication.SyncRWLive, Name: "a", Src: a.src, - log: utils.NewDefaultLogger(slog.LevelDebug), - oqueue: &FeedCloserTest{}, + Log: utils.NewDefaultLogger(slog.LevelDebug), + Oqueue: &FeedCloserTest{}, } - syncb := Syncer{ + syncb := replication.Syncer{ Host: b, PingPeriod: 100 * time.Second, - Mode: SyncRWLive, + Mode: replication.SyncRWLive, PingWait: 3 * time.Second, Name: "b", Src: b.src, - log: utils.NewDefaultLogger(slog.LevelDebug), - oqueue: &FeedCloserTest{}, + Log: utils.NewDefaultLogger(slog.LevelDebug), + Oqueue: &FeedCloserTest{}, } ctx, cancel := context.WithCancel(context.Background()) go protocol.PumpCtxCallback(ctx, &synca, &syncb, func() bool { - return synca.GetFeedState() != SendPing + return synca.GetFeedState() != replication.SendPing }) go protocol.PumpCtxCallback(ctx, &syncb, &synca, func() bool { - return syncb.GetFeedState() != SendPong + return syncb.GetFeedState() != replication.SendPong }) time.Sleep(time.Millisecond * 10) - assert.Equal(t, SendLive, synca.GetFeedState()) - assert.Equal(t, SendLive, syncb.GetFeedState()) - assert.Equal(t, SendDiff, synca.GetDrainState()) - assert.Equal(t, SendDiff, syncb.GetDrainState()) + assert.Equal(t, replication.SendLive, synca.GetFeedState()) + assert.Equal(t, replication.SendLive, syncb.GetFeedState()) + assert.Equal(t, replication.SendDiff, synca.GetDrainState()) + assert.Equal(t, replication.SendDiff, syncb.GetDrainState()) time.Sleep(time.Millisecond * 110) - assert.Equal(t, SendPing, synca.GetFeedState()) + assert.Equal(t, replication.SendPing, synca.GetFeedState()) go protocol.PumpCtx(ctx, &synca, &syncb) time.Sleep(time.Millisecond * 200) - assert.Equal(t, SendNone, synca.GetFeedState()) - assert.Equal(t, SendPong, syncb.GetFeedState()) - assert.Equal(t, SendDiff, synca.GetDrainState()) - assert.Equal(t, SendNone, syncb.GetDrainState()) + 
assert.Equal(t, replication.SendNone, synca.GetFeedState()) + assert.Equal(t, replication.SendPong, syncb.GetFeedState()) + assert.Equal(t, replication.SendDiff, synca.GetDrainState()) + assert.Equal(t, replication.SendNone, syncb.GetDrainState()) cancel() syncb.Close() @@ -267,7 +271,7 @@ func TestChotki_SyncGlobals(t *testing.T) { b, err := Open(dirs[1], Options{Src: 0xb, Name: "test replica B"}) assert.Nil(t, err) - syncData(a, b) + testutils.SyncData(a, b) names, err := b.MapTRField(IdNames) assert.Nil(t, err) @@ -332,7 +336,7 @@ func TestChotki_CheckMdeltaTR(t *testing.T) { b, err := Open(dirs[1], Options{Src: 0xb, Name: "test replica B"}) assert.Nil(t, err) - syncData(a, b) + testutils.SyncData(a, b) //check on second replica names, err = b.MapTRField(IdNames) @@ -345,38 +349,6 @@ func TestChotki_CheckMdeltaTR(t *testing.T) { _ = b.Close() } -func syncData(a, b *Chotki) error { - synca := Syncer{ - Host: a, - Mode: SyncRW, - Name: "a", - WaitUntilNone: time.Millisecond, - Src: a.src, - log: utils.NewDefaultLogger(slog.LevelError), - PingWait: time.Second, - } - syncb := Syncer{ - Host: b, - Mode: SyncRW, - WaitUntilNone: time.Millisecond, - Name: "b", - Src: b.src, - log: utils.NewDefaultLogger(slog.LevelError), - PingWait: time.Second, - } - defer syncb.Close() - defer synca.Close() - // send handshake from b to a - err := protocol.Relay(&syncb, &synca) - if err != nil { - return err - } - go protocol.Pump(&syncb, &synca) - // send data a -> b - return protocol.Pump(&synca, &syncb) - -} - func TestChotki_Sync3(t *testing.T) { dirs, clear := testdirs(0xa, 0xb, 0xc) defer clear() @@ -396,8 +368,8 @@ func TestChotki_Sync3(t *testing.T) { assert.NoError(t, err) // sync class a -> b -> c - assert.Equal(t, io.EOF, syncData(a, b)) - assert.Equal(t, io.EOF, syncData(b, c)) + assert.Equal(t, io.EOF, testutils.SyncData(a, b)) + assert.Equal(t, io.EOF, testutils.SyncData(b, c)) for _, db := range []*Chotki{a, b, c} { obj := &Test{ @@ -410,9 +382,9 @@ func 
TestChotki_Sync3(t *testing.T) { orm.Close() } - assert.Equal(t, io.EOF, syncData(b, c)) - assert.Equal(t, io.EOF, syncData(a, b)) - assert.Equal(t, io.EOF, syncData(b, c)) + assert.Equal(t, io.EOF, testutils.SyncData(b, c)) + assert.Equal(t, io.EOF, testutils.SyncData(a, b)) + assert.Equal(t, io.EOF, testutils.SyncData(b, c)) for _, db := range []*Chotki{a, b, c} { orm := db.ObjectMapper() @@ -429,7 +401,7 @@ func TestChotki_Sync3(t *testing.T) { _ = c.Close() } -var Schema = []Field{ +var Schema = []classes.Field{ {Name: "test", RdxType: rdx.String}, } @@ -484,18 +456,18 @@ func TestChotki_ClassEdit(t *testing.T) { sc, err := a.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema), sc[1:]) + assert.Equal(t, classes.Fields(Schema), sc[1:]) _, err = b.ClassFields(cid) assert.Error(t, err) - syncData(a, b) + testutils.SyncData(a, b) sc, err = b.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema), sc[1:]) + assert.Equal(t, classes.Fields(Schema), sc[1:]) - Schema2 := []Field{ + Schema2 := []classes.Field{ {Name: "test", RdxType: rdx.String, Offset: 0}, {Name: "test2", RdxType: rdx.Integer, Offset: 1}, } @@ -505,13 +477,13 @@ func TestChotki_ClassEdit(t *testing.T) { sc, err = a.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema2), sc[1:]) + assert.Equal(t, classes.Fields(Schema2), sc[1:]) - syncData(a, b) + testutils.SyncData(a, b) sc, err = b.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema2), sc[1:]) + assert.Equal(t, classes.Fields(Schema2), sc[1:]) _ = a.Close() _ = b.Close() @@ -535,30 +507,30 @@ func TestChotki_ClassEdit_Live(t *testing.T) { sc, err := a.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema), sc[1:]) + assert.Equal(t, classes.Fields(Schema), sc[1:]) _, err = b.ClassFields(cid) assert.Error(t, err) - synca := Syncer{ + synca := replication.Syncer{ Host: a, PingPeriod: 100 * time.Second, PingWait: 3 * time.Second, - Mode: SyncRWLive, + Mode: 
replication.SyncRWLive, Name: "a", Src: a.src, - log: utils.NewDefaultLogger(slog.LevelDebug), - oqueue: &FeedCloserTest{}, + Log: utils.NewDefaultLogger(slog.LevelDebug), + Oqueue: &FeedCloserTest{}, } - syncb := Syncer{ + syncb := replication.Syncer{ Host: b, PingPeriod: 100 * time.Second, - Mode: SyncRWLive, + Mode: replication.SyncRWLive, PingWait: 3 * time.Second, Name: "b", Src: b.src, - log: utils.NewDefaultLogger(slog.LevelDebug), - oqueue: &FeedCloserTest{}, + Log: utils.NewDefaultLogger(slog.LevelDebug), + Oqueue: &FeedCloserTest{}, } ctx, cancel := context.WithCancel(context.Background()) @@ -572,9 +544,9 @@ func TestChotki_ClassEdit_Live(t *testing.T) { sc, err = b.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema), sc[1:]) + assert.Equal(t, classes.Fields(Schema), sc[1:]) - Schema2 := []Field{ + Schema2 := []classes.Field{ {Name: "test", RdxType: rdx.String, Offset: 0}, {Name: "test2", RdxType: rdx.Integer, Offset: 1}, } @@ -584,13 +556,13 @@ func TestChotki_ClassEdit_Live(t *testing.T) { sc, err = a.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema2), sc[1:]) + assert.Equal(t, classes.Fields(Schema2), sc[1:]) time.Sleep(10 * time.Millisecond) sc, err = b.ClassFields(cid) assert.NoError(t, err) - assert.Equal(t, Fields(Schema2), sc[1:]) + assert.Equal(t, classes.Fields(Schema2), sc[1:]) syncb.Close() synca.Close() diff --git a/classes/fields.go b/classes/fields.go new file mode 100644 index 0000000..66f6c9e --- /dev/null +++ b/classes/fields.go @@ -0,0 +1,70 @@ +package classes + +// A class contains a number of fields. Each Field has +// some RDT type. A class can inherit another class. +// New fields can be appended to a class, but never removed. +// Max number of fields is 128, max inheritance depth 32. +// When stored, a class is an append-only sequence of Ts. +// The syntax for each T: "XName", where X is the RDT. +// For the map types, can use "MSS_Name" or similar. +// Each field has an Offset. 
The Offset+RdxType pair is the +// *actual key* for the field in the database. +// Entries having identical Offset+RdxType are considered *renames*! + +import "unicode/utf8" + +type IndexType byte + +const ( + HashIndex IndexType = 'H' + FullscanIndex IndexType = 'F' +) + +type Field struct { + Offset int64 + Name string + RdxType byte + RdxTypeExt []byte + Index IndexType +} + +// Fields +type Fields []Field + +func (f Field) Valid() bool { + for _, l := range f.Name { // has unsafe chars + if l < ' ' { + return false + } + } + + return (f.RdxType >= 'A' && f.RdxType <= 'Z' && + len(f.Name) > 0 && utf8.ValidString(f.Name)) +} + +func (fs Fields) MaxOffset() (off int64) { + for _, f := range fs { + if f.Offset > off { + off = f.Offset + } + } + return +} + +func (f Fields) FindRdtOff(rdx byte, off int64) int { + for i := 0; i < len(f); i++ { + if f[i].RdxType == rdx && f[i].Offset == off { + return i + } + } + return -1 +} + +func (f Fields) FindName(name string) (ndx int) { // fixme double naming? 
+ for i := 0; i < len(f); i++ { + if f[i].Name == name { + return i + } + } + return -1 +} diff --git a/classes/parse.go b/classes/parse.go new file mode 100644 index 0000000..cbf9d79 --- /dev/null +++ b/classes/parse.go @@ -0,0 +1,32 @@ +package classes + +import "github.com/drpcorg/chotki/rdx" + +func ParseClass(tlv []byte) (fields Fields) { + it := rdx.FIRSTIterator{TLV: tlv} + fields = append(fields, Field{ // todo inheritance + Offset: 0, + Name: "_ref", + RdxType: rdx.Reference, + }) + for it.Next() { + lit, t, name := it.ParsedValue() + if lit != rdx.Term || len(name) == 0 { + break // todo unique names etc + } + rdt := rdx.String + index := IndexType(0) + if name[0] >= 'A' && name[0] <= 'Z' { + rdt = name[0] + index = IndexType(name[1]) + name = name[2:] + } + fields = append(fields, Field{ + Offset: t.Rev, + RdxType: rdt, + Name: string(name), + Index: index, + }) + } + return +} diff --git a/counters/atomic_counter.go b/counters/atomic_counter.go new file mode 100644 index 0000000..802e786 --- /dev/null +++ b/counters/atomic_counter.go @@ -0,0 +1,244 @@ +// Package chotki provides AtomicCounter - a high-performance atomic counter implementation +// for distributed systems with CRDT semantics. +// +// # AtomicCounter Architecture +// +// AtomicCounter provides atomic increment operations in distributed environments while +// optimizing performance through intelligent caching. It supports Natural (increment-only) +// and ZCounter (two-way) types with CRDT merge semantics. +// +// ## Core Design Principle +// +// The counter trades CPU usage for data freshness, but **only for data from other replicas**. +// All writes to the current replica are immediately reflected in the counter value. +// Caching only affects how frequently we read data that arrived via synchronization. +// +// ## How It Works +// +// The counter uses a lazy loading pattern with time-based caching. When data is requested: +// +// 1. 
**Cache Check**: If cached data hasn't expired, return it immediately +// 2. **Database Load**: Otherwise, load fresh data from the LSM database +// 3. **Parse & Cache**: Parse TLV data into internal structures and cache with expiration +// +// For increments, the process is: +// +// 1. **Load Data**: Get current counter state (cached or from DB) +// 2. **Atomic Update**: Use Go's atomic primitives to update the value +// 3. **Generate TLV**: Create TLV records for persistence +// 4. **Commit**: Write changes to database with CRDT merge semantics +// +// ## Internal Structure +// +// The counter maintains two internal representations: +// +// - **atomicNcounter**: For Natural counters, uses atomic.Uint64 for thread-safe increments +// - **atomicZCounter**: For ZCounter, uses atomic.Pointer with revision tracking for conflict resolution +// +// ## Performance Trade-offs +// +// The design trades CPU usage for freshness of **synchronized data from other replicas**. +// With updatePeriod > 0, the counter caches data to avoid expensive database reads, +// but may return slightly stale values from other replicas. Local writes are always +// immediately visible. With updatePeriod = 0, it always reads fresh synchronized data. +// +// ## Thread Safety +// +// Operations are atomic when using a single instance. Multiple instances may have +// race conditions due to the distributed nature of the system. 
+// +// ## Example: Cache vs Local Writes +// +// ```go +// counter := NewAtomicCounter(db, objectID, fieldOffset, 1*time.Second) +// +// // Local write - immediately visible +// counter.Increment(ctx, 5) // Value: 5 +// value, _ := counter.Get(ctx) // Returns 5 immediately +// +// // After sync from other replica (value: 10) +// // With cache: may still return 5 until cache expires +// // Without cache: immediately returns 15 (5 + 10) +// ``` + +package counters + +import ( + "context" + "fmt" + "sync" + "sync/atomic" + "time" + + "github.com/drpcorg/chotki/host" + "github.com/drpcorg/chotki/protocol" + "github.com/drpcorg/chotki/rdx" +) + +var ErrNotCounter error = fmt.Errorf("not a counter") +var ErrCounterNotLoaded error = fmt.Errorf("counter not loaded") +var ErrDecrementN error = fmt.Errorf("decrementing natural counter") + +type AtomicCounter struct { + data atomic.Value + db host.Host + rid rdx.ID + offset uint64 + lock sync.RWMutex + expiration time.Time + updatePeriod time.Duration +} + +type atomicNcounter struct { + theirs uint64 + total atomic.Uint64 +} + +type zpart struct { + total int64 + revision int64 +} + +type atomicZCounter struct { + theirs int64 + part atomic.Pointer[zpart] +} + +// NewAtomicCounter creates a new atomic counter instance. +// +// The counter uses lazy loading with time-based caching. When updatePeriod > 0, +// data is cached to avoid expensive database reads, but may return stale values. +// When updatePeriod = 0, fresh data is always read from the database. +func NewAtomicCounter(db host.Host, rid rdx.ID, offset uint64, updatePeriod time.Duration) *AtomicCounter { + return &AtomicCounter{ + db: db, + rid: rid, + offset: offset, + updatePeriod: updatePeriod, + } +} + +// load retrieves and caches counter data from the database. +// +// Uses double-checked locking: first checks cache without lock, then acquires +// write lock only if cache is expired. 
Loads TLV data from database and parses +// into internal structures (atomicNcounter for Natural, atomicZCounter for ZCounter). +// This method only affects how frequently we read synchronized data from other replicas. +// Local writes are always immediately visible regardless of cache state. +func (a *AtomicCounter) load() (any, error) { + now := time.Now() + if a.data.Load() != nil && now.Sub(a.expiration) < 0 { + return a.data.Load(), nil + } + + a.lock.RUnlock() + a.lock.Lock() + defer func() { + a.lock.Unlock() + a.lock.RLock() + }() + + if a.data.Load() != nil && now.Sub(a.expiration) < 0 { + return a.data.Load(), nil + } + + rdt, tlv, err := a.db.ObjectFieldTLV(a.rid.ToOff(a.offset)) + if err != nil { + return nil, err + } + var data any + switch rdt { + case rdx.ZCounter: + total, mine, rev := rdx.Znative3(tlv, a.db.Source()) + part := zpart{total: total, revision: rev} + c := atomicZCounter{ + theirs: total - mine, + part: atomic.Pointer[zpart]{}, + } + c.part.Store(&part) + data = &c + case rdx.Natural: + total, mine := rdx.Nnative2(tlv, a.db.Source()) + c := atomicNcounter{ + theirs: total - mine, + total: atomic.Uint64{}, + } + c.total.Add(total) + data = &c + default: + return nil, ErrNotCounter + } + a.data.Store(data) + a.expiration = now.Add(a.updatePeriod) + return data, nil +} + +// Get retrieves the current value of the counter. +// +// Acquires read lock, loads data (cached or from DB), and returns the total value. +// For Natural counters returns sum of all replica contributions, for ZCounter returns current total. 
+func (a *AtomicCounter) Get(ctx context.Context) (int64, error) { + a.lock.RLock() + defer a.lock.RUnlock() + data, err := a.load() + if err != nil { + return 0, err + } + switch c := data.(type) { + case *atomicNcounter: + return int64(c.total.Load()), nil + case *atomicZCounter: + return c.part.Load().total, nil + default: + return 0, ErrCounterNotLoaded + } +} + +// Increment atomically increments the counter by the specified value. +// +// Loads current data, performs atomic update using Go primitives (atomic.Uint64 for Natural, +// CompareAndSwap for ZCounter), generates TLV data, and commits to database with CRDT semantics. +// Natural counters only allow positive increments, ZCounter supports both positive and negative. +func (a *AtomicCounter) Increment(ctx context.Context, val int64) (int64, error) { + a.lock.RLock() + defer a.lock.RUnlock() + data, err := a.load() + if err != nil { + return 0, err + } + var dtlv []byte + var result int64 + var rdt byte + switch c := data.(type) { + case *atomicNcounter: + if val < 0 { + return 0, ErrDecrementN + } + nw := c.total.Add(uint64(val)) + dtlv = rdx.Ntlvt(nw-c.theirs, a.db.Source()) + result = int64(nw) + rdt = rdx.Natural + case *atomicZCounter: + for { + current := c.part.Load() + nw := zpart{ + total: current.total + val, + revision: current.revision + 1, + } + ok := c.part.CompareAndSwap(current, &nw) + if ok { + dtlv = rdx.Ztlvt(nw.total-c.theirs, a.db.Source(), nw.revision) + result = nw.total + rdt = rdx.ZCounter + break + } + } + default: + return 0, ErrCounterNotLoaded + } + changes := make(protocol.Records, 0) + changes = append(changes, protocol.Record('F', rdx.ZipUint64(uint64(a.offset)))) + changes = append(changes, protocol.Record(rdt, dtlv)) + a.db.CommitPacket(ctx, 'E', a.rid.ZeroOff(), changes) + return result, nil +} diff --git a/atomic_counter_test.go b/counters/atomic_counter_test.go similarity index 75% rename from atomic_counter_test.go rename to counters/atomic_counter_test.go index 
a7f4c39..cb0e574 100644 --- a/atomic_counter_test.go +++ b/counters/atomic_counter_test.go @@ -1,4 +1,4 @@ -package chotki +package counters_test import ( "context" @@ -8,8 +8,12 @@ import ( "time" "github.com/cockroachdb/pebble" + "github.com/drpcorg/chotki" + "github.com/drpcorg/chotki/classes" + "github.com/drpcorg/chotki/counters" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" + testutils "github.com/drpcorg/chotki/test_utils" "github.com/stretchr/testify/assert" ) @@ -17,21 +21,21 @@ func TestAtomicCounter(t *testing.T) { dir, err := os.MkdirTemp("", "*") assert.NoError(t, err) - a, err := Open(dir, Options{ + a, err := chotki.Open(dir, chotki.Options{ Src: 0x1a, Name: "test replica", Options: pebble.Options{ErrorIfExists: true}, }) assert.NoError(t, err) - cid, err := a.NewClass(context.Background(), rdx.ID0, Field{Name: "test", RdxType: rdx.Natural}) + cid, err := a.NewClass(context.Background(), rdx.ID0, classes.Field{Name: "test", RdxType: rdx.Natural}) assert.NoError(t, err) rid, err := a.NewObjectTLV(context.Background(), cid, protocol.Records{protocol.Record('N', rdx.Ntlv(0))}) assert.NoError(t, err) - counterA := NewAtomicCounter(a, rid, 1, 0) - counterB := NewAtomicCounter(a, rid, 1, 0) + counterA := counters.NewAtomicCounter(a, rid, 1, 0) + counterB := counters.NewAtomicCounter(a, rid, 1, 0) res, err := counterA.Increment(context.Background(), 1) assert.NoError(t, err) @@ -50,7 +54,7 @@ func TestAtomicCounterWithPeriodicUpdate(t *testing.T) { dira, err := os.MkdirTemp("", "*") assert.NoError(t, err) - a, err := Open(dira, Options{ + a, err := chotki.Open(dira, chotki.Options{ Src: 0x1a, Name: "test replica", Options: pebble.Options{ErrorIfExists: true}, @@ -60,7 +64,7 @@ func TestAtomicCounterWithPeriodicUpdate(t *testing.T) { dirb, err := os.MkdirTemp("", "*") assert.NoError(t, err) - b, err := Open(dirb, Options{ + b, err := chotki.Open(dirb, chotki.Options{ Src: 0x1b, Name: "test replica2", Options: 
pebble.Options{ErrorIfExists: true}, @@ -69,8 +73,8 @@ func TestAtomicCounterWithPeriodicUpdate(t *testing.T) { cid, err := a.NewClass( context.Background(), rdx.ID0, - Field{Name: "test", RdxType: rdx.Natural}, - Field{Name: "test2", RdxType: rdx.ZCounter}, + classes.Field{Name: "test", RdxType: rdx.Natural}, + classes.Field{Name: "test2", RdxType: rdx.ZCounter}, ) assert.NoError(t, err) @@ -88,22 +92,20 @@ func TestAtomicCounterWithPeriodicUpdate(t *testing.T) { for i := 1; i <= 2; i++ { - counterA := NewAtomicCounter(a, rid, uint64(i), 100*time.Millisecond) - counterB := NewAtomicCounter(b, rid, uint64(i), 0) + counterA := counters.NewAtomicCounter(a, rid, uint64(i), 100*time.Millisecond) + counterB := counters.NewAtomicCounter(b, rid, uint64(i), 0) // first increment res, err := counterA.Increment(ctx, 1) assert.NoError(t, err) assert.EqualValues(t, 1, res, fmt.Sprintf("iteration %d", i)) - - syncData(a, b) + testutils.SyncData(a, b) // increment from another replica res, err = counterB.Increment(ctx, 1) assert.NoError(t, err) assert.EqualValues(t, 2, res, fmt.Sprintf("iteration %d", i)) - - syncData(a, b) + testutils.SyncData(a, b) // this increment does not account data from other replica because current value is cached res, err = counterA.Increment(ctx, 1) diff --git a/examples/plain_object_test.go b/examples/plain_object_test.go index 5a5dd35..53f3f1d 100644 --- a/examples/plain_object_test.go +++ b/examples/plain_object_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/drpcorg/chotki" + "github.com/drpcorg/chotki/classes" "github.com/drpcorg/chotki/rdx" "github.com/stretchr/testify/assert" ) @@ -19,9 +20,9 @@ func TestPlainObjectORM(t *testing.T) { orma := a.ObjectMapper() tid, err := a.NewClass(context.Background(), rdx.ID0, - chotki.Field{Name: "Name", RdxType: rdx.String}, - chotki.Field{Name: "Group", RdxType: rdx.Reference}, - chotki.Field{Name: "Score", RdxType: rdx.Natural}, + classes.Field{Name: "Name", RdxType: rdx.String}, + 
classes.Field{Name: "Group", RdxType: rdx.Reference}, + classes.Field{Name: "Score", RdxType: rdx.Natural}, ) assert.Nil(t, err) sidorov := Student{ diff --git a/host/helpers.go b/host/helpers.go new file mode 100644 index 0000000..d96cf20 --- /dev/null +++ b/host/helpers.go @@ -0,0 +1,53 @@ +package host + +import ( + "encoding/binary" + + "github.com/drpcorg/chotki/rdx" +) + +const SyncBlockBits = 28 +const SyncBlockMask = uint64((1 << SyncBlockBits) - 1) + +func OKey(id rdx.ID, rdt byte) (key []byte) { + var ret = [18]byte{'O'} + key = binary.BigEndian.AppendUint64(ret[:1], id.Src()) + key = binary.BigEndian.AppendUint64(key, id.Pro()) + key = append(key, rdt) + return +} + +const LidLKeyLen = 1 + 16 + 1 + +func OKeyIdRdt(key []byte) (id rdx.ID, rdt byte) { + if len(key) != LidLKeyLen { + return rdx.BadId, 0 + } + + id = rdx.IDFromBytes(key[1 : LidLKeyLen-1]) + rdt = key[LidLKeyLen-1] + return +} + +var VKey0 = []byte{'V', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'V'} + +func VKey(id rdx.ID) (key []byte) { + var ret = [18]byte{'V'} + block := id.ProOr(SyncBlockMask) + key = binary.BigEndian.AppendUint64(ret[:1], block.Src()) + key = binary.BigEndian.AppendUint64(key, block.Pro()) + key = append(key, 'V') + return +} + +func VKeyId(key []byte) rdx.ID { + if len(key) != LidLKeyLen { + return rdx.BadId + } + return rdx.IDFromBytes(key[1:]).ProAnd(^SyncBlockMask) +} + +func ObjectKeyRange(oid rdx.ID) (fro, til []byte) { + oid = oid.ZeroOff() + return OKey(oid, 'O'), OKey(oid.IncPro(1), 0) +} diff --git a/host/host.go b/host/host.go new file mode 100644 index 0000000..984f4e3 --- /dev/null +++ b/host/host.go @@ -0,0 +1,26 @@ +package host + +import ( + "context" + + "github.com/cockroachdb/pebble" + "github.com/drpcorg/chotki/classes" + "github.com/drpcorg/chotki/protocol" + "github.com/drpcorg/chotki/rdx" + "github.com/drpcorg/chotki/utils" +) + +type Host interface { + ClassFields(cid rdx.ID) (fields classes.Fields, err error) + GetFieldTLV(id rdx.ID) (rdt 
byte, tlv []byte) + Logger() utils.Logger + Last() rdx.ID + Source() uint64 + WriteOptions() *pebble.WriteOptions + Database() *pebble.DB + ObjectFieldTLV(fid rdx.ID) (rdt byte, tlv []byte, err error) + CommitPacket(ctx context.Context, lit byte, ref rdx.ID, body protocol.Records) (id rdx.ID, err error) + Broadcast(ctx context.Context, records protocol.Records, except string) + Drain(ctx context.Context, recs protocol.Records) (err error) + Snapshot() pebble.Reader +} diff --git a/indexes/doc.go b/indexes/doc.go new file mode 100644 index 0000000..dbfadca --- /dev/null +++ b/indexes/doc.go @@ -0,0 +1,118 @@ +// Package indexes provides the index management subsystem for Chotki. +// +// # Overview +// +// IndexManager keeps two kinds of indexes for class data: +// +// 1. Fullscan index (implicit per class) +// A chronological list of all object IDs that belong to a class. It is +// optimized for scanning a whole class. No range or value queries; O(n). +// +// 2. Hashtable index (opt-in per field) +// A hash from a field value to the object ID that holds it. Lookups are +// O(1) on average. Only one object per class+field value is allowed; +// inserting a duplicate returns ErrHashIndexUinqueConstraintViolation. +// +// # Key layout in Pebble +// +// All keys start with 'I'. +// +// - Fullscan index: "IF" + class_id + object_id + 'T' -> empty value +// +// - Hashtable index: "IH" + class_id + field_id(u32, BE) + hash(u64, BE) + +// 'E' -> TLV-encoded set of object RIDs. We still enforce uniqueness, so at +// most one RID remains for a value. +// +// - Reindex tasks: "IT" + class_id + 'M' -> TLV-encoded map +// Keys are field indices (u32). Values store task state and last update. +// +// # Integration with writes +// +// IndexManager runs on the write path. Index updates are written in the same +// Pebble batch as the object change. A write either commits object+index +// together or not at all. This keeps data and index consistent, even on +// crashes. 
Changes can arrive in two ways: +// +// Realtime synchronization (live) +// +// - Object create/update events arrive in order on an already +// initialized replica. +// - AddFullScanIndex appends a class membership entry for a new object. +// - OnFieldUpdate checks if a field is indexed. If yes, it hashes the FIRST +// payload and merges the mapping into the hashtable index. Both changes are +// in the same batch, so the index matches the object data. +// +// Diff synchronization (bootstrap/catch-up) +// +// - During bootstrap or a large diff, classes and objects may arrive in the +// same window. +// - Early in sync, ClassFields may not be readable yet. Then OnFieldUpdate +// cannot tell if a field is indexed, so it cannot update the hashtable +// safely. +// - In that case we enqueue a reindex task for class+field (in the same +// batch as the object write) and skip the hashtable write. Fullscan entries +// are still added for new objects. +// - After sync completes, CheckReindexTasks runs the task. It scans objects +// via fullscan and rebuilds the hashtable. Indexes catch up without ever +// writing partial entries. +// +// # Consistency guarantee +// +// - Realtime: object data and indexes commit in one batch. +// - Diff sync: if inline indexing is not possible, we only enqueue a reindex +// task (in the same batch) and defer index writes. The background reindex +// rebuilds from a snapshot. +// +// The index never contradicts committed object data. +// +// # Query helpers +// +// - SeekClass iterates object IDs that belong to a class using the fullscan +// index. +// +// - GetByHash resolves a class+field+value (FIRST payload) to the object ID +// using the hashtable index. A small in-memory LRU cache accelerates +// repeat lookups. +// +// # Reindexing lifecycle +// +// Index definitions are part of class definitions. 
When a class is created or +// updated (e.g., an index is added or removed for a field), HandleClassUpdate +// emits reindex tasks that are persisted under the task key. A background +// scanner (CheckReindexTasks) monitors tasks and runs them via runReindexTask. +// +// Task states are stored as bytes and surfaced via Prometheus metrics. The +// lifecycle is: +// +// - Pending: task is scheduled and will be picked up +// - InProgress: task is running +// - Done: task finished successfully +// - Remove: index was deleted; task stays in this state and is ignored +// +// A reindex pass operates on a consistent snapshot and performs two phases: +// +// 1. Repair missing entries: for every object in the class (via fullscan), +// compute the hash for the field and ensure the corresponding hashtable +// entry exists. +// +// 2. Remove stale entries: scan the index keys for the class+field and +// drop entries that point to non-existent objects, to non-FIRST values, or +// to values whose hash no longer matches the object field. +// +// When an index is removed from a field definition, the manager deletes the +// corresponding IH range and then sets the task to Remove. We keep the task +// (do not delete it) and simply ignore it later. "Done" tasks may be +// periodically rescheduled for self-healing; "Remove" tasks are not. +// +// # Caching and concurrency +// +// The manager keeps small caches: +// - classCache: object_id -> class_id +// - hashIndexCache: (class_id, field_id, value) -> object_id +// +// Writes to the same class field index are serialized with a per-field mutex. +// +// # Metrics +// +// Prometheus metrics report task counts, states, durations, and results. 
+package indexes diff --git a/index_manager.go b/indexes/index_manager.go similarity index 78% rename from index_manager.go rename to indexes/index_manager.go index f4b53b9..f9a64d3 100644 --- a/index_manager.go +++ b/indexes/index_manager.go @@ -1,4 +1,4 @@ -package chotki +package indexes import ( "bytes" @@ -13,13 +13,14 @@ import ( "github.com/cespare/xxhash" "github.com/cockroachdb/pebble" + "github.com/drpcorg/chotki/chotki_errors" + "github.com/drpcorg/chotki/classes" + "github.com/drpcorg/chotki/host" "github.com/drpcorg/chotki/rdx" lru "github.com/hashicorp/golang-lru/v2" "github.com/prometheus/client_golang/prometheus" ) -type IndexType byte - var ReindexTaskCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "chotki", Subsystem: "index_manager", @@ -51,11 +52,6 @@ var ReindexDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Buckets: []float64{0, 1, 5, 10, 20, 50, 100, 200, 500}, }, []string{"class", "field"}) -const ( - HashIndex IndexType = 'H' - FullscanIndex IndexType = 'F' -) - type reindexTaskState byte const ( @@ -66,7 +62,7 @@ const ( ) type IndexManager struct { - c *Chotki + c host.Host tasksCancels map[string]context.CancelFunc taskEntries sync.Map mutexMap sync.Map @@ -74,7 +70,7 @@ type IndexManager struct { hashIndexCache *lru.Cache[string, rdx.ID] } -func newIndexManager(c *Chotki) *IndexManager { +func NewIndexManager(c host.Host) *IndexManager { cache, _ := lru.New[rdx.ID, rdx.ID](10000) hashCache, _ := lru.New[string, rdx.ID](100000) return &IndexManager{ @@ -108,7 +104,7 @@ func hashKey(cid rdx.ID, fid uint32, hash uint64) []byte { return key } -type reindexTask struct { +type ReindexTask struct { State reindexTaskState LastUpdate time.Time Cid rdx.ID @@ -117,15 +113,15 @@ type reindexTask struct { Src uint64 } -func (t *reindexTask) Key() []byte { +func (t *ReindexTask) Key() []byte { return append(append([]byte{'I', 'T'}, t.Cid.Bytes()...), 'M') } -func (t *reindexTask) Id() string { +func (t 
*ReindexTask) Id() string { return fmt.Sprintf("%s:%d", t.Cid.String(), t.Field) } -func (t *reindexTask) Value() []byte { +func (t *ReindexTask) Value() []byte { mp := rdx.NewStampedMap[rdx.RdxInt, rdx.RdxString]() data := []byte{byte(t.State)} extime := uint64(t.LastUpdate.Unix()) @@ -134,19 +130,19 @@ func (t *reindexTask) Value() []byte { return mp.Tlv() } -func parseReindexTasks(key, value []byte) ([]reindexTask, error) { +func parseReindexTasks(key, value []byte) ([]ReindexTask, error) { cid := rdx.IDFromBytes(key[2:18]) mp := rdx.NewStampedMap[rdx.RdxInt, rdx.RdxString]() err := mp.Native(value) if err != nil { return nil, err } - tasks := []reindexTask{} + tasks := []ReindexTask{} for k, v := range mp.Map { state := reindexTaskState(v.Value[0]) extime := int64(binary.BigEndian.Uint64([]byte(v.Value[1:]))) updatetime := time.Unix(extime, 0) - tasks = append(tasks, reindexTask{ + tasks = append(tasks, ReindexTask{ State: state, Revision: v.Time.Rev, Src: v.Time.Src, @@ -158,12 +154,12 @@ func parseReindexTasks(key, value []byte) ([]reindexTask, error) { return tasks, nil } -func (im *IndexManager) addFullScanIndex(cid rdx.ID, oid rdx.ID, batch *pebble.Batch) error { +func (im *IndexManager) AddFullScanIndex(cid rdx.ID, oid rdx.ID, batch *pebble.Batch) error { im.classCache.Add(oid, cid) return batch.Merge( fullScanKey(cid, oid), []byte{}, - im.c.opts.PebbleWriteOptions, + im.c.WriteOptions(), ) } @@ -180,7 +176,7 @@ func (im *IndexManager) GetByHash(cid rdx.ID, fid uint32, otlv []byte, reader pe defer closer.Close() } if err == pebble.ErrNotFound { - return rdx.BadId, ErrObjectUnknown + return rdx.BadId, chotki_errors.ErrObjectUnknown } if err != nil { return rdx.BadId, err @@ -198,7 +194,7 @@ func (im *IndexManager) GetByHash(cid rdx.ID, fid uint32, otlv []byte, reader pe return rdx.ID(id), nil } } - return rdx.BadId, ErrObjectUnknown + return rdx.BadId, chotki_errors.ErrObjectUnknown } func (im *IndexManager) SeekClass(cid rdx.ID, reader pebble.Reader) 
iter.Seq[rdx.ID] { @@ -217,23 +213,23 @@ func (im *IndexManager) SeekClass(cid rdx.ID, reader pebble.Reader) iter.Seq[rdx } } -func (im *IndexManager) HandleClassUpdate(id rdx.ID, cid rdx.ID, newFieldsBody []byte) ([]reindexTask, error) { - tasks := []reindexTask{} +func (im *IndexManager) HandleClassUpdate(id rdx.ID, cid rdx.ID, newFieldsBody []byte) ([]ReindexTask, error) { + tasks := []ReindexTask{} - newFields := parseClass(newFieldsBody) + newFields := classes.ParseClass(newFieldsBody) for i, newField := range newFields { - if newField.Index == HashIndex { + if newField.Index == classes.HashIndex { oldFields, err := im.c.ClassFields(cid) - if err == ErrTypeUnknown { + if err == chotki_errors.ErrTypeUnknown { // new class, everything is new, create task ReindexTaskCount.WithLabelValues(id.String(), fmt.Sprintf("%d", i), "new_class_same_batch").Inc() - task := &reindexTask{ + task := &ReindexTask{ State: reindexTaskStatePending, Cid: id, Field: uint32(i), - Revision: int64(im.c.last.Pro()), - Src: im.c.src, + Revision: int64(im.c.Last().Pro()), + Src: im.c.Source(), LastUpdate: time.Now(), } tasks = append(tasks, *task) @@ -246,26 +242,26 @@ func (im *IndexManager) HandleClassUpdate(id rdx.ID, cid rdx.ID, newFieldsBody [ // field just created with index, no need to reindex continue } - if oldFields[oldField].Index != HashIndex && newField.Index == HashIndex { + if oldFields[oldField].Index != classes.HashIndex && newField.Index == classes.HashIndex { ReindexTaskCount.WithLabelValues(id.String(), fmt.Sprintf("%d", i), "created_new_index").Inc() - task := &reindexTask{ + task := &ReindexTask{ State: reindexTaskStatePending, Cid: cid, Field: uint32(oldField), - Revision: int64(im.c.last.Pro()), - Src: im.c.src, + Revision: int64(im.c.Last().Pro()), + Src: im.c.Source(), LastUpdate: time.Now(), } tasks = append(tasks, *task) } - if oldFields[oldField].Index == HashIndex && newField.Index != HashIndex { + if oldFields[oldField].Index == classes.HashIndex && 
newField.Index != classes.HashIndex { ReindexTaskCount.WithLabelValues(id.String(), fmt.Sprintf("%d", i), "deleted_index").Inc() - task := &reindexTask{ + task := &ReindexTask{ State: reindexTaskStatePending, Cid: cid, Field: uint32(oldField), - Revision: int64(im.c.last.Pro()), - Src: im.c.src, + Revision: int64(im.c.Last().Pro()), + Src: im.c.Source(), LastUpdate: time.Now(), } tasks = append(tasks, *task) @@ -278,7 +274,7 @@ func (im *IndexManager) HandleClassUpdate(id rdx.ID, cid rdx.ID, newFieldsBody [ func (im *IndexManager) CheckReindexTasks(ctx context.Context) { cycle := func() { - iter := im.c.db.NewIter(&pebble.IterOptions{ + iter := im.c.Database().NewIter(&pebble.IterOptions{ LowerBound: []byte{'I', 'T'}, UpperBound: []byte{'I', 'U'}, }) @@ -286,7 +282,7 @@ func (im *IndexManager) CheckReindexTasks(ctx context.Context) { for valid := iter.First(); valid; valid = iter.Next() { tasks, err := parseReindexTasks(iter.Key(), iter.Value()) if err != nil { - im.c.log.ErrorCtx(ctx, "failed to parse reindex tasks: %s", err) + im.c.Logger().ErrorCtx(ctx, "failed to parse reindex tasks: %s", err) continue } for _, task := range tasks { @@ -337,7 +333,7 @@ func (im *IndexManager) CheckReindexTasks(ctx context.Context) { task.Revision++ task.LastUpdate = time.Now() ReindexTaskCount.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "scheduled_reindex").Inc() - im.c.db.Merge(task.Key(), task.Value(), im.c.opts.PebbleWriteOptions) + im.c.Database().Merge(task.Key(), task.Value(), im.c.WriteOptions()) } } } @@ -357,14 +353,14 @@ func (im *IndexManager) addHashIndex(cid rdx.ID, fid rdx.ID, tlv []byte, batch p mt.Unlock() im.mutexMap.Delete(fid) }() - id, err := im.GetByHash(cid, uint32(fid.Off()), tlv, im.c.db) + id, err := im.GetByHash(cid, uint32(fid.Off()), tlv, im.c.Database()) switch err { case nil: if id != fid.ZeroOff() { - return errors.Join(ErrHashIndexUinqueConstraintViolation, fmt.Errorf("key %s, current id %s, new id %s", string(tlv), 
id.String(), fid.ZeroOff().String())) + return errors.Join(chotki_errors.ErrHashIndexUinqueConstraintViolation, fmt.Errorf("key %s, current id %s, new id %s", string(tlv), id.String(), fid.ZeroOff().String())) } fallthrough - case ErrObjectUnknown: + case chotki_errors.ErrObjectUnknown: cacheKey := append(binary.BigEndian.AppendUint32(cid.Bytes(), uint32(fid.Off())), tlv...) im.hashIndexCache.Remove(string(cacheKey)) hash := xxhash.Sum64(tlv) @@ -374,7 +370,7 @@ func (im *IndexManager) addHashIndex(cid rdx.ID, fid rdx.ID, tlv []byte, batch p return batch.Merge( key, set.Tlv(), - im.c.opts.PebbleWriteOptions, + im.c.WriteOptions(), ) default: return err @@ -402,28 +398,28 @@ func (im *IndexManager) OnFieldUpdate(rdt byte, fid, cid rdx.ID, tlv []byte, bat } fields, err := im.c.ClassFields(cid) if err != nil { - task := &reindexTask{ + task := &ReindexTask{ State: reindexTaskStatePending, Cid: cid, Field: uint32(fid.Off()), - Revision: int64(im.c.last.Pro()), - Src: im.c.src, + Revision: int64(im.c.Last().Pro()), + Src: im.c.Source(), LastUpdate: time.Now(), } - return batch.Merge(task.Key(), task.Value(), im.c.opts.PebbleWriteOptions) + return batch.Merge(task.Key(), task.Value(), im.c.WriteOptions()) } if int(fid.Off()) >= len(fields) { return nil } field := fields[fid.Off()] - if field.Index == HashIndex { + if field.Index == classes.HashIndex { _, _, tlv := rdx.ParseFIRST(tlv) return im.addHashIndex(cid, fid, tlv, batch) } return nil } -func (im *IndexManager) runReindexTask(ctx context.Context, task *reindexTask) { +func (im *IndexManager) runReindexTask(ctx context.Context, task *ReindexTask) { start := time.Now() ReindexCount.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field)).Inc() defer im.taskEntries.CompareAndDelete(task.Id(), task.Revision) @@ -431,45 +427,45 @@ func (im *IndexManager) runReindexTask(ctx context.Context, task *reindexTask) { task.State = reindexTaskStateInProgress task.LastUpdate = time.Now() task.Revision++ - ctx = 
im.c.log.WithDefaultArgs(ctx, "cid", task.Cid.String(), "field", fmt.Sprintf("%d", task.Field), "process", "reindex") - err := im.c.db.Merge(task.Key(), task.Value(), im.c.opts.PebbleWriteOptions) + ctx = im.c.Logger().WithDefaultArgs(ctx, "cid", task.Cid.String(), "field", fmt.Sprintf("%d", task.Field), "process", "reindex") + err := im.c.Database().Merge(task.Key(), task.Value(), im.c.WriteOptions()) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_set_in_progress").Inc() - im.c.log.ErrorCtx(ctx, "failed to set reindex task to in progress: %s, restarting", err) + im.c.Logger().ErrorCtx(ctx, "failed to set reindex task to in progress: %s, restarting", err) return } fields, err := im.c.ClassFields(task.Cid) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_get_class_fields").Inc() - im.c.log.ErrorCtx(ctx, "failed to get class fields: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to get class fields: %s, will restart", err) return } if int(task.Field) >= len(fields) { - im.c.log.ErrorCtx(ctx, "field out of range, will restart", "field", task.Field, "class", task.Cid.String(), "fields", fields) + im.c.Logger().ErrorCtx(ctx, "field out of range, will restart", "field", task.Field, "class", task.Cid.String(), "fields", fields) return } field := fields[task.Field] if field.Index == 0 { - err := im.c.db.DeleteRange( + err := im.c.Database().DeleteRange( hashKey(task.Cid, uint32(task.Field), 0), hashKey(task.Cid, uint32(task.Field), math.MaxUint64), - im.c.opts.PebbleWriteOptions, + im.c.WriteOptions(), ) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_delete_hash_index").Inc() - im.c.log.ErrorCtx(ctx, "failed to delete hash index: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to delete hash index: %s, will restart", err) return } task.State = 
reindexTaskStateRemove task.Revision++ task.LastUpdate = time.Now() - err = im.c.db.Merge(task.Key(), task.Value(), im.c.opts.PebbleWriteOptions) + err = im.c.Database().Merge(task.Key(), task.Value(), im.c.WriteOptions()) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_save_done_task").Inc() - im.c.log.ErrorCtx(ctx, "failed to save done task: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to save done task: %s, will restart", err) return } ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "success", "deleted_hash_index").Inc() @@ -477,7 +473,7 @@ func (im *IndexManager) runReindexTask(ctx context.Context, task *reindexTask) { } // check data in snapshot, because we don't need to index new objects - snap := im.c.db.NewSnapshot() + snap := im.c.Database().NewSnapshot() defer snap.Close() // repair index missing objects for id := range im.SeekClass(task.Cid, snap) { @@ -488,32 +484,32 @@ func (im *IndexManager) runReindexTask(ctx context.Context, task *reindexTask) { rdt, tlv, err := im.c.ObjectFieldTLV(fid) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_get_object_field_tlv").Inc() - im.c.log.ErrorCtx(ctx, "failed to get object field tlv: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to get object field tlv: %s, will restart", err) return } if !rdx.IsFirst(rdt) { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "object_field_is_not_first").Inc() - im.c.log.ErrorCtx(ctx, "object field is not first, skipping") + im.c.Logger().ErrorCtx(ctx, "object field is not first, skipping") continue } // unpack FIRST _, _, tlv = rdx.ParseFIRST(tlv) - _, err = im.GetByHash(task.Cid, uint32(fid.Off()), tlv, im.c.db) - if err == ErrObjectUnknown { - err = im.addHashIndex(task.Cid, fid, tlv, im.c.db) + _, err = im.GetByHash(task.Cid, 
uint32(fid.Off()), tlv, im.c.Database()) + if err == chotki_errors.ErrObjectUnknown { + err = im.addHashIndex(task.Cid, fid, tlv, im.c.Database()) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_add_hash_index").Inc() - im.c.log.ErrorCtx(ctx, "failed to add hash index: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to add hash index: %s, will restart", err) return } } else if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_get_object_by_hash").Inc() - im.c.log.ErrorCtx(ctx, "failed to get object by hash: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to get object by hash: %s, will restart", err) return } } // repair index entries that are no longer needed - indexIter := im.c.db.NewIter(&pebble.IterOptions{ + indexIter := im.c.Database().NewIter(&pebble.IterOptions{ LowerBound: hashKey(task.Cid, uint32(task.Field), 0), UpperBound: hashKey(task.Cid, uint32(task.Field), math.MaxUint64), }) @@ -523,18 +519,18 @@ func (im *IndexManager) runReindexTask(ctx context.Context, task *reindexTask) { err := set.Native(indexIter.Value()) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_parse_index_set").Inc() - im.c.log.ErrorCtx(ctx, "failed to parse index set: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to parse index set: %s, will restart", err) return } for id := range set.Value { rdt, tlv := im.c.GetFieldTLV(rdx.ID(id).ToOff(uint64(task.Field))) // index pointing nowhere, delete if tlv == nil { - im.c.db.Delete(indexIter.Key(), im.c.opts.PebbleWriteOptions) + im.c.Database().Delete(indexIter.Key(), im.c.WriteOptions()) } else { // likely not possible, but delete if !rdx.IsFirst(rdt) { - im.c.db.Delete(indexIter.Key(), im.c.opts.PebbleWriteOptions) + im.c.Database().Delete(indexIter.Key(), im.c.WriteOptions()) continue } _, _, btlv := 
rdx.ParseFIRST(tlv) @@ -542,7 +538,7 @@ func (im *IndexManager) runReindexTask(ctx context.Context, task *reindexTask) { indexHash := binary.BigEndian.Uint64(indexIter.Key()[len(indexIter.Key())-9 : len(indexIter.Key())]) if hash != indexHash { // the indexed value has changed, delete - im.c.db.Delete(indexIter.Key(), im.c.opts.PebbleWriteOptions) + im.c.Database().Delete(indexIter.Key(), im.c.WriteOptions()) } } } @@ -550,10 +546,10 @@ func (im *IndexManager) runReindexTask(ctx context.Context, task *reindexTask) { task.State = reindexTaskStateDone task.LastUpdate = time.Now() task.Revision++ - err = im.c.db.Merge(task.Key(), task.Value(), im.c.opts.PebbleWriteOptions) + err = im.c.Database().Merge(task.Key(), task.Value(), im.c.WriteOptions()) if err != nil { ReindexResults.WithLabelValues(task.Cid.String(), fmt.Sprintf("%d", task.Field), "error", "fail_to_save_done_task").Inc() - im.c.log.ErrorCtx(ctx, "failed to save reindex task: %s, will restart", err) + im.c.Logger().ErrorCtx(ctx, "failed to save reindex task: %s, will restart", err) return } if ctx.Err() == nil { diff --git a/index_manager_test.go b/indexes/index_manager_test.go similarity index 94% rename from index_manager_test.go rename to indexes/index_manager_test.go index e8c0055..efd3144 100644 --- a/index_manager_test.go +++ b/indexes/index_manager_test.go @@ -1,4 +1,4 @@ -package chotki +package indexes import ( "testing" @@ -10,7 +10,7 @@ import ( func TestIndexManager_ParseIndexTasks(t *testing.T) { time := time.Now() - task := &reindexTask{ + task := &ReindexTask{ State: reindexTaskStateDone, Field: 21, LastUpdate: time, diff --git a/network/net.go b/network/net.go new file mode 100644 index 0000000..8276dc5 --- /dev/null +++ b/network/net.go @@ -0,0 +1,602 @@ +// Package network provides a high-performance TCP/TLS server and client implementation +// for real-time asynchronous communication. 
This package is designed for continuous +// bidirectional message streaming with high throughput and low latency, unlike +// traditional request-response patterns (like HTTP). +// +// ARCHITECTURE OVERVIEW +// ==================== +// +// The network package uses a callback-based architecture where you provide a Protocol +// Handler that already knows how to process data. The Net layer provides the transport +// mechanism, while your Protocol Handler handles the actual data processing and +// business logic. +// +// Key Components: +// - Net: Manages connections, listeners, and network transport +// - Peer: Handles individual connection lifecycle and data buffering +// - Protocol Handler: Your application logic for data processing (Feed/Drain) +// +// Key Features: +// - Support for TCP and TLS protocols +// - Automatic connection management with exponential backoff retry logic +// - Configurable buffer sizes and processing thresholds +// - Thread-safe concurrent operations with goroutines +// - Graceful connection handling and resource cleanup +// - Bidirectional data streaming with buffering and batching +// +// CONNECTION ESTABLISHMENT FLOW +// ============================= +// +// The network package uses a callback-based architecture where you provide a Protocol Handler +// that already knows how to process data. When you create a Net instance with NewNet(), +// you pass an install callback that returns a FeedDrainCloserTraced interface. This +// Protocol Handler is the core component that: +// - Implements Feed() for outgoing data (application → network) +// - Implements Drain() for incoming data (network → application) +// - Handles protocol parsing and message processing +// - Contains the business logic for data handling +// +// When connections are established, the Net layer calls your install callback to get +// a Protocol Handler instance for each connection, and the Peer uses this handler +// for all data processing through the Feed()/Drain() methods. 
+// +// When you call Connect("tcp://localhost:8080"), here's what happens: +// +// 1. Connect() calls ConnectPool() with a single address +// - This creates a connection pool entry with the name "tcp://localhost:8080" +// - The entry is initially set to nil to prevent duplicate connections +// +// 2. ConnectPool() spawns a goroutine running KeepConnecting() +// - This goroutine runs continuously until the network is closed +// - It implements the retry logic with exponential backoff +// +// 3. KeepConnecting() attempts to establish the connection: +// - Calls createConn() to create a TCP/TLS connection +// - If connection fails, it waits with exponential backoff (0.5s → 1s → 2s → ... → 60s max) +// - If connection succeeds, it calls keepPeer() to manage the connection +// +// 4. keepPeer() creates a new Peer instance: +// - Calls the install callback to get a protocol handler +// - Creates a Peer with the connection and configuration +// - Stores the Peer in the connections map +// - Calls Peer.Keep() to start read/write loops +// +// 5. Peer.Keep() runs two goroutines: +// - keepRead(): Continuously reads from the socket, buffers data, and calls protocol.Drain() +// - keepWrite(): Continuously calls protocol.Feed() and writes to the socket +// +// 6. If the connection fails or is closed: +// - The Peer is removed from the connections map +// - The destroy callback is called +// - KeepConnecting() continues and will retry the connection +// +// READ BUFFERING STRATEGY +// ======================= +// +// The read buffering system accumulates data from the network socket until +// specific thresholds are met. This batching is crucial because the larger +// the batch, the fewer resources are consumed by the Protocol Handler +// (the entity passed during Net creation) for saving and processing data. 
+// +// Buffering Thresholds: +// - bufferMinToProcess: Minimum data to accumulate before processing +// - bufferMaxSize: Maximum buffer size to prevent memory exhaustion +// - readAccumTimeLimit: Maximum time to wait for more data (default: 5s) +// +// Processing Triggers: +// Buffer is processed when ANY condition is met: +// 1. Buffer size reaches bufferMinToProcess (efficiency trigger) +// 2. Buffer size reaches bufferMaxSize (memory protection trigger) +// 3. Time since last read exceeds readAccumTimeLimit (latency trigger) +// +// Key Benefits: +// - Larger batches reduce CPU overhead for Protocol Handler operations +// - Fewer calls to Protocol.Drain() means better performance +// - Balances latency (small batches) vs efficiency (large batches) +// - Concurrent processing prevents blocking network reads +// +// Usage Example: +// +// // Create a new network instance with your protocol handler +// net := NewNet(logger, installCallback, destroyCallback, +// &NetTlsConfigOpt{Config: tlsConfig}, +// &NetWriteTimeoutOpt{Timeout: 30 * time.Second}, +// ) +// +// // Start listening for incoming connections +// err := net.Listen("tcp://:8080") +// +// // Connect to a remote peer +// err = net.Connect("tcp://localhost:8080") +// +// // Clean up when done +// defer net.Close() +package network + +import ( + "context" + "crypto/tls" + "errors" + "fmt" + "net" + "net/url" + "strings" + "sync" + "time" + + "github.com/drpcorg/chotki/protocol" + "github.com/drpcorg/chotki/utils" + "github.com/google/uuid" + "github.com/puzpuzpuz/xsync/v3" +) + +// ConnType represents the type of network connection +type ConnType = uint + +var ( + // ErrAddressInvalid is returned when the provided address format is invalid + ErrAddressInvalid = errors.New("the address invalid") + // ErrAddressDuplicated is returned when attempting to use an address that's already in use + ErrAddressDuplicated = errors.New("the address already used") + // ErrAddressUnknown is returned when trying to 
disconnect from an unknown address + ErrAddressUnknown = errors.New("address unknown") + // ErrDisconnected is returned when a connection is closed by the user + ErrDisconnected = errors.New("disconnected by user") +) + +const ( + TCP ConnType = iota + 1 + TLS + QUIC +) + +const ( + // TYPICAL_MTU is the typical Maximum Transmission Unit size + TYPICAL_MTU = 1500 + // MAX_OUT_QUEUE_LEN is the maximum length of the output queue (16MB of pointers) + MAX_OUT_QUEUE_LEN = 1 << 20 + + // MAX_RETRY_PERIOD is the maximum time to wait between connection retry attempts + MAX_RETRY_PERIOD = time.Minute + // MIN_RETRY_PERIOD is the minimum time to wait between connection retry attempts + MIN_RETRY_PERIOD = time.Second / 2 +) + +type InstallCallback func(name string) protocol.FeedDrainCloserTraced +type DestroyCallback func(name string, p protocol.Traced) + +// Net provides a TCP/TLS/QUIC server/client for real-time async communication. +// Unlike request-response patterns (like HTTP), this implementation constantly +// sends many tiny messages without waiting for responses. This requires different +// work patterns than typical HTTP/RPC servers, as one slow receiver cannot delay +// event transmission to all other receivers. 
+type Net struct { + wg sync.WaitGroup + log utils.Logger + onInstall InstallCallback + onDestroy DestroyCallback + + conns *xsync.MapOf[string, *Peer] + listens *xsync.MapOf[string, net.Listener] + ctx context.Context + cancelCtx context.CancelFunc + + tlsConfig *tls.Config + readBufferTcpSize int + writeBufferTcpSize int + readAccumTimeLimit time.Duration + writeTimeout time.Duration + bufferMaxSize int + bufferMinToProcess int +} + +type NetOpt interface { + Apply(*Net) +} + +type NetWriteTimeoutOpt struct { + Timeout time.Duration +} + +func (opt *NetWriteTimeoutOpt) Apply(n *Net) { + n.writeTimeout = opt.Timeout +} + +type NetTlsConfigOpt struct { + Config *tls.Config +} + +func (opt *NetTlsConfigOpt) Apply(n *Net) { + n.tlsConfig = opt.Config +} + +type NetReadBatchOpt struct { + ReadAccumTimeLimit time.Duration + BufferMaxSize int + BufferMinToProcess int +} + +func (opt *NetReadBatchOpt) Apply(n *Net) { + n.readAccumTimeLimit = opt.ReadAccumTimeLimit + n.bufferMaxSize = opt.BufferMaxSize + n.bufferMinToProcess = opt.BufferMinToProcess +} + +type TcpBufferSizeOpt struct { + Read int + Write int +} + +func (opt *TcpBufferSizeOpt) Apply(n *Net) { + n.readBufferTcpSize = opt.Read + n.writeBufferTcpSize = opt.Write +} + +// NewNet creates a new network instance with the specified logger and callbacks. +// Additional configuration can be provided through NetOpt parameters. 
+// +// Example: +// +// net := NewNet(logger, installCallback, destroyCallback, +// &NetTlsConfigOpt{Config: tlsConfig}, +// &NetWriteTimeoutOpt{Timeout: 30 * time.Second}, +// ) +func NewNet(log utils.Logger, install InstallCallback, destroy DestroyCallback, opts ...NetOpt) *Net { + ctx, cancel := context.WithCancel(context.Background()) + net := &Net{ + log: log, + cancelCtx: cancel, + ctx: ctx, + conns: xsync.NewMapOf[string, *Peer](), + listens: xsync.NewMapOf[string, net.Listener](), + onInstall: install, + onDestroy: destroy, + } + for _, o := range opts { + o.Apply(net) + } + return net +} + +type NetStats struct { + ReadBuffers map[string]int32 + WriteBatches map[string]int32 +} + +func (n *Net) GetStats() NetStats { + stats := NetStats{ + ReadBuffers: make(map[string]int32), + WriteBatches: make(map[string]int32), + } + n.conns.Range(func(name string, peer *Peer) bool { + if peer != nil { + stats.ReadBuffers[name] = peer.GetIncomingPacketBufferSize() + stats.WriteBatches[name] = int32(peer.writeBatchSize.Val()) + } + return true + }) + return stats +} + +func (n *Net) Close() error { + n.cancelCtx() + + n.listens.Range(func(_ string, v net.Listener) bool { + v.Close() + return true + }) + n.listens.Clear() + + n.conns.Range(func(_ string, p *Peer) bool { + // sometimes it can be nil when we started connecting, but haven't connected yet + if p != nil { + p.Close() + } + return true + }) + n.conns.Clear() + + n.wg.Wait() + return nil +} + +func (n *Net) Connect(addr string) (err error) { + return n.ConnectPool(addr, []string{addr}) +} + +// ConnectPool establishes connections to multiple addresses with automatic failover. +// The connection will attempt to connect to each address in the provided list, +// and will retry with exponential backoff if all addresses fail. +// +// The name parameter is used to identify this connection pool in logs and callbacks. 
+func (n *Net) ConnectPool(name string, addrs []string) (err error) { + // nil is needed so that Connect cannot be called + // while KeepConnecting is connects + if _, ok := n.conns.LoadOrStore(name, nil); ok { + return ErrAddressDuplicated + } + + n.wg.Add(1) + go func() { + n.KeepConnecting(fmt.Sprintf("connect:%s", name), addrs) + n.wg.Done() + }() + + return nil +} + +func (de *Net) Disconnect(name string) (err error) { + conn, ok := de.conns.LoadAndDelete(name) + if !ok { + return ErrAddressUnknown + } + + conn.Close() + return nil +} + +// Listen starts listening for incoming connections on the specified address. +// The address should be in the format "tcp://:port", "tls://:port", or "quic://:port". +// Returns ErrAddressDuplicated if already listening on this address. +func (n *Net) Listen(addr string) error { + // nil is needed so that Listen cannot be called + // while creating listener + if _, ok := n.listens.LoadOrStore(addr, nil); ok { + return ErrAddressDuplicated + } + + listener, err := n.createListener(addr) + if err != nil { + n.listens.Delete(addr) + return err + } + n.listens.Store(addr, listener) + + n.log.Info("net: listening", "addr", addr) + + n.wg.Add(1) + go func() { + n.KeepListening(addr) + n.wg.Done() + }() + + return nil +} + +func (de *Net) Unlisten(addr string) error { + listener, ok := de.listens.LoadAndDelete(addr) + if !ok { + return ErrAddressUnknown + } + + return listener.Close() +} + +// KeepConnecting continuously attempts to maintain a connection to the provided addresses. +// It implements exponential backoff retry logic and will attempt to connect to each +// address in the list until a successful connection is established. 
+func (n *Net) KeepConnecting(name string, addrs []string) { + connBackoff := MIN_RETRY_PERIOD + for n.ctx.Err() == nil { + var err error + var conn net.Conn + for _, addr := range addrs { + conn, err = n.createConn(addr) + if err == nil { + break + } + } + + if err != nil { + n.log.Error("net: couldn't connect", "name", name, "err", err) + + select { + case <-time.After(connBackoff): + case <-n.ctx.Done(): + break + } + connBackoff = min(MAX_RETRY_PERIOD, connBackoff*2) + + continue + } + n.setTCPBuffersSize(n.log.WithDefaultArgs(context.Background(), "name", name), conn) + n.log.Info("net: connected", "name", name) + + connBackoff = MIN_RETRY_PERIOD + n.keepPeer(name, conn) + } +} + +// setTCPBuffersSize configures TCP buffer sizes for the given connection. +// It handles both plain TCP connections and TLS-wrapped connections. +func (n *Net) setTCPBuffersSize(ctx context.Context, conn net.Conn) { + var tconn *net.TCPConn + switch res := conn.(type) { + case *tls.Conn: + nconn, ok := res.NetConn().(*net.TCPConn) + if !ok { + n.log.WarnCtx(ctx, "net: unable to set buffers, because tls conn is strange") + return + } + tconn = nconn + case *net.TCPConn: + tconn = res + default: + n.log.WarnCtx(ctx, "net: unable to set buffers, because unknown connection type") + return + } + if n.readBufferTcpSize > 0 { + tconn.SetReadBuffer(n.readBufferTcpSize) + } + if n.writeBufferTcpSize > 0 { + tconn.SetWriteBuffer(n.writeBufferTcpSize) + } +} + +// KeepListening continuously accepts incoming connections on the specified address. +// For each accepted connection, it spawns a goroutine to handle the peer communication. 
+func (n *Net) KeepListening(addr string) { + for n.ctx.Err() == nil { + listener, ok := n.listens.Load(addr) + if !ok { + break + } + + conn, err := listener.Accept() + if err != nil { + if errors.Is(err, net.ErrClosed) { + break + } + + // reconnects are the client's problem, just continue + n.log.Error("net: couldn't accept request", "addr", addr, "err", err) + continue + } + + remoteAddr := conn.RemoteAddr().String() + n.log.Info("net: accept connection", "addr", addr, "remoteAddr", remoteAddr) + n.setTCPBuffersSize(n.log.WithDefaultArgs(context.Background(), "addr", addr, "remoteAdds", remoteAddr), conn) + n.wg.Add(1) + go func() { + n.keepPeer(fmt.Sprintf("listen:%s:%s", uuid.Must(uuid.NewV7()).String(), remoteAddr), conn) + defer n.wg.Done() + }() + } + + if l, ok := n.listens.LoadAndDelete(addr); ok { + if err := l.Close(); err != nil && !errors.Is(err, net.ErrClosed) { + n.log.Error("net: couldn't correct close listener", "addr", addr, "err", err) + } + } + + n.log.Info("net: listener closed", "addr", addr) +} + +// keepPeer manages a single peer connection, handling read/write operations +// and cleanup when the connection is closed. 
+// +// ARCHITECTURE ROLE: +// - Core peer lifecycle management function +// - Creates and configures Peer instances with protocol handlers +// - Manages peer connection lifecycle and error handling +// - Integrates with the protocol layer via install/destroy callbacks +func (n *Net) keepPeer(name string, conn net.Conn) { + peer := &Peer{ + inout: n.onInstall(name), + conn: conn, + writeTimeout: n.writeTimeout, + readAccumtTimeLimit: n.readAccumTimeLimit, + bufferMaxSize: n.bufferMaxSize, + bufferMinToProcess: n.bufferMinToProcess, + writeBatchSize: &utils.AvgVal{}, + } + n.conns.Store(name, peer) + + readErr, writeErr, closeErr := peer.Keep(n.ctx) + if readErr != nil { + n.log.Error("net: couldn't read from peer", "name", name, "err", readErr, "trace_id", peer.GetTraceId()) + } + if writeErr != nil { + n.log.Error("net: couldn't write to peer", "name", name, "err", writeErr, "trace_id", peer.GetTraceId()) + } + if closeErr != nil { + n.log.Error("net: couldn't correct close peer", "name", name, "err", closeErr, "trace_id", peer.GetTraceId()) + } + + n.conns.Delete(name) + peer.Close() + n.onDestroy(name, peer) +} + +// createListener creates a network listener based on the address scheme. +// Supports TCP, TLS, and QUIC (unimplemented) protocols. +func (n *Net) createListener(addr string) (net.Listener, error) { + connType, address, err := parseAddr(addr) + if err != nil { + return nil, err + } + + var listener net.Listener + switch connType { + case TCP: + config := net.ListenConfig{} + if listener, err = config.Listen(n.ctx, "tcp", address); err != nil { + return nil, err + } + + case TLS: + config := net.ListenConfig{} + if listener, err = config.Listen(n.ctx, "tcp", address); err != nil { + return nil, err + } + + listener = tls.NewListener(listener, n.tlsConfig) + + case QUIC: + return nil, errors.New("QUIC unimplemented") + } + + return listener, nil +} + +// createConn creates a network connection based on the address scheme. 
+// Supports TCP, TLS, and QUIC (unimplemented) protocols. +func (n *Net) createConn(addr string) (net.Conn, error) { + connType, address, err := parseAddr(addr) + if err != nil { + return nil, err + } + + var conn net.Conn + switch connType { + case TCP: + d := net.Dialer{Timeout: time.Minute} + if conn, err = d.DialContext(n.ctx, "tcp", address); err != nil { + return nil, err + } + + case TLS: + d := tls.Dialer{Config: n.tlsConfig} + + if conn, err = d.DialContext(n.ctx, "tcp", address); err != nil { + return nil, err + } + + case QUIC: + return nil, errors.New("QUIC unimplemented") + } + + return conn, err +} + +// parseAddr parses a network address string and returns the connection type +// and address components. Supports URLs with schemes like "tcp://", "tls://", etc. +// +// Examples: +// - "tcp://localhost:8080" -> TCP, "localhost:8080" +// - "tls://example.com:443" -> TLS, "example.com:443" +// - "localhost:8080" -> TCP, "localhost:8080" +func parseAddr(addr string) (ConnType, string, error) { + u, err := url.Parse(addr) + if err != nil { + return TCP, "", err + } + + var conn ConnType + + switch u.Scheme { + case "", "tcp", "tcp4", "tcp6": + conn = TCP + case "tls": + conn = TLS + case "quic": + conn = QUIC + default: + return conn, addr, ErrAddressInvalid + } + + u.Scheme = "" + address := strings.TrimPrefix(u.String(), "//") + + return conn, address, nil +} diff --git a/protocol/net_test.go b/network/net_test.go similarity index 72% rename from protocol/net_test.go rename to network/net_test.go index 2584b70..f22aaa5 100644 --- a/protocol/net_test.go +++ b/network/net_test.go @@ -1,4 +1,4 @@ -package protocol +package network import ( "context" @@ -10,6 +10,7 @@ import ( "testing" "time" + "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/utils" "github.com/stretchr/testify/assert" ) @@ -77,44 +78,44 @@ func TestTCPDepot_Connect(t *testing.T) { log := utils.NewDefaultLogger(slog.LevelDebug) - lCon := utils.NewFDQueue[Records](1000, 
time.Minute, 1) - l := NewNet(log, func(_ string) FeedDrainCloserTraced { - return &TracedQueue[Records]{lCon} - }, func(_ string, t Traced) { lCon.Close() }, &NetTlsConfigOpt{tlsConfig("a.chotki.local")}, &NetWriteTimeoutOpt{Timeout: 1 * time.Minute}) + lCon := utils.NewFDQueue[protocol.Records](1000, time.Minute, 1) + l := NewNet(log, func(_ string) protocol.FeedDrainCloserTraced { + return &TracedQueue[protocol.Records]{lCon} + }, func(_ string, t protocol.Traced) { lCon.Close() }, &NetTlsConfigOpt{tlsConfig("a.chotki.local")}, &NetWriteTimeoutOpt{Timeout: 1 * time.Minute}) err := l.Listen(loop) assert.Nil(t, err) - cCon := utils.NewFDQueue[Records](1000, time.Minute, 1) - c := NewNet(log, func(_ string) FeedDrainCloserTraced { - return &TracedQueue[Records]{cCon} - }, func(_ string, t Traced) { cCon.Close() }, &NetTlsConfigOpt{tlsConfig("b.chotki.local")}, &NetWriteTimeoutOpt{Timeout: 1 * time.Minute}) + cCon := utils.NewFDQueue[protocol.Records](1000, time.Minute, 1) + c := NewNet(log, func(_ string) protocol.FeedDrainCloserTraced { + return &TracedQueue[protocol.Records]{cCon} + }, func(_ string, t protocol.Traced) { cCon.Close() }, &NetTlsConfigOpt{tlsConfig("b.chotki.local")}, &NetWriteTimeoutOpt{Timeout: 1 * time.Minute}) err = c.Connect(loop) assert.Nil(t, err) // send a record - err = cCon.Drain(context.Background(), Records{Record('M', []byte("Hi there"))}) + err = cCon.Drain(context.Background(), protocol.Records{protocol.Record('M', []byte("Hi there"))}) assert.Nil(t, err) rec, err := lCon.Feed(context.Background()) assert.Nil(t, err) assert.Greater(t, len(rec), 0) - lit, body, rest := TakeAny(rec[0]) + lit, body, rest := protocol.TakeAny(rec[0]) assert.Equal(t, uint8('M'), lit) assert.Equal(t, "Hi there", string(body)) assert.Equal(t, 0, len(rest)) // respond to that - err = lCon.Drain(context.Background(), Records{Record('M', []byte("Re: Hi there"))}) + err = lCon.Drain(context.Background(), protocol.Records{protocol.Record('M', []byte("Re: Hi 
there"))}) assert.NoError(t, err) rerec, err := cCon.Feed(context.Background()) assert.NoError(t, err) assert.Greater(t, len(rerec), 0) - relit, rebody, rerest := TakeAny(rerec[0]) + relit, rebody, rerest := protocol.TakeAny(rerec[0]) assert.Equal(t, uint8('M'), relit) assert.Equal(t, "Re: Hi there", string(rebody)) assert.Equal(t, 0, len(rerest)) @@ -132,10 +133,10 @@ func TestTCPDepot_ConnectFailed(t *testing.T) { log := utils.NewDefaultLogger(slog.LevelDebug) - cCon := utils.NewFDQueue[Records](16, time.Millisecond, 0) - c := NewNet(log, func(_ string) FeedDrainCloserTraced { - return &TracedQueue[Records]{cCon} - }, func(_ string, t Traced) { cCon.Close() }, &NetTlsConfigOpt{tlsConfig("b.chotki.local")}) + cCon := utils.NewFDQueue[protocol.Records](16, time.Millisecond, 0) + c := NewNet(log, func(_ string) protocol.FeedDrainCloserTraced { + return &TracedQueue[protocol.Records]{cCon} + }, func(_ string, t protocol.Traced) { cCon.Close() }, &NetTlsConfigOpt{tlsConfig("b.chotki.local")}) err := c.Connect(loop) assert.Nil(t, err) diff --git a/protocol/peer.go b/network/peer.go similarity index 66% rename from protocol/peer.go rename to network/peer.go index 3a225c6..5b25faa 100644 --- a/protocol/peer.go +++ b/network/peer.go @@ -1,4 +1,4 @@ -package protocol +package network import ( "bytes" @@ -12,16 +12,27 @@ import ( "sync/atomic" "time" + "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/utils" ) +// Peer represents a single network connection with bidirectional communication capabilities. +// It manages the lifecycle of a connection, handling read/write operations with +// configurable buffering, batching, and timeout settings. 
+// +// Key Features: +// - Concurrent read and write operations +// - Configurable buffer sizes and processing thresholds +// - Automatic batching of write operations +// - Graceful connection cleanup +// - Thread-safe state management type Peer struct { closed atomic.Bool wg sync.WaitGroup writeBatchSize *utils.AvgVal conn net.Conn - inout FeedDrainCloserTraced + inout protocol.FeedDrainCloserTraced incomingBuffer atomic.Int32 readAccumtTimeLimit time.Duration bufferMaxSize int @@ -29,6 +40,8 @@ type Peer struct { writeTimeout time.Duration } +// getReadTimeLimit returns the configured read time limit or a default value. +// This determines how long to wait for incoming data before processing the buffer. func (p *Peer) getReadTimeLimit() time.Duration { if p.readAccumtTimeLimit != 0 { return p.readAccumtTimeLimit @@ -36,11 +49,17 @@ func (p *Peer) getReadTimeLimit() time.Duration { return 5 * time.Second } +// keepRead continuously reads data from the network connection and processes it. 
+// It implements a buffered reading strategy with configurable thresholds: +// - Accumulates data until buffer size reaches bufferMinToProcess +// - Processes data when read time limit is exceeded +// - Handles incomplete protocol packets gracefully +// - Uses goroutines for concurrent processing to avoid blocking reads func (p *Peer) keepRead(ctx context.Context) error { var buf bytes.Buffer ctx, cancel := context.WithCancel(ctx) defer cancel() - readChannel := make(chan Records) + readChannel := make(chan protocol.Records) errChannel := make(chan error) signal := make(chan struct{}) defer close(readChannel) @@ -100,10 +119,10 @@ func (p *Peer) keepRead(ctx context.Context) error { if (timelimit != nil && time.Now().After(*timelimit)) || buf.Len() >= p.bufferMinToProcess || buf.Len() >= p.bufferMaxSize { select { case signal <- struct{}{}: - recs, err := Split(&buf) - if err != nil && !errors.Is(err, ErrIncomplete) { + recs, err := protocol.Split(&buf) + if err != nil && !errors.Is(err, protocol.ErrIncomplete) { return err - } else if errors.Is(err, ErrIncomplete) { + } else if errors.Is(err, protocol.ErrIncomplete) { if buf.Len() >= p.bufferMaxSize { return errors.Join(err, fmt.Errorf("buffer is not enough to read packet")) } @@ -129,6 +148,10 @@ func (p *Peer) GetIncomingPacketBufferSize() int32 { return p.incomingBuffer.Load() } +// keepWrite continuously writes data to the network connection. +// It retrieves data from the protocol layer via the Feed method and +// batches multiple records together for efficient network transmission. +// The method tracks batch sizes for monitoring and optimization purposes. func (p *Peer) keepWrite(ctx context.Context) error { for !p.closed.Load() { select { @@ -162,6 +185,14 @@ func (p *Peer) keepWrite(ctx context.Context) error { return nil } +// Keep manages the main lifecycle of the peer connection, running both +// read and write operations concurrently. 
It returns three error values: +// - rerr: error from the read operation +// - werr: error from the write operation +// - cerr: error from closing the connection +// +// The method ensures proper cleanup by closing the connection after +// the write operation completes, which will cancel any ongoing read operations. func (p *Peer) Keep(ctx context.Context) (rerr, werr, cerr error) { p.wg.Add(2) // read & write defer p.wg.Add(-2) diff --git a/protocol/testdata/client_ca_cert.pem b/network/testdata/client_ca_cert.pem similarity index 100% rename from protocol/testdata/client_ca_cert.pem rename to network/testdata/client_ca_cert.pem diff --git a/protocol/testdata/client_cert.pem b/network/testdata/client_cert.pem similarity index 100% rename from protocol/testdata/client_cert.pem rename to network/testdata/client_cert.pem diff --git a/protocol/testdata/client_key.pem b/network/testdata/client_key.pem similarity index 100% rename from protocol/testdata/client_key.pem rename to network/testdata/client_key.pem diff --git a/protocol/testdata/generate.sh b/network/testdata/generate.sh similarity index 100% rename from protocol/testdata/generate.sh rename to network/testdata/generate.sh diff --git a/protocol/testdata/openssl.cnf b/network/testdata/openssl.cnf similarity index 100% rename from protocol/testdata/openssl.cnf rename to network/testdata/openssl.cnf diff --git a/protocol/testdata/server_ca_cert.pem b/network/testdata/server_ca_cert.pem similarity index 100% rename from protocol/testdata/server_ca_cert.pem rename to network/testdata/server_ca_cert.pem diff --git a/protocol/testdata/server_cert.pem b/network/testdata/server_cert.pem similarity index 100% rename from protocol/testdata/server_cert.pem rename to network/testdata/server_cert.pem diff --git a/protocol/testdata/server_key.pem b/network/testdata/server_key.pem similarity index 100% rename from protocol/testdata/server_key.pem rename to network/testdata/server_key.pem diff --git a/objects.go b/objects.go 
index e4652c2..de66caf 100644 --- a/objects.go +++ b/objects.go @@ -2,121 +2,20 @@ package chotki import ( "context" - "encoding/binary" "fmt" - "unicode/utf8" "github.com/cockroachdb/pebble" + "github.com/drpcorg/chotki/chotki_errors" + "github.com/drpcorg/chotki/classes" + "github.com/drpcorg/chotki/host" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" "github.com/pkg/errors" ) -func OKey(id rdx.ID, rdt byte) (key []byte) { - var ret = [18]byte{'O'} - key = binary.BigEndian.AppendUint64(ret[:1], id.Src()) - key = binary.BigEndian.AppendUint64(key, id.Pro()) - key = append(key, rdt) - return -} - -const LidLKeyLen = 1 + 16 + 1 - -func OKeyIdRdt(key []byte) (id rdx.ID, rdt byte) { - if len(key) != LidLKeyLen { - return rdx.BadId, 0 - } - - id = rdx.IDFromBytes(key[1 : LidLKeyLen-1]) - rdt = key[LidLKeyLen-1] - return -} - -var VKey0 = []byte{'V', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'V'} - -func VKey(id rdx.ID) (key []byte) { - var ret = [18]byte{'V'} - block := id.ProOr(SyncBlockMask) - key = binary.BigEndian.AppendUint64(ret[:1], block.Src()) - key = binary.BigEndian.AppendUint64(key, block.Pro()) - key = append(key, 'V') - return -} - -func VKeyId(key []byte) rdx.ID { - if len(key) != LidLKeyLen { - return rdx.BadId - } - return rdx.IDFromBytes(key[1:]).ProAnd(^SyncBlockMask) -} - -// A class contains a number of fields. Each Field has -// some RDT type. A class can inherit another class. -// New fields can be appended to a class, but never removed. -// Max number of fields is 128, max inheritance depth 32. -// When stored, a class is an append-only sequence of Ts. -// The syntax for each T: "XName", where X is the RDT. -// For the map types, can use "MSS_Name" or similar. -// Each field has an Offset. The Offset+RdxType pair is the -// *actual key* for the field in the database. -// Entries having identical Offset+RdxType are considered *renames*! 
-type Field struct { - Offset int64 - Name string - RdxType byte - RdxTypeExt []byte - Index IndexType -} - -// Fields -type Fields []Field - -func (f Field) Valid() bool { - for _, l := range f.Name { // has unsafe chars - if l < ' ' { - return false - } - } - - return (f.RdxType >= 'A' && f.RdxType <= 'Z' && - len(f.Name) > 0 && utf8.ValidString(f.Name)) -} - -func (fs Fields) MaxOffset() (off int64) { - for _, f := range fs { - if f.Offset > off { - off = f.Offset - } - } - return -} - -func (f Fields) FindRdtOff(rdx byte, off int64) int { - for i := 0; i < len(f); i++ { - if f[i].RdxType == rdx && f[i].Offset == off { - return i - } - } - return -1 -} - -func (f Fields) FindName(name string) (ndx int) { // fixme double naming? - for i := 0; i < len(f); i++ { - if f[i].Name == name { - return i - } - } - return -1 -} - -func ObjectKeyRange(oid rdx.ID) (fro, til []byte) { - oid = oid.ZeroOff() - return OKey(oid, 'O'), OKey(oid.IncPro(1), 0) -} - // returns nil for "not found" func (cho *Chotki) ObjectIterator(oid rdx.ID, snap *pebble.Snapshot) *pebble.Iterator { - fro, til := ObjectKeyRange(oid) + fro, til := host.ObjectKeyRange(oid) io := pebble.IterOptions{ LowerBound: fro, UpperBound: til, @@ -129,7 +28,7 @@ func (cho *Chotki) ObjectIterator(oid rdx.ID, snap *pebble.Snapshot) *pebble.Ite } if it.SeekGE(fro) { // fixme - id, rdt := OKeyIdRdt(it.Key()) + id, rdt := host.OKeyIdRdt(it.Key()) if rdt == 'O' && id == oid { // An iterator is returned from a function, it cannot be closed return it @@ -141,84 +40,29 @@ func (cho *Chotki) ObjectIterator(oid rdx.ID, snap *pebble.Snapshot) *pebble.Ite return nil } -func parseClass(tlv []byte) (fields Fields) { - it := rdx.FIRSTIterator{TLV: tlv} - fields = append(fields, Field{ // todo inheritance - Offset: 0, - Name: "_ref", - RdxType: rdx.Reference, - }) - for it.Next() { - lit, t, name := it.ParsedValue() - if lit != rdx.Term || len(name) == 0 { - break // todo unique names etc - } - rdt := rdx.String - index := 
IndexType(0) - if name[0] >= 'A' && name[0] <= 'Z' { - rdt = name[0] - index = IndexType(name[1]) - name = name[2:] - } - fields = append(fields, Field{ - Offset: t.Rev, - RdxType: rdt, - Name: string(name), - Index: index, - }) - } - return -} - // todo note that the class may change as the program runs; in such a case // if the class fields are already cached, the current session will not // understand the new fields! -func (cho *Chotki) ClassFields(cid rdx.ID) (fields Fields, err error) { +func (cho *Chotki) ClassFields(cid rdx.ID) (fields classes.Fields, err error) { if fields, ok := cho.types.Load(cid); ok { return fields, nil } - okey := OKey(cid, 'C') + okey := host.OKey(cid, 'C') tlv, clo, e := cho.db.Get(okey) if e != nil { - return nil, ErrTypeUnknown + return nil, chotki_errors.ErrTypeUnknown } - fields = parseClass(tlv) + fields = classes.ParseClass(tlv) _ = clo.Close() cho.types.Store(cid, fields) return } -func (cho *Chotki) ObjectFieldsByClass(oid rdx.ID, form []string) (tid rdx.ID, tlvs protocol.Records, err error) { - it := cho.ObjectIterator(oid, nil) - if it == nil { - return rdx.BadId, nil, ErrObjectUnknown - } - defer it.Close() - - tid = rdx.IDFromZipBytes(it.Value()) - for it.Next() { - id, rdt := OKeyIdRdt(it.Key()) - off := int(id.Off()) - if off == 0 || off > len(form) { - continue - } - decl := form[off-1] - if len(decl) == 0 || rdt != decl[0] { - continue - } - for off > len(tlvs)+1 { - tlvs = append(tlvs, nil) - } - tlvs = append(tlvs, it.Value()) - } - return -} - -func (cho *Chotki) ObjectFields(oid rdx.ID) (tid rdx.ID, decl Fields, fact protocol.Records, err error) { +func (cho *Chotki) ObjectFields(oid rdx.ID) (tid rdx.ID, decl classes.Fields, fact protocol.Records, err error) { it := cho.ObjectIterator(oid, nil) if it == nil { - err = ErrObjectUnknown + err = chotki_errors.ErrObjectUnknown return } defer it.Close() @@ -231,7 +75,7 @@ func (cho *Chotki) ObjectFields(oid rdx.ID) (tid rdx.ID, decl Fields, fact proto } fact = 
append(fact, it.Value()) for it.Next() { - id, rdt := OKeyIdRdt(it.Key()) + id, rdt := host.OKeyIdRdt(it.Key()) off := int64(id.Off()) ndx := decl.FindRdtOff(rdt, off) if ndx == -1 { @@ -242,38 +86,22 @@ func (cho *Chotki) ObjectFields(oid rdx.ID) (tid rdx.ID, decl Fields, fact proto return } -func (cho *Chotki) ObjectFieldsTLV(oid rdx.ID) (tid rdx.ID, tlv protocol.Records, err error) { - it := cho.ObjectIterator(oid, nil) - if it == nil { - return rdx.BadId, nil, ErrObjectUnknown - } - defer it.Close() - - tid = rdx.IDFromZipBytes(it.Value()) - for it.Next() { - cp := make([]byte, len(it.Value())) - copy(cp, it.Value()) - tlv = append(tlv, cp) - } - return -} - // ObjectFieldTLV picks one field fast. No class checks, etc. func (cho *Chotki) ObjectFieldTLV(fid rdx.ID) (rdt byte, tlv []byte, err error) { db := cho.db if db == nil { - return 0, nil, ErrClosed + return 0, nil, chotki_errors.ErrClosed } it := cho.db.NewIter(&pebble.IterOptions{}) defer it.Close() - key := OKey(fid, 0) + key := host.OKey(fid, 0) if !it.SeekGE(key) { return 0, nil, pebble.ErrNotFound } var fidfact rdx.ID - fidfact, rdt = OKeyIdRdt(it.Key()) + fidfact, rdt = host.OKeyIdRdt(it.Key()) if fidfact != fid { return 0, nil, pebble.ErrNotFound } @@ -281,44 +109,7 @@ func (cho *Chotki) ObjectFieldTLV(fid rdx.ID) (rdt byte, tlv []byte, err error) return } -// ObjectFieldTLV picks one field given its id and RDT. 
-func (cho *Chotki) ObjectRDTFieldTLV(fid rdx.ID, rdt byte) (tlv []byte, err error) { - db := cho.db - if db == nil { - return nil, ErrClosed - } - - it := cho.db.NewIter(&pebble.IterOptions{}) - defer it.Close() - - key := OKey(fid, rdt) - if !it.SeekGE(key) { - return nil, pebble.ErrNotFound - } - fidfact, rdtfact := OKeyIdRdt(it.Key()) - if fidfact != fid || rdtfact != rdt { - return nil, pebble.ErrNotFound - } - tlv = it.Value() - return -} - -func (cho *Chotki) ObjectVVField(fid rdx.ID) (vv rdx.VV, err error) { - var rdt byte - var tlv []byte - rdt, tlv, err = cho.ObjectFieldTLV(fid) - if err != nil { - return - } - if rdt != rdx.VVector { - return nil, ErrWrongFieldType - } - vv = make(rdx.VV) - err = vv.PutTLV(tlv) - return -} - -func (cho *Chotki) NewClass(ctx context.Context, parent rdx.ID, fields ...Field) (id rdx.ID, err error) { +func (cho *Chotki) NewClass(ctx context.Context, parent rdx.ID, fields ...classes.Field) (id rdx.ID, err error) { var fspecs protocol.Records maxidx := int64(-1) for _, field := range fields { @@ -331,11 +122,11 @@ func (cho *Chotki) NewClass(ctx context.Context, parent rdx.ID, fields ...Field) if !field.Valid() { return rdx.BadId, ErrBadTypeDescription } - if field.Index == FullscanIndex { - return rdx.BadId, ErrFullscanIndexField + if field.Index == classes.FullscanIndex { + return rdx.BadId, chotki_errors.ErrFullscanIndexField } - if field.Index == HashIndex && !rdx.IsFirst(field.RdxType) { - return rdx.BadId, ErrHashIndexFieldNotFirst + if field.Index == classes.HashIndex && !rdx.IsFirst(field.RdxType) { + return rdx.BadId, chotki_errors.ErrHashIndexFieldNotFirst } name := append([]byte{}, field.RdxType) name = append(name, byte(field.Index)) @@ -346,10 +137,10 @@ func (cho *Chotki) NewClass(ctx context.Context, parent rdx.ID, fields ...Field) } func (cho *Chotki) GetClassTLV(ctx context.Context, cid rdx.ID) ([]byte, error) { - okey := OKey(cid, 'C') + okey := host.OKey(cid, 'C') tlv, clo, e := cho.db.Get(okey) if e != nil 
{ - return nil, ErrTypeUnknown + return nil, chotki_errors.ErrTypeUnknown } err := clo.Close() if err != nil { @@ -358,73 +149,10 @@ func (cho *Chotki) GetClassTLV(ctx context.Context, cid rdx.ID) ([]byte, error) return tlv, nil } -// Creates a new object from enveloped TLV fields; no class checks. func (cho *Chotki) NewObjectTLV(ctx context.Context, tid rdx.ID, fields protocol.Records) (id rdx.ID, err error) { return cho.CommitPacket(ctx, 'O', tid, fields) } -func (cho *Chotki) NewObject(ctx context.Context, tid rdx.ID, fields ...string) (id rdx.ID, err error) { - var form Fields - form, err = cho.ClassFields(tid) - if err != nil { - return - } - if len(fields) > len(form) { - return rdx.BadId, ErrUnknownFieldInAType - } - var packet protocol.Records - for i := 0; i < len(fields); i++ { - rdt := form[i+1].RdxType - tlv := rdx.Xparse(rdt, fields[i]) - if tlv == nil { - return rdx.BadId, rdx.ErrBadValueForAType - } - packet = append(packet, protocol.Record(rdt, tlv)) - } - return cho.NewObjectTLV(ctx, tid, packet) -} - -// Deprecated: does not handle non-trivial cases -func (cho *Chotki) EditObject(ctx context.Context, oid rdx.ID, fields ...string) (id rdx.ID, err error) { - formula, err := cho.ClassFields(oid) - if err != nil { - return rdx.BadId, err - } - if len(fields) > len(formula) { - return rdx.BadId, ErrUnknownFieldInAType - } - _, obj, err := cho.ObjectFieldsTLV(oid) - if err != nil { - return rdx.BadId, err - } - // fetch type desc - var packet protocol.Records - for i := 0; i < len(fields); i++ { - rdt := byte(formula[i].RdxType) - tlv := rdx.X2string(rdt, obj[i], fields[i], cho.src) - if tlv == nil { - return rdx.BadId, rdx.ErrBadValueForAType - } - packet = append(packet, protocol.Record('F', rdx.ZipUint64(uint64(i)))) - packet = append(packet, protocol.Record(rdt, tlv)) - } - return cho.CommitPacket(ctx, 'E', oid, packet) -} - -/*func (cho *Chotki) GetObject(oid rdx.ID) (tid rdx.ID, fields []string, err error) { - i := cho.ObjectIterator(oid) - if i 
== nil || !i.Valid() { - return rdx.BadId, nil, ErrObjectUnknown - } - tid = rdx.IDFromZipBytes(i.Value()) - for i.Next() { - _, rdt := OKeyIdRdt(i.Key()) - str := rdx.Xstring(rdt, i.Value()) - fields = append(fields, str) - } - return -}*/ - func (cho *Chotki) ObjectString(oid rdx.ID) (txt string, err error) { _, form, fact, e := cho.ObjectFields(oid) if e != nil { @@ -473,12 +201,6 @@ func (cho *Chotki) EditObjectRDX(ctx context.Context, oid rdx.ID, pairs []rdx.RD return cho.CommitPacket(ctx, 'E', oid, tlvs) } -func (cho *Chotki) SetFieldTLV(ctx context.Context, fid rdx.ID, tlve []byte) (id rdx.ID, err error) { - oid := fid.ZeroOff() - f := protocol.Record('F', rdx.ZipUint64(uint64(fid.Off()))) - return cho.CommitPacket(ctx, 'E', oid, protocol.Records{f, tlve}) -} - var ErrWrongFieldType = errors.New("wrong field type") func (cho *Chotki) AddToNField(ctx context.Context, fid rdx.ID, count uint64) (id rdx.ID, err error) { @@ -512,100 +234,15 @@ func (cho *Chotki) MapTRField(fid rdx.ID) (themap rdx.MapTR, err error) { return } -func (cho *Chotki) MapSSField(fid rdx.ID) (themap rdx.MapSS, err error) { - rdt, tlv, e := cho.ObjectFieldTLV(fid) - if e != nil { - return nil, e - } - if rdt != rdx.Mapping { - return nil, ErrWrongFieldType - } - themap = rdx.MnativeSS(tlv) - return -} - -// Adds/removes elements to/from a map (removed should map to nil) -func (cho *Chotki) AddToMapTRField(ctx context.Context, fid rdx.ID, changes rdx.MapTR) (id rdx.ID, err error) { - rdt, tlv := cho.GetFieldTLV(fid) // todo error? 
- if rdt != rdx.Mapping { - return rdx.BadId, ErrWrongFieldType - } - newtlv := rdx.MtlvTR(changes) - dtlv := rdx.Mdelta2(tlv, newtlv) - if len(dtlv) == 0 { - return rdx.ID0, nil - } - packet := protocol.Records{ - protocol.Record('F', rdx.ZipUint64(fid.Off())), - protocol.Record(rdx.Mapping, dtlv), - } - id, err = cho.CommitPacket(ctx, 'E', fid.ZeroOff(), packet) - return -} - -func (cho *Chotki) SetMapTRField(ctx context.Context, fid rdx.ID, changes rdx.MapTR) (id rdx.ID, err error) { - rdt, tlv := cho.GetFieldTLV(fid) // todo error? - if rdt != rdx.Mapping { - return rdx.BadId, ErrWrongFieldType - } - newtlv := rdx.MtlvTR(changes) - dtlv := rdx.Mdelta(tlv, newtlv) - if len(dtlv) == 0 { - return rdx.ID0, nil - } - packet := protocol.Records{ - protocol.Record('F', rdx.ZipUint64(fid.Off())), - protocol.Record(rdx.Mapping, dtlv), - } - id, err = cho.CommitPacket(ctx, 'E', fid.ZeroOff(), packet) - return -} - -func (cho *Chotki) AddToMapSSField(ctx context.Context, fid rdx.ID, changes rdx.MapSS) (id rdx.ID, err error) { - rdt, tlv := cho.GetFieldTLV(fid) // todo error? - if rdt != rdx.Mapping { - return rdx.BadId, ErrWrongFieldType - } - newtlv := rdx.MtlvSS(changes) - dtlv := rdx.Mdelta2(tlv, newtlv) - if len(dtlv) == 0 { - return rdx.ID0, nil - } - packet := protocol.Records{ - protocol.Record('F', rdx.ZipUint64(fid.Off())), - protocol.Record(rdx.Mapping, dtlv), - } - id, err = cho.CommitPacket(ctx, 'E', fid.ZeroOff(), packet) - return -} - -func (cho *Chotki) SetMapSSField(ctx context.Context, fid rdx.ID, changes rdx.MapSS) (id rdx.ID, err error) { - rdt, tlv := cho.GetFieldTLV(fid) // todo error? 
- if rdt != rdx.Mapping { - return rdx.BadId, ErrWrongFieldType - } - newtlv := rdx.MtlvSS(changes) - dtlv := rdx.Mdelta(tlv, newtlv) - if len(dtlv) == 0 { - return rdx.ID0, nil - } - packet := protocol.Records{ - protocol.Record('F', rdx.ZipUint64(fid.Off())), - protocol.Record(rdx.Mapping, dtlv), - } - id, err = cho.CommitPacket(ctx, 'E', fid.ZeroOff(), packet) - return -} - func (cho *Chotki) GetFieldTLV(id rdx.ID) (rdt byte, tlv []byte) { - key := OKey(id, 'A') + key := host.OKey(id, 'A') it := cho.db.NewIter(&pebble.IterOptions{ LowerBound: []byte{'O'}, UpperBound: []byte{'P'}, }) defer it.Close() if it.SeekGE(key) { - fact, r := OKeyIdRdt(it.Key()) + fact, r := host.OKeyIdRdt(it.Key()) if fact == id { tlv = it.Value() rdt = r @@ -614,12 +251,6 @@ func (cho *Chotki) GetFieldTLV(id rdx.ID) (rdt byte, tlv []byte) { return } -func EditTLV(off uint64, rdt byte, tlv []byte) (edit []byte) { - edit = append(edit, protocol.TinyRecord('F', rdx.ZipUint64(off))...) - edit = append(edit, protocol.Record(rdt, tlv)...) 
- return -} - func (cho *Chotki) EditFieldTLV(ctx context.Context, fid rdx.ID, delta []byte) (id rdx.ID, err error) { tlvs := protocol.Records{} tlvs = append(tlvs, protocol.TinyRecord('F', rdx.ZipUint64(fid.Off()))) diff --git a/objects_test.go b/objects_test.go deleted file mode 100644 index 02e1272..0000000 --- a/objects_test.go +++ /dev/null @@ -1,141 +0,0 @@ -package chotki - -import ( - "context" - "os" - "testing" - - "github.com/drpcorg/chotki/protocol" - "github.com/drpcorg/chotki/rdx" - "github.com/stretchr/testify/assert" -) - -func TestChotkiMapTRField(t *testing.T) { - _ = os.RemoveAll("cho1") - defer os.RemoveAll("cho1") - - a, err := Open("cho1", Options{Src: 0x1, Name: "test replica 1"}) - assert.Nil(t, err) - - oid, err := a.NewObjectTLV(context.Background(), rdx.ID0.ProPlus(1), protocol.Records{ - protocol.Record('M'), - }) - assert.Nil(t, err) - fid := oid.ToOff(1) - - tr := rdx.MapTR{ - "Name0": rdx.ID0.ProPlus(100), - "Name1": rdx.ID0.ProPlus(1), - "Name3": rdx.ID0.ProPlus(3), - } - id1, err := a.AddToMapTRField(context.Background(), fid, tr) - assert.Nil(t, err) - dtr := rdx.MapTR{ - "Name1": rdx.ID0.ProPlus(1), - "Name2": rdx.ID0.ProPlus(2), - "Name3": rdx.ID0, - } - id2, err := a.AddToMapTRField(context.Background(), fid, dtr) - assert.Nil(t, err) - assert.NotEqual(t, id1, id2) - - correct := rdx.MapTR{ - "Name0": rdx.ID0.ProPlus(100), - "Name1": rdx.ID0.ProPlus(1), - "Name2": rdx.ID0.ProPlus(2), - } - merged, err := a.MapTRField(fid) - assert.Nil(t, err) - assert.Equal(t, correct, merged) - - id3, err := a.SetMapTRField(context.Background(), fid, correct) - assert.Nil(t, err) - assert.Equal(t, rdx.ID0, id3) - - _ = a.Close() -} - -func TestChotkiMapSSField(t *testing.T) { - _ = os.RemoveAll("cho2") - defer os.RemoveAll("cho2") - - a, err := Open("cho2", Options{Src: 0x2, Name: "test replica 2"}) - assert.Nil(t, err) - - oid, err := a.NewObjectTLV(context.Background(), rdx.ID0.ProPlus(1), protocol.Records{ - protocol.Record('M'), - }) - 
assert.Nil(t, err) - fid := oid.ToOff(1) - - tr := rdx.MapSS{ - "Name0": "Value0", - "Name1": "Value1", - "Name3": "Value3", - } - id1, err := a.AddToMapSSField(context.Background(), fid, tr) - assert.Nil(t, err) - dtr := rdx.MapSS{ - "Name1": "Value1", - "Name2": "Value2", - "Name3": "", - } - id2, err := a.AddToMapSSField(context.Background(), fid, dtr) - assert.Nil(t, err) - assert.NotEqual(t, id1, id2) - - correct := rdx.MapSS{ - "Name0": "Value0", - "Name1": "Value1", - "Name2": "Value2", - } - merged, err := a.MapSSField(fid) - assert.Nil(t, err) - assert.Equal(t, correct, merged) - - id3, err := a.SetMapSSField(context.Background(), fid, correct) - assert.Nil(t, err) - assert.Equal(t, rdx.ID0, id3) - - _ = a.Close() -} - -func TestChotki_SetMapSSField(t *testing.T) { - _ = os.RemoveAll("cho3") - defer os.RemoveAll("cho3") - - a, err := Open("cho3", Options{Src: 0x3, Name: "test replica 3"}) - assert.Nil(t, err) - - oid, err := a.NewObjectTLV(context.Background(), rdx.ID0.ProPlus(1), protocol.Records{ - protocol.Record('M'), - }) - assert.Nil(t, err) - fid := oid.ToOff(1) - - state1 := rdx.MapSS{ - "A": "1", - "B": "2", - "C": "3", - } - - id1, err := a.SetMapSSField(context.Background(), fid, state1) - assert.Nil(t, err) - assert.NotEqual(t, rdx.ID0, id1) - - state2 := rdx.MapSS{ - "A": "1", - "B": "22", - "D": "4", - } - - id2, err := a.SetMapSSField(context.Background(), fid, state2) - assert.Nil(t, err) - assert.NotEqual(t, rdx.ID0, id2) - - result, err := a.MapSSField(fid) - assert.Nil(t, err) - assert.Equal(t, state2, result) - - _ = a.Close() -} diff --git a/orm.go b/orm.go index 706cc35..b2d637f 100644 --- a/orm.go +++ b/orm.go @@ -1,15 +1,15 @@ package chotki import ( - "bytes" "context" "iter" "reflect" "slices" "sync" - "text/template" "github.com/cockroachdb/pebble" + "github.com/drpcorg/chotki/chotki_errors" + "github.com/drpcorg/chotki/host" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" ) @@ -69,11 +69,11 @@ func (orm 
*ORM) Save(ctx context.Context, objs ...NativeObject) (err error) { for _, obj := range objs { id := orm.FindID(obj) if id == rdx.BadId { - return ErrObjectUnknown + return chotki_errors.ErrObjectUnknown } it := orm.Host.ObjectIterator(id, orm.Snap) if it == nil { - err = ErrObjectUnknown + err = chotki_errors.ErrObjectUnknown break } cid := rdx.IDFromZipBytes(it.Value()) @@ -85,7 +85,7 @@ func (orm *ORM) Save(ctx context.Context, objs ...NativeObject) (err error) { var changes protocol.Records flags := [64]bool{} for it.Next() { - lid, rdt := OKeyIdRdt(it.Key()) + lid, rdt := host.OKeyIdRdt(it.Key()) off := lid.Off() change, e := obj.Store(off, rdt, it.Value(), orm.Host.Clock()) flags[off] = true @@ -132,7 +132,7 @@ func (orm *ORM) Clear() error { defer orm.lock.Unlock() if orm.Host == nil { - return ErrClosed + return chotki_errors.ErrClosed } orm.objects.Clear() if orm.Snap != nil { @@ -147,7 +147,7 @@ func (orm *ORM) Close() error { defer orm.lock.Unlock() if orm.Host == nil { - return ErrClosed + return chotki_errors.ErrClosed } orm.objects.Clear() orm.ids = sync.Map{} @@ -160,15 +160,15 @@ func (orm *ORM) Close() error { func (orm *ORM) UpdateObject(obj NativeObject, snap *pebble.Snapshot) error { id := orm.FindID(obj) if id == rdx.BadId { - return ErrObjectUnknown + return chotki_errors.ErrObjectUnknown } it := orm.Host.ObjectIterator(id, snap) if it == nil { - return ErrObjectUnknown + return chotki_errors.ErrObjectUnknown } seq := orm.Snap.Seq() for it.Next() { - lid, rdt := OKeyIdRdt(it.Key()) + lid, rdt := host.OKeyIdRdt(it.Key()) off := lid.Off() if it.Seq() > seq { e := obj.Load(off, rdt, it.Value()) @@ -215,14 +215,14 @@ func (orm *ORM) Load(id rdx.ID, blanc NativeObject, skipFields ...uint64) (obj N if ok { return pre.(NativeObject), nil } - fro, til := ObjectKeyRange(id) + fro, til := host.ObjectKeyRange(id) io := pebble.IterOptions{ LowerBound: fro, UpperBound: til, } it := orm.Snap.NewIter(&io) for it.SeekGE(fro); it.Valid(); it.Next() { - lid, 
rdt := OKeyIdRdt(it.Key()) + lid, rdt := host.OKeyIdRdt(it.Key()) off := lid.Off() if !slices.Contains(skipFields, off) { e := blanc.Load(off, rdt, it.Value()) @@ -297,99 +297,3 @@ func (orm *ORM) FindID(obj NativeObject) rdx.ID { } return id.(rdx.ID) } - -type templateState struct { - CId rdx.ID - Name string - Fields Fields - Natives map[byte]string -} - -func (orm *ORM) Compile(name string, cid rdx.ID) (code string, err error) { - class, e := template.New("test").Parse(ClassTemplate) - if e != nil { - return "", e - } - state := templateState{ - CId: cid, - Natives: FIRSTnatives, - Name: name, - } - state.Fields, err = orm.Host.ClassFields(cid) - if err != nil { - return - } - buf := bytes.Buffer{} - err = class.Execute(&buf, state) - if err == nil { - code = buf.String() - } - return -} - -var FIRSTnatives = map[byte]string{ - 'F': "float64", - 'I': "int64", - 'R': "rdx.ID", - 'S': "string", - 'T': "string", - 'N': "uint64", - 'Z': "int64", -} - -// todo RDX formula -var ClassTemplate = ` -{{$nat := .Natives}} -type {{ .Name }} struct { - {{ range $n, $f := .Fields }} - {{ if eq $n 0 }} {{continue}} {{end }} - {{ $f.Name }} {{ index $nat $f.RdxType }} - {{ end }} -} - -var {{.Name}}ClassId = rdx.IDFromString("{{.CId.String}}") - -func (o *{{.Name}}) Load(off uint64, rdt byte, tlv []byte) error { - switch (off) { - {{ range $n, $f := .Fields }} - {{ if eq $n 0 }} {{continue}} {{end }} - case {{$n}}: - {{ $rdt := printf "%c" $f.RdxType }} - if rdt != '{{$rdt}}' { break } - o.{{$f.Name}} = rdx.{{$rdt}}native(tlv) - {{ end }} - default: return chotki.ErrUnknownFieldInAType - } - return nil -} - -func (o *{{.Name}}) Store(off uint64, rdt byte, old []byte, clock rdx.Clock) (bare []byte, err error) { - switch (off) { - {{ range $n, $f := .Fields }} - {{ if eq $n 0 }} {{continue}} {{end }} - case {{$n}}: - {{ $rdt := printf "%c" $f.RdxType }} - if rdt != '{{$rdt}}' { break } - if old == nil { - bare = rdx.{{$rdt}}tlv(o.{{$f.Name}}) - } else { - bare = 
rdx.{{$rdt}}delta(old, o.{{$f.Name}}, clock) - } - {{ end }} - default: return nil, chotki.ErrUnknownFieldInAType - } - if bare==nil { - err = rdx.ErrBadValueForAType - } - return -} -` - -// todo collection description -var ETemplate = ` -func (o *{{Name}}) Get{{- Name}}() { -} - -func (o *{{Name}}) Put{{- Name}}() { -} -` diff --git a/packets.go b/packets.go index 1b7d5da..69d7f09 100644 --- a/packets.go +++ b/packets.go @@ -4,15 +4,17 @@ import ( "errors" "github.com/cockroachdb/pebble" + "github.com/drpcorg/chotki/host" + "github.com/drpcorg/chotki/indexes" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" ) func (cho *Chotki) UpdateVTree(id, ref rdx.ID, pb *pebble.Batch) (err error) { v := protocol.Record('V', id.ZipBytes()) - err = pb.Merge(VKey(ref), v, cho.opts.PebbleWriteOptions) + err = pb.Merge(host.VKey(ref), v, cho.opts.PebbleWriteOptions) if err == nil { - err = pb.Merge(VKey0, v, cho.opts.PebbleWriteOptions) + err = pb.Merge(host.VKey0, v, cho.opts.PebbleWriteOptions) } return } @@ -30,10 +32,10 @@ func (cho *Chotki) ApplyD(id, ref rdx.ID, body []byte, batch *pebble.Batch) (err if rdt == 'C' { cho.types.Clear() } - err = batch.Merge(OKey(at, rdt), bare, cho.opts.PebbleWriteOptions) + err = batch.Merge(host.OKey(at, rdt), bare, cho.opts.PebbleWriteOptions) if err == nil && rdt == 'O' { cid := rdx.IDFromZipBytes(bare) - err = cho.indexManager.addFullScanIndex(cid, at, batch) + err = cho.indexManager.AddFullScanIndex(cid, at, batch) } else { err = cho.indexManager.OnFieldUpdate(rdt, at, rdx.BadId, bare, batch) } @@ -45,7 +47,7 @@ func (cho *Chotki) ApplyH(id, ref rdx.ID, body []byte, batch *pebble.Batch) (err _, rest := protocol.Take('M', body) var vbody []byte vbody, _ = protocol.Take('V', rest) - err = batch.Merge(VKey0, vbody, cho.opts.PebbleWriteOptions) + err = batch.Merge(host.VKey0, vbody, cho.opts.PebbleWriteOptions) return } @@ -56,7 +58,7 @@ func (cho *Chotki) ApplyV(id, ref rdx.ID, body []byte, batch *pebble.Batch) (err 
rec, rest = protocol.Take('V', rest) idb, rec = protocol.Take('R', rec) id := rdx.IDFromZipBytes(idb) - key := VKey(id) + key := host.VKey(id) if !rdx.VValid(rec) { err = ErrBadVPacket } else { @@ -76,14 +78,14 @@ func (cho *Chotki) ApplyC(id, ref rdx.ID, body []byte, batch *pebble.Batch, call cid = ref } err = batch.Merge( - OKey(cid, 'C'), + host.OKey(cid, 'C'), body, cho.opts.PebbleWriteOptions) if err == nil { err = cho.UpdateVTree(id, cid, batch) } if err == nil { - var tasks []reindexTask + var tasks []indexes.ReindexTask tasks, err = cho.indexManager.HandleClassUpdate(id, cid, body) if err == nil { for _, task := range tasks { @@ -99,7 +101,7 @@ func (cho *Chotki) ApplyC(id, ref rdx.ID, body []byte, batch *pebble.Batch, call func (cho *Chotki) ApplyOY(lot byte, id, ref rdx.ID, body []byte, batch *pebble.Batch) (err error) { err = batch.Merge( - OKey(id, lot), + host.OKey(id, lot), ref.ZipBytes(), cho.opts.PebbleWriteOptions) rest := body @@ -116,7 +118,7 @@ func (cho *Chotki) ApplyOY(lot byte, id, ref rdx.ID, body []byte, batch *pebble. } bare = rest[hlen:rlen] fid = id.ToOff(uint64(fno)) - fkey := OKey(fid, lit) + fkey := host.OKey(fid, lit) switch lit { case 'F', 'I', 'R', 'S', 'T': rebar, err = rdx.SetSourceFIRST(bare, id.Src()) @@ -141,7 +143,7 @@ func (cho *Chotki) ApplyOY(lot byte, id, ref rdx.ID, body []byte, batch *pebble. 
err = cho.UpdateVTree(fid, id, batch) } if err == nil && lot == 'O' { - err = cho.indexManager.addFullScanIndex(ref, id, batch) + err = cho.indexManager.AddFullScanIndex(ref, id, batch) } return } @@ -177,7 +179,7 @@ func (cho *Chotki) ApplyE(id, r rdx.ID, body []byte, batch *pebble.Batch, calls break } fid := r.ToOff(field) - fkey := OKey(fid, lit) + fkey := host.OKey(fid, lit) err = batch.Merge( fkey, rebar, diff --git a/protocol/feeddrainer.go b/protocol/feeddrainer.go index 69ceaf1..4e512c9 100644 --- a/protocol/feeddrainer.go +++ b/protocol/feeddrainer.go @@ -5,6 +5,8 @@ import ( "io" ) +// Feeder is an interface that defines the contract for reading records. +// Implementations should provide a method to feed records from a source. type Feeder interface { // Feed reads and returns records. // The EoF convention follows that of io.Reader: @@ -13,35 +15,50 @@ type Feeder interface { Feed(ctx context.Context) (recs Records, err error) } +// FeedCloser combines the Feeder interface with io.Closer, +// allowing implementations to both feed records and be properly closed. type FeedCloser interface { Feeder io.Closer } +// Drainer is an interface that defines the contract for writing records. +// Implementations should provide a method to drain records to a destination. type Drainer interface { Drain(ctx context.Context, recs Records) error } +// DrainCloser combines the Drainer interface with io.Closer, +// allowing implementations to both drain records and be properly closed. type DrainCloser interface { Drainer io.Closer } +// FeedDrainCloser combines Feeder, Drainer, and io.Closer interfaces, +// providing a complete read-write-close capability. type FeedDrainCloser interface { Feeder Drainer io.Closer } +// Traced is an interface for objects that can provide a trace ID +// for debugging and monitoring purposes. 
type Traced interface { GetTraceId() string } +// FeedDrainCloserTraced combines FeedDrainCloser with Traced interface, +// providing complete read-write-close capability with tracing support. type FeedDrainCloserTraced interface { FeedDrainCloser Traced } +// Relay performs a single feed-drain operation between a feeder and drainer. +// It reads records from the feeder and writes them to the drainer in one operation. +// Returns an error if either the feed or drain operation fails. func Relay(feeder Feeder, drainer Drainer) error { recs, err := feeder.Feed(context.Background()) if err != nil { @@ -54,6 +71,9 @@ func Relay(feeder Feeder, drainer Drainer) error { return err } +// Pump continuously relays records from feeder to drainer until an error occurs. +// This function runs indefinitely until the feeder returns an error (typically EOF). +// It uses context.Background() for all operations. func Pump(feeder Feeder, drainer Drainer) (err error) { for err == nil { err = Relay(feeder, drainer) @@ -61,6 +81,9 @@ func Pump(feeder Feeder, drainer Drainer) (err error) { return } +// PumpCtx continuously relays records from feeder to drainer until an error occurs +// or the context is cancelled. This function respects context cancellation and +// will stop pumping when the context is done. func PumpCtx(ctx context.Context, feeder Feeder, drainer Drainer) (err error) { for err == nil && ctx.Err() == nil { err = Relay(feeder, drainer) @@ -68,6 +91,10 @@ func PumpCtx(ctx context.Context, feeder Feeder, drainer Drainer) (err error) { return } +// PumpCtxCallback continuously relays records from feeder to drainer until an error occurs, +// the context is cancelled, or the callback function returns false. +// The callback function is called after each relay operation and can be used +// to implement custom stopping conditions. 
func PumpCtxCallback(ctx context.Context, feeder Feeder, drainer Drainer, f func() bool) (err error) { for err == nil && ctx.Err() == nil { err = Relay(feeder, drainer) @@ -78,6 +105,9 @@ func PumpCtxCallback(ctx context.Context, feeder Feeder, drainer Drainer, f func return } +// PumpN relays records from feeder to drainer exactly n times. +// This function will stop after n successful relay operations, +// regardless of whether more data is available. func PumpN(feeder Feeder, drainer Drainer, n int) (err error) { for err == nil && n > 0 { err = Relay(feeder, drainer) @@ -86,6 +116,10 @@ func PumpN(feeder Feeder, drainer Drainer, n int) (err error) { return } +// PumpThenClose continuously pumps records from feed to drain until an error occurs, +// then properly closes both the feeder and drainer. This function ensures +// that resources are cleaned up even if errors occur during pumping. +// Returns the first error encountered (feed error takes precedence over drain error). func PumpThenClose(feed FeedCloser, drain DrainCloser) error { var ferr, derr error for ferr == nil && derr == nil { diff --git a/protocol/net.go b/protocol/net.go deleted file mode 100644 index 750cadb..0000000 --- a/protocol/net.go +++ /dev/null @@ -1,431 +0,0 @@ -package protocol - -import ( - "context" - "crypto/tls" - "errors" - "fmt" - "net" - "net/url" - "strings" - "sync" - "time" - - "github.com/drpcorg/chotki/utils" - "github.com/google/uuid" - "github.com/puzpuzpuz/xsync/v3" -) - -type ConnType = uint - -const ( - TCP ConnType = iota + 1 - TLS - QUIC -) - -const ( - TYPICAL_MTU = 1500 - MAX_OUT_QUEUE_LEN = 1 << 20 // 16MB of pointers is a lot - - MAX_RETRY_PERIOD = time.Minute - MIN_RETRY_PERIOD = time.Second / 2 -) - -type InstallCallback func(name string) FeedDrainCloserTraced -type DestroyCallback func(name string, p Traced) - -// A TCP/TLS/QUIC server/client for the use case of real-time async communication. 
-// Differently from the case of request-response (like HTTP), we do not -// wait for a request, then dedicating a thread to processing, then sending -// back the resulting response. Instead, we constantly fan sendQueue tons of -// tiny messages. That dictates different work patterns than your typical -// HTTP/RPC server as, for example, we cannot let one slow receiver delay -// event transmission to all the other receivers. -type Net struct { - wg sync.WaitGroup - log utils.Logger - onInstall InstallCallback - onDestroy DestroyCallback - - conns *xsync.MapOf[string, *Peer] - listens *xsync.MapOf[string, net.Listener] - ctx context.Context - cancelCtx context.CancelFunc - - tlsConfig *tls.Config - readBufferTcpSize int - writeBufferTcpSize int - readAccumTimeLimit time.Duration - writeTimeout time.Duration - bufferMaxSize int - bufferMinToProcess int -} - -type NetOpt interface { - Apply(*Net) -} - -type NetWriteTimeoutOpt struct { - Timeout time.Duration -} - -func (opt *NetWriteTimeoutOpt) Apply(n *Net) { - n.writeTimeout = opt.Timeout -} - -type NetTlsConfigOpt struct { - Config *tls.Config -} - -func (opt *NetTlsConfigOpt) Apply(n *Net) { - n.tlsConfig = opt.Config -} - -type NetReadBatchOpt struct { - ReadAccumTimeLimit time.Duration - BufferMaxSize int - BufferMinToProcess int -} - -func (opt *NetReadBatchOpt) Apply(n *Net) { - n.readAccumTimeLimit = opt.ReadAccumTimeLimit - n.bufferMaxSize = opt.BufferMaxSize - n.bufferMinToProcess = opt.BufferMinToProcess -} - -type TcpBufferSizeOpt struct { - Read int - Write int -} - -func (opt *TcpBufferSizeOpt) Apply(n *Net) { - n.readBufferTcpSize = opt.Read - n.writeBufferTcpSize = opt.Write -} - -func NewNet(log utils.Logger, install InstallCallback, destroy DestroyCallback, opts ...NetOpt) *Net { - ctx, cancel := context.WithCancel(context.Background()) - net := &Net{ - log: log, - cancelCtx: cancel, - ctx: ctx, - conns: xsync.NewMapOf[string, *Peer](), - listens: xsync.NewMapOf[string, net.Listener](), - onInstall: 
install, - onDestroy: destroy, - } - for _, o := range opts { - o.Apply(net) - } - return net -} - -type NetStats struct { - ReadBuffers map[string]int32 - WriteBatches map[string]int32 -} - -func (n *Net) GetStats() NetStats { - stats := NetStats{ - ReadBuffers: make(map[string]int32), - WriteBatches: make(map[string]int32), - } - n.conns.Range(func(name string, peer *Peer) bool { - if peer != nil { - stats.ReadBuffers[name] = peer.GetIncomingPacketBufferSize() - stats.WriteBatches[name] = int32(peer.writeBatchSize.Val()) - } - return true - }) - return stats -} - -func (n *Net) Close() error { - n.cancelCtx() - - n.listens.Range(func(_ string, v net.Listener) bool { - v.Close() - return true - }) - n.listens.Clear() - - n.conns.Range(func(_ string, p *Peer) bool { - // sometimes it can be nil when we started connecting, but haven't connected yet - if p != nil { - p.Close() - } - return true - }) - n.conns.Clear() - - n.wg.Wait() - return nil -} - -func (n *Net) Connect(addr string) (err error) { - return n.ConnectPool(addr, []string{addr}) -} - -func (n *Net) ConnectPool(name string, addrs []string) (err error) { - // nil is needed so that Connect cannot be called - // while KeepConnecting is connects - if _, ok := n.conns.LoadOrStore(name, nil); ok { - return ErrAddressDuplicated - } - - n.wg.Add(1) - go func() { - n.KeepConnecting(fmt.Sprintf("connect:%s", name), addrs) - n.wg.Done() - }() - - return nil -} - -func (de *Net) Disconnect(name string) (err error) { - conn, ok := de.conns.LoadAndDelete(name) - if !ok { - return ErrAddressUnknown - } - - conn.Close() - return nil -} - -func (n *Net) Listen(addr string) error { - // nil is needed so that Listen cannot be called - // while creating listener - if _, ok := n.listens.LoadOrStore(addr, nil); ok { - return ErrAddressDuplicated - } - - listener, err := n.createListener(addr) - if err != nil { - n.listens.Delete(addr) - return err - } - n.listens.Store(addr, listener) - - n.log.Info("net: listening", "addr", 
addr) - - n.wg.Add(1) - go func() { - n.KeepListening(addr) - n.wg.Done() - }() - - return nil -} - -func (de *Net) Unlisten(addr string) error { - listener, ok := de.listens.LoadAndDelete(addr) - if !ok { - return ErrAddressUnknown - } - - return listener.Close() -} - -func (n *Net) KeepConnecting(name string, addrs []string) { - connBackoff := MIN_RETRY_PERIOD - for n.ctx.Err() == nil { - var err error - var conn net.Conn - for _, addr := range addrs { - conn, err = n.createConn(addr) - if err == nil { - break - } - } - - if err != nil { - n.log.Error("net: couldn't connect", "name", name, "err", err) - - select { - case <-time.After(connBackoff): - case <-n.ctx.Done(): - break - } - connBackoff = min(MAX_RETRY_PERIOD, connBackoff*2) - - continue - } - n.setTCPBuffersSize(n.log.WithDefaultArgs(context.Background(), "name", name), conn) - n.log.Info("net: connected", "name", name) - - connBackoff = MIN_RETRY_PERIOD - n.keepPeer(name, conn) - } -} - -func (n *Net) setTCPBuffersSize(ctx context.Context, conn net.Conn) { - var tconn *net.TCPConn - switch res := conn.(type) { - case *tls.Conn: - nconn, ok := res.NetConn().(*net.TCPConn) - if !ok { - n.log.WarnCtx(ctx, "net: unable to set buffers, because tls conn is strange") - return - } - tconn = nconn - case *net.TCPConn: - tconn = res - default: - n.log.WarnCtx(ctx, "net: unable to set buffers, because unknown connection type") - return - } - if n.readBufferTcpSize > 0 { - tconn.SetReadBuffer(n.readBufferTcpSize) - } - if n.writeBufferTcpSize > 0 { - tconn.SetWriteBuffer(n.writeBufferTcpSize) - } -} - -func (n *Net) KeepListening(addr string) { - for n.ctx.Err() == nil { - listener, ok := n.listens.Load(addr) - if !ok { - break - } - - conn, err := listener.Accept() - if err != nil { - if errors.Is(err, net.ErrClosed) { - break - } - - // reconnects are the client's problem, just continue - n.log.Error("net: couldn't accept request", "addr", addr, "err", err) - continue - } - - remoteAddr := 
conn.RemoteAddr().String() - n.log.Info("net: accept connection", "addr", addr, "remoteAddr", remoteAddr) - n.setTCPBuffersSize(n.log.WithDefaultArgs(context.Background(), "addr", addr, "remoteAdds", remoteAddr), conn) - n.wg.Add(1) - go func() { - n.keepPeer(fmt.Sprintf("listen:%s:%s", uuid.Must(uuid.NewV7()).String(), remoteAddr), conn) - defer n.wg.Done() - }() - } - - if l, ok := n.listens.LoadAndDelete(addr); ok { - if err := l.Close(); err != nil && !errors.Is(err, net.ErrClosed) { - n.log.Error("net: couldn't correct close listener", "addr", addr, "err", err) - } - } - - n.log.Info("net: listener closed", "addr", addr) -} - -func (n *Net) keepPeer(name string, conn net.Conn) { - peer := &Peer{ - inout: n.onInstall(name), - conn: conn, - writeTimeout: n.writeTimeout, - readAccumtTimeLimit: n.readAccumTimeLimit, - bufferMaxSize: n.bufferMaxSize, - bufferMinToProcess: n.bufferMinToProcess, - writeBatchSize: &utils.AvgVal{}, - } - n.conns.Store(name, peer) - - readErr, writeErr, closeErr := peer.Keep(n.ctx) - if readErr != nil { - n.log.Error("net: couldn't read from peer", "name", name, "err", readErr, "trace_id", peer.GetTraceId()) - } - if writeErr != nil { - n.log.Error("net: couldn't write to peer", "name", name, "err", writeErr, "trace_id", peer.GetTraceId()) - } - if closeErr != nil { - n.log.Error("net: couldn't correct close peer", "name", name, "err", closeErr, "trace_id", peer.GetTraceId()) - } - - n.conns.Delete(name) - peer.Close() - n.onDestroy(name, peer) -} - -func (n *Net) createListener(addr string) (net.Listener, error) { - connType, address, err := parseAddr(addr) - if err != nil { - return nil, err - } - - var listener net.Listener - switch connType { - case TCP: - config := net.ListenConfig{} - if listener, err = config.Listen(n.ctx, "tcp", address); err != nil { - return nil, err - } - - case TLS: - config := net.ListenConfig{} - if listener, err = config.Listen(n.ctx, "tcp", address); err != nil { - return nil, err - } - - listener = 
tls.NewListener(listener, n.tlsConfig) - - case QUIC: - return nil, errors.New("QUIC unimplemented") - } - - return listener, nil -} - -func (n *Net) createConn(addr string) (net.Conn, error) { - connType, address, err := parseAddr(addr) - if err != nil { - return nil, err - } - - var conn net.Conn - switch connType { - case TCP: - d := net.Dialer{Timeout: time.Minute} - if conn, err = d.DialContext(n.ctx, "tcp", address); err != nil { - return nil, err - } - - case TLS: - d := tls.Dialer{Config: n.tlsConfig} - - if conn, err = d.DialContext(n.ctx, "tcp", address); err != nil { - return nil, err - } - - case QUIC: - return nil, errors.New("QUIC unimplemented") - } - - return conn, err -} - -func parseAddr(addr string) (ConnType, string, error) { - u, err := url.Parse(addr) - if err != nil { - return TCP, "", err - } - - var conn ConnType - - switch u.Scheme { - case "", "tcp", "tcp4", "tcp6": - conn = TCP - case "tls": - conn = TLS - case "quic": - conn = QUIC - default: - return conn, addr, ErrAddressInvalid - } - - u.Scheme = "" - address := strings.TrimPrefix(u.String(), "//") - - return conn, address, nil -} diff --git a/protocol/records.go b/protocol/records.go index 54fba13..5e650af 100644 --- a/protocol/records.go +++ b/protocol/records.go @@ -7,32 +7,6 @@ package protocol // Records converts easily to net.Buffers. 
type Records [][]byte -func (recs Records) recrem(total int64) (prelen int, prerem int64) { - for len(recs) > prelen && int64(len(recs[prelen])) <= total { - total -= int64(len(recs[prelen])) - prelen++ - } - prerem = total - return -} - -func (recs Records) WholeRecordPrefix(limit int64) (prefix Records, remainder int64) { - prelen, remainder := recs.recrem(limit) - prefix = recs[:prelen] - return -} - -func (recs Records) ExactSuffix(total int64) (suffix Records) { - prelen, prerem := recs.recrem(total) - suffix = recs[prelen:] - if prerem != 0 { // damages the original, hence copy - edited := make(Records, 1, len(suffix)) - edited[0] = suffix[0][prerem:] - suffix = append(edited, suffix[1:]...) - } - return -} - func (recs Records) TotalLen() (total int64) { for _, r := range recs { total += int64(len(r)) diff --git a/protocol/tlv.go b/protocol/tlv.go index dcbe3f3..99c7429 100644 --- a/protocol/tlv.go +++ b/protocol/tlv.go @@ -1,6 +1,80 @@ // Protocol format is based on ToyTLV (MIT licence) written by Victor Grishchenko in 2024 // Original project: https://github.com/learn-decentralized-systems/toytlv +/* +Package protocol implements a compact TLV (Type-Length-Value) encoding format optimized for efficiency. + +# TLV Record Format + +The protocol supports three encoding formats with automatic format selection based on record size: + + 1. Tiny Format (1 byte header) - for records 0-9 bytes: + [('0' + body_length)] + Example: 3-byte body → ['3'] + - Most compact encoding + - Type information is lost (normalized to '0') + - Only available with lowercase record types + + 2. Short Format (2 bytes header) - for records up to 255 bytes: + [lowercase_type, body_length] + Example: type 'A', 100 bytes → ['a', 100] + - Medium efficiency + - Type preserved in lowercase form + - 1-byte length field + + 3. 
Long Format (5 bytes header) - for records up to 2GB: + [uppercase_type, length_as_4byte_little_endian] + Example: type 'A', 1000 bytes → ['A', 0xE8, 0x03, 0x00, 0x00] + - Full capacity encoding + - Type preserved in uppercase form + - 4-byte little-endian length field + +# Record Types + +Record types are restricted to uppercase letters A-Z. The case of the type parameter +in encoding functions affects format selection: +- Lowercase ('a'-'z'): enables tiny format optimization for small records +- Uppercase ('A'-'Z'): forces explicit encoding, no tiny format + +# Format Selection Logic + +The encoding format is automatically selected based on: +- Body size (0-9 → tiny, 10-255 → short, >255 → long) +- Type case (lowercase enables tiny, uppercase forces explicit) +- Tiny format requires both: body_size ≤ 9 AND lowercase type + +# Parsing and Safety + +The package provides two levels of parsing functions: +- Safe functions (Take, TakeAny): for trusted data sources, use nil returns for errors +- Wary functions (TakeWary, TakeAnyWary): for untrusted data, return explicit errors + +# Streaming Support + +For large or dynamically-sized records, use the streaming API: + + bookmark, buf := OpenHeader(buf, 'X') // start record with placeholder length + buf = append(buf, data...) 
// add body data incrementally + CloseHeader(buf, bookmark) // finalize length field + +Example Usage + + // Create a simple record + record := Record('M', []byte("Hello")) + + // Parse records from buffer + data := bytes.NewBuffer(networkData) + records, err := Split(data) + + // Extract specific record type + body, rest := Take('M', records[0]) + +# Performance Considerations + +- Use Concat() instead of Join() for better memory efficiency +- Prefer lowercase types for small frequent records (tiny format) +- Use streaming API for large records to avoid intermediate allocations +*/ package protocol import ( @@ -13,19 +87,16 @@ import ( const CaseBit uint8 = 'a' - 'A' var ( - ErrAddressInvalid = errors.New("the address invalid") - ErrAddressDuplicated = errors.New("the address already used") - - ErrIncomplete = errors.New("incomplete data") - ErrBadRecord = errors.New("bad TLV record format") - ErrAddressUnknown = errors.New("address unknown") - ErrDisconnected = errors.New("disconnected by user") + ErrIncomplete = errors.New("incomplete data") + ErrBadRecord = errors.New("bad TLV record format") ) -// ProbeHeader probes a TLV record header. Return values: -// - 0 0 0 incomplete header -// - '-' 0 0 bad format -// - 'A' 2 123 success +// ProbeHeader analyzes a TLV record header and extracts type and size information. +// +// Returns: +// - lit: record type ('A'-'Z', '0' for tiny, '-' for error, 0 for incomplete) +// - hdrlen: header length (1, 2, or 5 bytes) +// - bodylen: body length in bytes func ProbeHeader(data []byte) (lit byte, hdrlen, bodylen int) { if len(data) == 0 { return 0, 0, 0 @@ -60,43 +131,12 @@ func ProbeHeader(data []byte) (lit byte, hdrlen, bodylen int) { return } -// Incomplete returns the number of supposedly yet-unread bytes. -// 0 for complete, -1 for bad format, -// >0 for least-necessary read to complete either header or record. 
-func Incomplete(data []byte) int { - if len(data) == 0 { - return 1 // get something - } - dlit := data[0] - var bodylen int - if dlit >= '0' && dlit <= '9' { // tiny - bodylen = int(dlit - '0') - } else if dlit >= 'a' && dlit <= 'z' { // short - if len(data) < 2 { - bodylen = 2 - } else { - bodylen = int(data[1]) + 2 - } - } else if dlit >= 'A' && dlit <= 'Z' { // long - if len(data) < 5 { - bodylen = 5 - } else { - bl := binary.LittleEndian.Uint32(data[1:5]) - if bl > 0x7fffffff { - return -1 - } - bodylen = int(bl) + 5 - } - } else { - return -1 - } - if bodylen > len(data) { - return bodylen - len(data) - } else { - return 0 - } -} - +// Split parses a buffer containing multiple TLV records. +// Modifies the buffer by consuming successfully parsed records. +// +// Returns: +// - recs: slice of complete TLV records (header + body) +// - err: ErrBadRecord or ErrIncomplete func Split(data *bytes.Buffer) (recs Records, err error) { for data.Len() > 0 { lit, hlen, blen := ProbeHeader(data.Bytes()) @@ -127,20 +167,9 @@ func Split(data *bytes.Buffer) (recs Records, err error) { return } -func ProbeHeaders(lits string, data []byte) int { - rest := data - for i := 0; i < len(lits); i++ { - l, hl, bl := ProbeHeader(rest) - if l != lits[i] { - return -1 - } - rest = rest[hl+bl:] - } - return len(data) - len(rest) -} - -// Feeds the header into the buffer. -// Subtle: lower-case lit allows for defaulting, uppercase must be explicit. +// AppendHeader constructs and appends a TLV record header. +// Automatically selects format based on body length and case. +// Lowercase lit enables tiny format optimization for small bodies. func AppendHeader(into []byte, lit byte, bodylen int) (ret []byte) { biglit := lit &^ CaseBit if biglit < 'A' || biglit > 'Z' { @@ -160,8 +189,11 @@ func AppendHeader(into []byte, lit byte, bodylen int) (ret []byte) { return ret } -// Take is used to read safe TLV inputs (e.g. from own storage) with -// record types known in advance. 
+// Take extracts a TLV record from trusted data. Uses nil returns for errors. +// +// Returns: +// - body: record body content, nil if error +// - rest: remaining data, original data if incomplete func Take(lit byte, data []byte) (body, rest []byte) { flit, hdrlen, bodylen := ProbeHeader(data) if flit == 0 || hdrlen+bodylen > len(data) { @@ -175,7 +207,12 @@ func Take(lit byte, data []byte) (body, rest []byte) { return } -// TakeAny is used for safe TLV inputs when record types can vary. +// TakeAny extracts any TLV record from trusted data without type restrictions. +// +// Returns: +// - lit: record type found ('A'-'Z'), 0 if no data +// - body: record body content, nil if error +// - rest: remaining data, nil if error func TakeAny(data []byte) (lit byte, body, rest []byte) { if len(data) == 0 { return 0, nil, nil @@ -185,7 +222,12 @@ func TakeAny(data []byte) (lit byte, body, rest []byte) { return } -// TakeWary reads TLV records of known type from unsafe input. +// TakeWary extracts a TLV record from untrusted data with explicit error handling. +// +// Returns: +// - body: record body content, nil on error +// - rest: remaining data, original data if incomplete +// - err: ErrIncomplete or ErrBadRecord func TakeWary(lit byte, data []byte) (body, rest []byte, err error) { flit, hdrlen, bodylen := ProbeHeader(data) if flit == 0 || hdrlen+bodylen > len(data) { @@ -199,7 +241,13 @@ func TakeWary(lit byte, data []byte) (body, rest []byte, err error) { return } -// TakeWary reads TLV records of arbitrary type from unsafe input. +// TakeAnyWary extracts any TLV record from untrusted data with error handling. 
+// +// Returns: +// - lit: record type found ('A'-'Z'), 0 on error +// - body: record body content, nil on error +// - rest: remaining data, nil on error +// - err: ErrIncomplete for empty/insufficient data func TakeAnyWary(data []byte) (lit byte, body, rest []byte, err error) { if len(data) == 0 { return 0, nil, nil, ErrIncomplete @@ -209,32 +257,7 @@ func TakeAnyWary(data []byte) (lit byte, body, rest []byte, err error) { return } -func TakeRecord(lit byte, data []byte) (rec, rest []byte) { - flit, hdrlen, bodylen := ProbeHeader(data) - if flit == 0 || hdrlen+bodylen > len(data) { - return nil, data // Incomplete - } - if flit != lit && flit != '0' { - return nil, nil // BadRecord - } - rec = data[0 : hdrlen+bodylen] - rest = data[hdrlen+bodylen:] - return -} - -func TakeAnyRecord(data []byte) (lit byte, rec, rest []byte) { - lit, hdrlen, bodylen := ProbeHeader(data) - if lit == 0 || hdrlen+bodylen > len(data) { - return 0, nil, data // Incomplete - } - if lit == '-' { - return '-', nil, nil // BadRecord - } - rec = data[0 : hdrlen+bodylen] - rest = data[hdrlen+bodylen:] - return -} - +// TotalLen calculates the total length of multiple byte slices. func TotalLen(inputs [][]byte) (sum int) { for _, input := range inputs { sum += len(input) @@ -242,6 +265,8 @@ func TotalLen(inputs [][]byte) (sum int) { return } +// Lit extracts the canonical record type from a TLV record's first byte. +// Returns ('A'-'Z', '0' for tiny format, or '-' for invalid). func Lit(rec []byte) byte { b := rec[0] if b >= 'a' && b <= 'z' { @@ -255,8 +280,8 @@ func Lit(rec []byte) byte { } } -// Append appends a record to the buffer; note that uppercase type -// is always explicit, lowercase can be defaulted. +// Append constructs a complete TLV record and appends it to the buffer. +// Lowercase lit enables tiny format optimization. 
func Append(into []byte, lit byte, body ...[]byte) (res []byte) { total := TotalLen(body) res = AppendHeader(into, lit, total) @@ -266,7 +291,8 @@ func Append(into []byte, lit byte, body ...[]byte) (res []byte) { return res } -// Record composes a record of a given type +// Record creates a complete TLV record with pre-allocated capacity. +// Use Append() to add to existing buffer. func Record(lit byte, body ...[]byte) []byte { total := TotalLen(body) ret := make([]byte, 0, total+5) @@ -277,20 +303,16 @@ func Record(lit byte, body ...[]byte) []byte { return ret } -func AppendTiny(into []byte, lit byte, body []byte) (res []byte) { - if len(body) > 9 { - return Append(into, lit, body) - } - res = append(into, '0'+byte(len(body))) - res = append(res, body...) - return -} - +// TinyRecord creates a TLV record optimized for tiny format. +// Equivalent to Record() with lowercase lit. func TinyRecord(lit byte, body []byte) (tiny []byte) { - var data [10]byte - return AppendTiny(data[:0], lit, body) + // Convert to lowercase to enable tiny format optimization in AppendHeader + lowercaseLit := (lit &^ CaseBit) | CaseBit + return Record(lowercaseLit, body) } +// Join concatenates multiple TLV records into a single byte slice. +// Useful for creating compound messages or batching records. func Join(records ...[]byte) (ret []byte) { for _, rec := range records { ret = append(ret, rec...) @@ -298,13 +320,8 @@ func Join(records ...[]byte) (ret []byte) { return } -func Recs(lit byte, bodies ...[]byte) (recs Records) { - for _, body := range bodies { - recs = append(recs, Record(lit, body)) - } - return -} - +// Concat efficiently concatenates multiple byte slices with pre-allocation. +// More efficient than Join() for performance-critical code. 
func Concat(msg ...[]byte) []byte { total := TotalLen(msg) ret := make([]byte, 0, total) @@ -314,8 +331,31 @@ func Concat(msg ...[]byte) []byte { return ret } -// OpenHeader opens a streamed TLV record; use append() to create the -// record body, then call CloseHeader(&buf, bookmark) +// OpenHeader begins a streamed TLV record for incremental construction. +// Must be paired with CloseHeader(). Use for large or dynamic records. +// +// This function starts a TLV record with a placeholder for the body length, +// allowing the body to be built incrementally using append() operations. +// Must be paired with CloseHeader() to finalize the length field. +// +// Use this pattern for large or dynamically-sized records where the final +// body size is not known in advance. +// +// Parameters: +// - buf: buffer to append the record header to +// - lit: record type ('A'-'Z'), automatically converted to uppercase +// +// Return values: +// - bookmark: position marker needed for CloseHeader() call +// - res: buffer with the header appended (lit + 4 zero bytes for length) +// +// Usage pattern: +// +// bookmark, buf := OpenHeader(buf, 'X') +// buf = append(buf, bodyData...) // add body incrementally +// CloseHeader(buf, bookmark) // finalize the length +// +// The function always uses long format (5-byte header) for simplicity. func OpenHeader(buf []byte, lit byte) (bookmark int, res []byte) { lit &= ^CaseBit if lit < 'A' || lit > 'Z' { @@ -327,7 +367,23 @@ func OpenHeader(buf []byte, lit byte) (bookmark int, res []byte) { return len(res), res } -// CloseHeader closes a streamed TLV record +// CloseHeader finalizes a streamed TLV record by writing the actual body length. +// +// This function completes a TLV record started with OpenHeader() by calculating +// the actual body size and writing it into the length field placeholder. +// Must be called after all body data has been appended to the buffer. 
+// +// Parameters: +// - buf: buffer containing the TLV record with body data appended +// - bookmark: position marker returned by OpenHeader() +// +// The function: +// 1. Validates the bookmark position (must be ≥5 and ≤ buffer length) +// 2. Calculates body length as: len(buf) - bookmark +// 3. Writes the length as 4-byte little-endian uint32 at bookmark-4 position +// +// Panics if bookmark is invalid, indicating incorrect API usage. +// Always pair with OpenHeader() - never call independently. func CloseHeader(buf []byte, bookmark int) { if bookmark < 5 || len(buf) < bookmark { panic("check the API docs") diff --git a/rdx/FIRST.go b/rdx/FIRST.go index ae5083f..5488618 100644 --- a/rdx/FIRST.go +++ b/rdx/FIRST.go @@ -64,7 +64,7 @@ func IsFirst(c byte) bool { func FIRSTtlv(rev int64, src uint64, value []byte) (bulk []byte) { time := ZipIntUint64Pair(rev, src) bulk = make([]byte, 0, len(time)+len(value)+2) - bulk = protocol.AppendTiny(bulk, 'T', time) + bulk = protocol.Append(bulk, 't', time) bulk = append(bulk, value...) return } diff --git a/rdx/README.md b/rdx/README.md deleted file mode 100644 index acf9742..0000000 --- a/rdx/README.md +++ /dev/null @@ -1,284 +0,0 @@ -# Replicated Data Interchange (RDX CRDT) library - -Our goal here is to create a format and a library for data -replication using state-of-the-art Replicated Data Types. -Replicated Data interchange format ([RDX][j]) is like protobuf, -but CRDT. Apart from [RPC][p] applications, one can use it for -data storage, distributed and asynchronous data exchange and in -other similar applications. RDX fully supports local-first, -offline-first and peer-to-peer replication, with no central -server required, as any two *replicas* can merge their data. By -installing RDX data types as merge operators in an LSM database -(leveldb, RocksDB, pebble, Cassandra, etc) one can effectively -have a CRDT database (which [Chotki](./ARCHITECTURE.md) basically is). 
- -We will implement *unified* CRDTs able to synchronize using -operations, full states or deltas. Types may imply [causal -consistency][x] of updates in matters of performance, but their -correctness does not depend on that. RDX data types are fully -commutative, associative and idempotent. Hence, immune to -reordering or duplication of updates. - -The default syncing protocol (not described here) generally -relies on [version vectors][v]. Do not confuse that with [vector -clocks][r] used by Amazon Dynamo and similar systems. While -there are strong parallels, inner workings of VV and VC are not -identical. - -## Data types - -Our objects can have fields of the following CRDT types. Each -type is named by a letter. - - 1. last-write-wins variables (`I` for int64, `S` for string, `F` - is float64, and `R` is [id64](./id.go#12)) - 2. counters, `N` increment-only uint64 and `Z` two-way int64 - 3. maps (M), like key-value maps, where keys and values are `FIRST` - 4. sets (E), contain arbitrary `FIRST` elements - 5. arrays (L) of arbitrary `FIRST` elements - 6. version vectors (V) - 7. codegen - -The format and the merge rules are as follows. - -### `FIRST` Float, Integer, Reference, String, Term - -The last-write-wins register is the simplest data type to -implement. For each LWW field, we only need the latest "winner" -op containing the logical timestamp and the value per se. A -logical timestamp is a pair `{rev, src}` where `rev` is the -revision number and `src` is the id of the author. For example, -let's see how a bare (no TLV envelope) `I` int64 `-11` would -look like, assuming it is the 4th revision of the register -autored by replica #5. The TLV would look like: `32 08 05 15` -(hex) where `0x15` is a [zig-zag][g] encoded and zipped `-11`, -while `32 08 05` is a tiny [ToyTLV](../protocol/tlv.go) record for a zipped pair -of ints, 4 (signed, zig-zagged, so `08`) and 5 (unsigned, so -`05`). 
If we add a ToyTLV envelope, that becomes `69 04 32 08 05 -15` (type of record `I`, length 4, then the bare part). - -String `S` values are simply UTF-8 strings. Int64 `I`, float64 -`F` and id64 `R` values get compressed using [`zip_int`](./zipint.go) -routines. Overlong encodings are forbidden both for strings and -for zip-ints! - -`T` ops have a timestamp, but no value. That is the equivalent -of a `nil` or `void` value. Those are used as placeholders in -various cases. - -The string value for `FIRST` types is as follows: - - 1. `F` the e-notation, JSON-like - 2. `I` signed integer notation, - 3. `R` 5-8-3 hex notation (e.g. `c187-3a62-12`) - 4. `S` double-quoted JSON-like, e.g. "Sarah O\'Connor" - 5. `T` null - -Merge rules for LWW are straighforward: - - 1. higher revision wins - 2. in case of a tie, higher value wins (like bytes.Compare()) - 3. in case of a tie, who cares, but higher replica id wins - -### `NZ` Counters - -`N` are increment-only counters. Their TLV state is a sequence -of `T` records containing zipped uint64 pairs {val,src}, the -counter value and source replica id. As the counter is inc-only, -we may use the value itself as a revision number. The merge -operator is per-replica `max`, as later versions are greater. -The native value is the sum of all replica values (sum of -contributions). - -`Z` are two-way counters (inc/dec). Their TLV format is a -sequence of `I` records each having `{rev,src}` metadata as -described in the `FIRST` section. One record corresponds to one -source, per-source merge rules are same as LWW. The native value -is the sum of all `I` values. - -### `E` Eulerian - -Generic sets containing any `FIRST` elements. The TLV format is -a sequence of enveloped FIRST records. It can contain records -with negative revision numbers. Those are tombstones (deleted -entries). For example, `I{4,5}-11` from the `FIRST` example -would go as `69 04 32 08 05 15`. 
Then, if replica #3 would want -to remove that entry, it will issue a tombstone op `I{-5,3}-11` -or `69 04 32 09 03 15`. Here, the version number changes from -`08` to `09` or 4 to -5, the author changes to 3. - -Within a set, the ops are sorted in the *value order*. Namely, -if the type differs, they go in the alphabetical order (`F`, -`I`, `R`, `S`, `T`). If the type is the same, they go in the -ascending order, as per `strcmp` or `bytes.Compare`. That way, -merging multiple versions of a set only requires one parallel -pass of those, no additional allocation or sorting, very much -like [mergesort][m] works. - -The string value for a set is like `{1,2,3}` where `1,2,3` are -`FIRST` elements of the set. - -### `M` Mapping - -Generic maps, mapping any `FIRST` value to any other `FIRST` -value. The TLV format is a sequence of enveloped key-value op -pairs. Any update should also contain the affected key-value -pairs. Deleted entries might have `T` values (the key is -present, the value is null) or the key might have a negative -revision (no such key present). - -Pairs are sorted in the value-order of their keys. When merging -two pairs having an identical value of their keys, both the key -and the value ops are merged according to the LWW rules. As with -`E` sets, this only requires one parallel pass of the versions. - -The string value for a map is like `{4:null, "key":"value"}` - -### `L` Linear - -Generic arrays store any `FIRST` elements. Internally, `L` are -Causal Trees (also known as Replicated Growable Arrays, RGAs). -The TLV format is a sequence of enveloped FIRST ops. The -order of the sequence is a *weave*, i.e. ops go in the same -order as they appear(ed) in the resulting array. Deleted ops -change to tombstones, same as E. - -The merging procedure follows the tree-traversal logic. 
Any -change to an array must have a form of *subtrees*, each one -arranged in the same weave order, each one prepended with a `T` -op specifying its attachment point in the edited tree. - -Deletions look like `T` ops with negative revision numbers. As -an example, suppose we have an array authored by #3 `I{1,3}1 -I{2,3}2 I{3,3}3` or `[1,2,3]` and replica #4 wants to delete the -first entry. Then, it issues a patch `T{1,3}T{-4,4}` that merges -to produce `I{1,3}1 T{-4,4} I{2,3}2 I{3,3}3` or `[2,3]`. - -The string value for an array is like `[1,2,3]` - -### `V` Version vector - -[Version vector][v] is a way to track dataset versions in a -causally ordered system. It is a vector of `seq` numbers, where -each `seq` is the version of the state as seen by each -respective replica. Alternatively, that is a map `{src: seq}`, -where `src` is the replica `id`. It is assumed, that we received -updates from replica `src` all the way up to `seq`. - -Bare TLV for a version vector is a sequence of `V` records (yes, -`V` nested in `V`) each containing one id64 as a zipped seq-src -pair (see ZipUint64Pair). The sequence is sorted in the -ascenting order of record bytes, like `bytes.Compare()`. - -The merge algorithm for version vectors is simple: take the -maximum `seq` for each `src`. Note that `seq=0` is distinct from -having no record. - -## Data type implementation - -To fully implement an RDT one has to implement these 10 -functions. The function name starts with the type name letter, -here we imply `I` last-write-wins int64. - -````go - // Xvalid verifies validity of a bare TLV record. - // Any other function may assume the input is valid. - func Ivalid(tlv []byte) bool - - - // Xstring converts a TLV representation into a string. - func Istring(tlv []byte) (txt string) - - // Xparse converts a string back into bare TLV. - // Must round-trip with Xstring. - func Iparse(txt string) (tlv []byte) - - - // Xtlv converts the native type into a TLV, zero metadata. 
- func Itlv(i int64) (tlv []byte) - - // Xnative converts TLV into the native value. - // Must round-trip with Xtlv. - func Inative(tlv []byte) int64 - - - // Xdelta produces a TLV value that, once merged with - // the old TLV value using Xmerge, will produce the new - // native value using Xnative. Returns nil if none needed. - // This function we need to *save changes* from a native - // object/struct into RDX. - func Idelta(tlv []byte, new_val int64) (tlv_delta []byte) - - // Xmerge CRDT-merges several bare TLV values into the - // resulting one. For example, given two I records - // {3,8}15 and {4,1}44 will return {4,1}44 as version 4 is - // newer than version 3. - func Imerge(tlvs [][]byte) (tlv []byte) - - // Xdiff produces a TLV delta given a TLV value and a - // version vector of suspected changes (may skip this). - func Idiff(tlv []byte, vvdiff VV) (tlv []byte) -```` - -## Serialization format - -We use the [ToyTLV](../protocol/tlv.go) format for enveloping/nesting all data. -That is a bare-bones type-length-value format with zero -semantics. What we put into ToyTLV envelopes is integers, -strings, and floats. Strings are UTF-8, no surprises. Floats are -taken as raw bits and treated same as integers. id64 is stored -as a compressed pair of integers. - -A note on integer compression. From the fact that protobuf -has about ten integer types, one can guess that things can -be complicated here. We use [ZipInt](./zipint.go) routines to produce -efficient varints in a TLV format (differently from protobuf -which has a separate bit-level [LEB128][b] coding for ints). - - - ZipUint64 packs an integer skipping all leading zeroes - - ZipUint64Pair packs a pair of ints, each one taking 1,2,4 or - 8 bytes - - ZipZagInt64 packs a signed integer using the zig-zag coding - - ZipFloat64 packs a float (integers and binary fractions pack - well) - -id64 and logical timestamps get packed as pairs of uint64s. All -zip codings are little-endian. 
- -## Enveloping - -RDX values can be bare, enveloped or double-enveloped. We use -bare values when we already know what field of what object we -are dealing with and what RDT it belongs to. That might be the -case when we read a value from a key-value storage where the key -contains object id, field and RDT. In such a case, a bare -Integer is like `{3,2}1` or `32 03 02 02`. - -Within a network packet, that integer may need to be -single-enveloped: `I({3,2}1)` or `69 04 32 03 02 02` assuming -the other metadata is known from the context. - -A bare `ELM` or `NZ` value would only contain a sequence of -single-enveloped `FIRST` values. To make that single-enveloped -we only prepend a TLV header. - -In case we also have to convey the rest of the metadata, namely -the object id and the field, we have to use the double-enveloped -form. For a simple `map[string]string{"Key":"Value"}` that -looks like: `M({b0b-af0-3} S({0,0}"Key") S({0,0}"Value"))` or -`6D 15 36 03 00 af 00 0b 0b 73 04 30 4b 65 79 73 06 30 56 61 6c 75 65`. -For `FIRST` values, there is no need to use two nested TLV -records, so a double-enveloped Integer looks like: -`I({b0b-af0-7}{3,2}1)` - -Object/fields ids are serialized as tiny `ZipUint64Pair`s. -Revisions are serialized as tiny `ZipIntUint64Pair`s. 
- -[x]: https://en.wikipedia.org/wiki/Causal_consistency -[v]: https://en.wikipedia.org/wiki/Version_vector -[r]: https://www.educative.io/answers/how-are-vector-clocks-used-in-dynamo -[j]: https://en.wikipedia.org/wiki/RDX -[p]: https://en.wikipedia.org/wiki/Remote_procedure_call -[g]: https://protobuf.dev/programming-guides/encoding/ -[b]: https://en.wikipedia.org/wiki/LEB128 -[m]: https://en.wikipedia.org/wiki/Merge_sort diff --git a/rdx/_doc.go b/rdx/_doc.go new file mode 100644 index 0000000..305129c --- /dev/null +++ b/rdx/_doc.go @@ -0,0 +1,283 @@ +// # Replicated Data Interchange (RDX CRDT) library + +// Our goal here is to create a format and a library for data +// replication using state-of-the-art Replicated Data Types. +// Replicated Data interchange format ([RDX][j]) is like protobuf, +// but CRDT. Apart from [RPC][p] applications, one can use it for +// data storage, distributed and asynchronous data exchange and in +// other similar applications. RDX fully supports local-first, +// offline-first and peer-to-peer replication, with no central +// server required, as any two *replicas* can merge their data. By +// installing RDX data types as merge operators in an LSM database +// (leveldb, RocksDB, pebble, Cassandra, etc) one can effectively +// have a CRDT database (which [Chotki](./ARCHITECTURE.md) basically is). + +// We will implement *unified* CRDTs able to synchronize using +// operations, full states or deltas. Types may imply [causal +// consistency][x] of updates in matters of performance, but their +// correctness does not depend on that. RDX data types are fully +// commutative, associative and idempotent. Hence, immune to +// reordering or duplication of updates. + +// The default syncing protocol (not described here) generally +// relies on [version vectors][v]. Do not confuse that with [vector +// clocks][r] used by Amazon Dynamo and similar systems. While +// there are strong parallels, inner workings of VV and VC are not +// identical. 
+
+// ## Data types
+
+// Our objects can have fields of the following CRDT types. Each
+// type is named by a letter.
+
+// 1. last-write-wins variables (`I` for int64, `S` for string, `F`
+// is float64, and `R` is [id64](./id.go#12))
+// 2. counters, `N` increment-only uint64 and `Z` two-way int64
+// 3. maps (M), like key-value maps, where keys and values are `FIRST`
+// 4. sets (E), contain arbitrary `FIRST` elements
+// 5. arrays (L) of arbitrary `FIRST` elements
+// 6. version vectors (V)
+// 7. codegen
+
+// The format and the merge rules are as follows.
+
+// ### `FIRST` Float, Integer, Reference, String, Term
+
+// The last-write-wins register is the simplest data type to
+// implement. For each LWW field, we only need the latest "winner"
+// op containing the logical timestamp and the value per se. A
+// logical timestamp is a pair `{rev, src}` where `rev` is the
+// revision number and `src` is the id of the author. For example,
+// let's see how a bare (no TLV envelope) `I` int64 `-11` would
+// look like, assuming it is the 4th revision of the register
+// authored by replica #5. The TLV would look like: `32 08 05 15`
+// (hex) where `0x15` is a [zig-zag][g] encoded and zipped `-11`,
+// while `32 08 05` is a tiny [ToyTLV](../protocol/tlv.go) record for a zipped pair
+// of ints, 4 (signed, zig-zagged, so `08`) and 5 (unsigned, so
+// `05`). If we add a ToyTLV envelope, that becomes `69 04 32 08 05
+// 15` (type of record `I`, length 4, then the bare part).
+
+// String `S` values are simply UTF-8 strings. Int64 `I`, float64
+// `F` and id64 `R` values get compressed using [`zip_int`](./zipint.go)
+// routines. Overlong encodings are forbidden both for strings and
+// for zip-ints!
+
+// `T` ops have a timestamp, but no value. That is the equivalent
+// of a `nil` or `void` value. Those are used as placeholders in
+// various cases.
+
+// The string value for `FIRST` types is as follows:
+
+// 1. `F` the e-notation, JSON-like
+// 2. 
`I` signed integer notation,
+// 3. `R` 5-8-3 hex notation (e.g. `c187-3a62-12`)
+// 4. `S` double-quoted JSON-like, e.g. "Sarah O\'Connor"
+// 5. `T` null
+
+// Merge rules for LWW are straightforward:
+
+// 1. higher revision wins
+// 2. in case of a tie, higher value wins (like bytes.Compare())
+// 3. in case of a tie, who cares, but higher replica id wins
+
+// ### `NZ` Counters
+
+// `N` are increment-only counters. Their TLV state is a sequence
+// of `T` records containing zipped uint64 pairs {val,src}, the
+// counter value and source replica id. As the counter is inc-only,
+// we may use the value itself as a revision number. The merge
+// operator is per-replica `max`, as later versions are greater.
+// The native value is the sum of all replica values (sum of
+// contributions).
+
+// `Z` are two-way counters (inc/dec). Their TLV format is a
+// sequence of `I` records each having `{rev,src}` metadata as
+// described in the `FIRST` section. One record corresponds to one
+// source, per-source merge rules are same as LWW. The native value
+// is the sum of all `I` values.
+
+// ### `E` Eulerian
+
+// Generic sets containing any `FIRST` elements. The TLV format is
+// a sequence of enveloped FIRST records. It can contain records
+// with negative revision numbers. Those are tombstones (deleted
+// entries). For example, `I{4,5}-11` from the `FIRST` example
+// would go as `69 04 32 08 05 15`. Then, if replica #3 would want
+// to remove that entry, it will issue a tombstone op `I{-5,3}-11`
+// or `69 04 32 09 03 15`. Here, the version number changes from
+// `08` to `09` or 4 to -5, the author changes to 3.
+
+// Within a set, the ops are sorted in the *value order*. Namely,
+// if the type differs, they go in the alphabetical order (`F`,
+// `I`, `R`, `S`, `T`). If the type is the same, they go in the
+// ascending order, as per `strcmp` or `bytes.Compare`. 
That way, +// merging multiple versions of a set only requires one parallel +// pass of those, no additional allocation or sorting, very much +// like [mergesort][m] works. + +// The string value for a set is like `{1,2,3}` where `1,2,3` are +// `FIRST` elements of the set. + +// ### `M` Mapping + +// Generic maps, mapping any `FIRST` value to any other `FIRST` +// value. The TLV format is a sequence of enveloped key-value op +// pairs. Any update should also contain the affected key-value +// pairs. Deleted entries might have `T` values (the key is +// present, the value is null) or the key might have a negative +// revision (no such key present). + +// Pairs are sorted in the value-order of their keys. When merging +// two pairs having an identical value of their keys, both the key +// and the value ops are merged according to the LWW rules. As with +// `E` sets, this only requires one parallel pass of the versions. + +// The string value for a map is like `{4:null, "key":"value"}` + +// ### `L` Linear + +// Generic arrays store any `FIRST` elements. Internally, `L` are +// Causal Trees (also known as Replicated Growable Arrays, RGAs). +// The TLV format is a sequence of enveloped FIRST ops. The +// order of the sequence is a *weave*, i.e. ops go in the same +// order as they appear(ed) in the resulting array. Deleted ops +// change to tombstones, same as E. + +// The merging procedure follows the tree-traversal logic. Any +// change to an array must have a form of *subtrees*, each one +// arranged in the same weave order, each one prepended with a `T` +// op specifying its attachment point in the edited tree. + +// Deletions look like `T` ops with negative revision numbers. As +// an example, suppose we have an array authored by #3 `I{1,3}1 +// I{2,3}2 I{3,3}3` or `[1,2,3]` and replica #4 wants to delete the +// first entry. Then, it issues a patch `T{1,3}T{-4,4}` that merges +// to produce `I{1,3}1 T{-4,4} I{2,3}2 I{3,3}3` or `[2,3]`. 
+
+// The string value for an array is like `[1,2,3]`
+
+// ### `V` Version vector
+
+// [Version vector][v] is a way to track dataset versions in a
+// causally ordered system. It is a vector of `seq` numbers, where
+// each `seq` is the version of the state as seen by each
+// respective replica. Alternatively, that is a map `{src: seq}`,
+// where `src` is the replica `id`. It is assumed that we received
+// updates from replica `src` all the way up to `seq`.
+
+// Bare TLV for a version vector is a sequence of `V` records (yes,
+// `V` nested in `V`) each containing one id64 as a zipped seq-src
+// pair (see ZipUint64Pair). The sequence is sorted in the
+// ascending order of record bytes, like `bytes.Compare()`.
+
+// The merge algorithm for version vectors is simple: take the
+// maximum `seq` for each `src`. Note that `seq=0` is distinct from
+// having no record.
+
+// ## Data type implementation
+
+// To fully implement an RDT one has to implement these 10
+// functions. The function name starts with the type name letter,
+// here we imply `I` last-write-wins int64.
+
+// ````go
+// // Xvalid verifies validity of a bare TLV record.
+// // Any other function may assume the input is valid.
+// func Ivalid(tlv []byte) bool
+
+// // Xstring converts a TLV representation into a string.
+// func Istring(tlv []byte) (txt string)
+
+// // Xparse converts a string back into bare TLV.
+// // Must round-trip with Xstring.
+// func Iparse(txt string) (tlv []byte)
+
+// // Xtlv converts the native type into a TLV, zero metadata.
+// func Itlv(i int64) (tlv []byte)
+
+// // Xnative converts TLV into the native value.
+// // Must round-trip with Xtlv.
+// func Inative(tlv []byte) int64
+
+// // Xdelta produces a TLV value that, once merged with
+// // the old TLV value using Xmerge, will produce the new
+// // native value using Xnative. Returns nil if none needed.
+// // This function we need to *save changes* from a native
+// // object/struct into RDX. 
+// func Idelta(tlv []byte, new_val int64) (tlv_delta []byte) + +// // Xmerge CRDT-merges several bare TLV values into the +// // resulting one. For example, given two I records +// // {3,8}15 and {4,1}44 will return {4,1}44 as version 4 is +// // newer than version 3. +// func Imerge(tlvs [][]byte) (tlv []byte) + +// // Xdiff produces a TLV delta given a TLV value and a +// // version vector of suspected changes (may skip this). +// func Idiff(tlv []byte, vvdiff VV) (tlv []byte) +// ```` + +// ## Serialization format + +// We use the [ToyTLV](../protocol/tlv.go) format for enveloping/nesting all data. +// That is a bare-bones type-length-value format with zero +// semantics. What we put into ToyTLV envelopes is integers, +// strings, and floats. Strings are UTF-8, no surprises. Floats are +// taken as raw bits and treated same as integers. id64 is stored +// as a compressed pair of integers. + +// A note on integer compression. From the fact that protobuf +// has about ten integer types, one can guess that things can +// be complicated here. We use [ZipInt](./zipint.go) routines to produce +// efficient varints in a TLV format (differently from protobuf +// which has a separate bit-level [LEB128][b] coding for ints). + +// - ZipUint64 packs an integer skipping all leading zeroes +// - ZipUint64Pair packs a pair of ints, each one taking 1,2,4 or +// 8 bytes +// - ZipZagInt64 packs a signed integer using the zig-zag coding +// - ZipFloat64 packs a float (integers and binary fractions pack +// well) + +// id64 and logical timestamps get packed as pairs of uint64s. All +// zip codings are little-endian. + +// ## Enveloping + +// RDX values can be bare, enveloped or double-enveloped. We use +// bare values when we already know what field of what object we +// are dealing with and what RDT it belongs to. That might be the +// case when we read a value from a key-value storage where the key +// contains object id, field and RDT. 
In such a case, a bare +// Integer is like `{3,2}1` or `32 03 02 02`. + +// Within a network packet, that integer may need to be +// single-enveloped: `I({3,2}1)` or `69 04 32 03 02 02` assuming +// the other metadata is known from the context. + +// A bare `ELM` or `NZ` value would only contain a sequence of +// single-enveloped `FIRST` values. To make that single-enveloped +// we only prepend a TLV header. + +// In case we also have to convey the rest of the metadata, namely +// the object id and the field, we have to use the double-enveloped +// form. For a simple `map[string]string{"Key":"Value"}` that +// looks like: `M({b0b-af0-3} S({0,0}"Key") S({0,0}"Value"))` or +// `6D 15 36 03 00 af 00 0b 0b 73 04 30 4b 65 79 73 06 30 56 61 6c 75 65`. +// For `FIRST` values, there is no need to use two nested TLV +// records, so a double-enveloped Integer looks like: +// `I({b0b-af0-7}{3,2}1)` + +// Object/fields ids are serialized as tiny `ZipUint64Pair`s. +// Revisions are serialized as tiny `ZipIntUint64Pair`s. 
+ +// [x]: https://en.wikipedia.org/wiki/Causal_consistency +// [v]: https://en.wikipedia.org/wiki/Version_vector +// [r]: https://www.educative.io/answers/how-are-vector-clocks-used-in-dynamo +// [j]: https://en.wikipedia.org/wiki/RDX +// [p]: https://en.wikipedia.org/wiki/Remote_procedure_call +// [g]: https://protobuf.dev/programming-guides/encoding/ +// [b]: https://en.wikipedia.org/wiki/LEB128 +// [m]: https://en.wikipedia.org/wiki/Merge_sort + +package rdx diff --git a/rdx/rdt.go b/rdx/rdt.go deleted file mode 100644 index 93925fc..0000000 --- a/rdx/rdt.go +++ /dev/null @@ -1,14 +0,0 @@ -package rdx - -type RDT interface { - Merge(tlvs [][]byte) - State() (tlv []byte) - - String() string - ToString(txt string, src uint64) error - - Native() interface{} - ToNative(new_val interface{}, src uint64) (delta []byte) - - Diff(vvdiff VV) (diff []byte) -} diff --git a/repl/commands.go b/repl/commands.go index 6e7bd12..534691b 100644 --- a/repl/commands.go +++ b/repl/commands.go @@ -7,10 +7,10 @@ import ( "net/http" "os" "path/filepath" - "time" "github.com/cockroachdb/pebble" "github.com/drpcorg/chotki" + "github.com/drpcorg/chotki/classes" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" ) @@ -85,7 +85,6 @@ func (repl *REPL) CommandCreate(arg *rdx.RDX) (id rdx.ID, err error) { Name: name, Options: pebble.Options{ErrorIfExists: true}, }) - go repl.Host.KeepAliveLoop() if err == nil { id = repl.Host.Last() } @@ -110,7 +109,6 @@ func (repl *REPL) CommandOpen(arg *rdx.RDX) (rdx.ID, error) { if err != nil { return rdx.BadId, err } - go repl.Host.KeepAliveLoop() return repl.Host.Last(), nil } @@ -128,7 +126,6 @@ func (repl *REPL) CommandOpenDir(arg *rdx.RDX) (rdx.ID, error) { if err != nil { return rdx.BadId, err } - go repl.Host.KeepAliveLoop() return repl.Host.Last(), nil } @@ -263,7 +260,7 @@ func (repl *REPL) CommandNew(arg *rdx.RDX) (id rdx.ID, err error) { } pairs = pairs[2:] } - var fields chotki.Fields + var fields classes.Fields fields, err = 
repl.Host.ClassFields(tid) if err != nil { return @@ -479,227 +476,6 @@ func (repl *REPL) CommandConnect(arg *rdx.RDX) (id rdx.ID, err error) { return } -var HelpPing = errors.New("ping b0b-12-1 // S field id") - -func (repl *REPL) CommandPing(arg *rdx.RDX) (id rdx.ID, err error) { - if arg == nil || arg.RdxType != rdx.Reference { - return rdx.BadId, HelpPing - } - fid := rdx.IDFromText(arg.Text) - oid := fid.ZeroOff() - _, form, fact, e := repl.Host.ObjectFields(oid) - if e != nil { - return rdx.BadId, e - } - off := fid.Off() - if off == 0 || int(off) > len(form) { - return rdx.BadId, HelpPing - } - if form[off].RdxType != rdx.String { - return rdx.BadId, errors.New(form[off].Name + " is not a string") - } - fmt.Printf("pinging through %s (field %s, previously %s)\n", - fid.String(), form[off].Name, rdx.Snative(fact[off])) - id, err = repl.Host.SetFieldTLV(context.Background(), fid, protocol.Record('S', rdx.Stlv("ping"))) - return -} - -var HelpPinc = errors.New("pinc b0b-12-2") -var ErrBadField = errors.New("bad field") - -func KeepOddEven(oddeven uint64, cho *chotki.Chotki, fid rdx.ID) error { - rdt, tlv, err := cho.ObjectFieldTLV(fid) - if err != nil || rdt != rdx.Natural { - return ErrBadField - } - src := cho.Source() - mine := rdx.Nmine(tlv, src) - sum := rdx.Nnative(tlv) - if (sum & 1) != oddeven { - tlvs := protocol.Records{ - protocol.Record('F', rdx.ZipUint64(fid.Off())), - protocol.Record(rdx.Natural, protocol.Record(rdx.Term, rdx.ZipUint64Pair(mine+1, src))), - } - _, err = cho.CommitPacket(context.Background(), 'E', fid.ZeroOff(), tlvs) - } - return err -} - -func KeepOdd(cho *chotki.Chotki, fid rdx.ID) error { - return KeepOddEven(1, cho, fid) -} - -func KeepEven(cho *chotki.Chotki, fid rdx.ID) error { - return KeepOddEven(0, cho, fid) -} - -func (repl *REPL) CommandPinc(arg *rdx.RDX) (id rdx.ID, err error) { - id, err = rdx.BadId, HelpPinc - if arg == nil || arg.RdxType != rdx.Reference { - return - } - fid := rdx.IDFromText(arg.Text) - if 
fid.Off() == 0 { - return - } - err = KeepOdd(repl.Host, fid) - if err != nil { - return - } - repl.Host.AddHook(fid, KeepOdd) - id = fid - err = nil - return -} - -func (repl *REPL) CommandPonc(arg *rdx.RDX) (id rdx.ID, err error) { - id, err = rdx.BadId, HelpPinc - if arg == nil || arg.RdxType != rdx.Reference { - return - } - fid := rdx.IDFromText(arg.Text) - if fid.Off() == 0 { - return - } - err = KeepEven(repl.Host, fid) - if err != nil { - return - } - repl.Host.AddHook(fid, KeepEven) - id = fid - err = nil - return -} - -func (repl *REPL) CommandMute(arg *rdx.RDX) (id rdx.ID, err error) { - id, err = rdx.BadId, HelpPinc - if arg == nil || arg.RdxType != rdx.Reference { - return - } - fid := rdx.IDFromText(arg.Text) - if fid.Off() == 0 { - return - } - repl.Host.RemoveAllHooks(fid) - id = rdx.ID0 - err = nil - return -} - -var HelpTinc = errors.New("tinc b0b-12-2, tinc {fid:b0b-12-2,ms:1000,count:100}") - -func (repl *REPL) doTinc(fid rdx.ID, delay time.Duration, count int64) { - var err error - for ; count > 0 && err == nil; count-- { - _, err = repl.Host.IncNField(context.Background(), fid) - if delay > time.Duration(0) { - time.Sleep(delay) - } - } -} - -// testing: read-inc loop -func (repl *REPL) CommandTinc(arg *rdx.RDX) (id rdx.ID, err error) { - id, err = rdx.BadId, HelpTinc - count := int64(1) - delay := time.Second - if arg == nil { - return - } else if arg.RdxType == rdx.Reference { - id = rdx.IDFromText(arg.Text) - } else if arg.RdxType == rdx.Mapping { - for i := 0; i+1 < len(arg.Nested); i += 2 { - key := &arg.Nested[i] - val := &arg.Nested[i+1] - switch key.String() { - case "fid": - id = rdx.IDFromText(val.Text) - case "ms": - ms := rdx.Inative(rdx.Iparse(val.String())) - delay = time.Millisecond * time.Duration(ms) - case "count": - count = rdx.Inative(rdx.Iparse(val.String())) - default: - return - } - } - } else { - return - } - err = nil - go repl.doTinc(id, delay, count) - return -} - -var HelpSinc = errors.New("sinc b0b-12-2, tinc 
{fid:b0b-12-2,ms:1000,count:100}") - -func (repl *REPL) doSinc(fid rdx.ID, delay time.Duration, count int64, mine uint64) { - var err error - start := time.Now() - fro := repl.Host.Last() - src := fro.Src() - til := rdx.ID0 - for c := count; c > 0 && err == nil; c-- { - mine++ - tlvs := protocol.Records{ - protocol.Record('F', rdx.ZipUint64(fid.Off())), - protocol.Record(rdx.Natural, protocol.Record(rdx.Term, rdx.ZipUint64Pair(mine, src))), - } - til, err = repl.Host.CommitPacket(context.Background(), 'E', fid.ZeroOff(), tlvs) - if delay > time.Duration(0) { - time.Sleep(delay) - } - } - if err != nil { - fmt.Println(err.Error()) - } - timer := time.Since(start) - _, _ = fmt.Fprintf(os.Stdout, "inc storm: %d incs complete for %s, elapsed %s, %s..%s\n", - count, fid.String(), timer.String(), fro.String(), til.String()) -} - -func (repl *REPL) CommandSinc(arg *rdx.RDX) (id rdx.ID, err error) { - id, err = rdx.BadId, HelpSinc - count := int64(1) - delay := time.Second - if arg == nil { - return - } else if arg.RdxType == rdx.Reference { - id = rdx.IDFromText(arg.Text) - } else if arg.RdxType == rdx.Mapping { - for i := 0; i+1 < len(arg.Nested); i += 2 { - key := &arg.Nested[i] - val := &arg.Nested[i+1] - switch key.String() { - case "fid": - id = rdx.IDFromText(val.Text) - if id.Off() == 0 { - return rdx.BadId, chotki.ErrWrongFieldType - } - case "ms": - ms := rdx.Inative(rdx.Iparse(val.String())) - delay = time.Millisecond * time.Duration(ms) - case "count": - count = rdx.Inative(rdx.Iparse(val.String())) - default: - return - } - } - } else { - return - } - - rdt, tlv, err := repl.Host.ObjectFieldTLV(id) - if err != nil || rdt != rdx.Natural { - return rdx.BadId, chotki.ErrWrongFieldType - } - src := repl.Host.Source() - mine := rdx.Nmine(tlv, src) - - go repl.doSinc(id, delay, count, mine) - return -} - var HelpTell = errors.New("tell b0b-12-1") func (repl *REPL) CommandTell(arg *rdx.RDX) (id rdx.ID, err error) { @@ -746,90 +522,6 @@ func (repl *REPL) 
CommandName(arg *rdx.RDX) (id rdx.ID, err error) { return } -func (repl *REPL) CommandValid(arg *rdx.RDX) (id rdx.ID, err error) { - it := repl.Host.Database().NewIter(&pebble.IterOptions{ - LowerBound: []byte{'O'}, - UpperBound: []byte{'P'}, - }) - key := chotki.OKey(rdx.ID0, 'A') - for it.SeekGE(key); it.Valid(); it.Next() { - id, rdt := chotki.OKeyIdRdt(it.Key()) - val, e := it.ValueAndErr() - if e != nil { - _, _ = fmt.Fprintf(os.Stderr, "record read fail %s\n", e.Error()) - continue - } - if !rdx.Xvalid(rdt, val) { - fmt.Printf("%c record is not valid at %s\n", rdt, id.String()) - } - } - _ = it.Close() - return rdx.ID0, nil -} - -var HelpWhoSaw = errors.New("whosaw b0b-2}") - -func (repl *REPL) CommandWhoSaw(arg *rdx.RDX) (id rdx.ID, err error) { - if arg == nil || arg.RdxType != rdx.Reference { - return rdx.BadId, HelpWhoSaw - } - var vv rdx.VV - vv, err = repl.Host.VersionVector() - if err != nil { - return - } - idq := rdx.IDFromText(arg.Text) - for src := range vv { - oid := rdx.IDfromSrcPro(src, chotki.YAckOff) - var tlv []byte - tlv, err = repl.Host.ObjectRDTFieldTLV(oid, 'V') - if err != nil { - continue - } - revv := make(rdx.VV) - _ = revv.PutTLV(tlv) - pro := revv[idq.Src()] - if pro >= idq.Pro() { - fmt.Println(rdx.IDfromSrcPro(src, 0).String()) - } - } - return rdx.ID0, nil -} - -var HelpCompile = errors.New("choc ClassName, choc b0b-2, choc {ClassName:b0b-2}") - -func (repl *REPL) CommandCompile(arg *rdx.RDX) (id rdx.ID, err error) { - id = rdx.ID0 - var code string - orm := repl.Host.ObjectMapper() - if arg == nil { - return rdx.BadId, HelpCompile - } else if arg.RdxType == rdx.Reference { - id = rdx.IDFromText(arg.Text) - code, err = orm.Compile("SomeClass", id) - //repl.Host.CompileClass("SomeClass", id) - } else if arg.RdxType == rdx.Mapping { - m := arg.Nested - for i := 0; i+1 < len(m) && err == nil; i += 2 { - name := &m[i] - cidx := &m[i+1] - if name.RdxType != rdx.Term || cidx.RdxType != rdx.Reference { - return rdx.BadId, HelpCompile - 
} - cid := rdx.IDFromText(cidx.Text) - c := "" - c, err = orm.Compile(string(name.Text), cid) - code = code + c - } - } else { - return - } - if err == nil { - fmt.Println(code) - } - return -} - func (repl *REPL) CommandSwagger(arg *rdx.RDX) (id rdx.ID, err error) { mux := http.NewServeMux() fs := http.FileServer(http.Dir("./swagger")) diff --git a/repl/repl.go b/repl/repl.go index eae5f00..9cf5d89 100644 --- a/repl/repl.go +++ b/repl/repl.go @@ -162,8 +162,6 @@ func (repl *REPL) REPL(line string) (id rdx.ID, err error) { id, err = repl.CommandInc(arg) case "add": id, err = repl.CommandAdd(arg) - case "choc": - id, err = repl.CommandCompile(arg) // ----- networking ----- case "listen": id, err = repl.CommandListen(arg) @@ -174,25 +172,6 @@ func (repl *REPL) REPL(line string) (id rdx.ID, err error) { id, err = repl.CommandDump(arg) case "tell": id, err = repl.CommandTell(arg) - case "ping": - id, err = repl.CommandPing(arg) - case "pong": - // args[1] is an object/field - // subscribe - case "pinc": - id, err = repl.CommandPinc(arg) - case "ponc": - id, err = repl.CommandPonc(arg) - case "mute": - id, err = repl.CommandMute(arg) - case "tinc": - id, err = repl.CommandTinc(arg) - case "sinc": - id, err = repl.CommandSinc(arg) - case "valid": - id, err = repl.CommandValid(arg) - case "whosaw": - id, err = repl.CommandWhoSaw(arg) default: _, _ = fmt.Fprintf(os.Stderr, "command unknown: %s\n", cmd) } diff --git a/replication/doc.go b/replication/doc.go new file mode 100644 index 0000000..6236124 --- /dev/null +++ b/replication/doc.go @@ -0,0 +1,159 @@ +/* +Package replication implements the Chotki distributed synchronization protocol. + +# Protocol Overview + +The Chotki replication protocol enables peer-to-peer synchronization of CRDT-based +databases between replicas without a master server or consensus algorithm. The protocol +maintains causal consistency and supports efficient differential synchronization. 
+ +# Synchronization States + +The protocol operates through a finite state machine with the following states: + + SendHandshake → SendDiff → SendLive → SendEOF → SendNone + ↓ ↕ + SendPing ← → SendPong + +State descriptions: + - SendHandshake: Initial connection setup and capability negotiation + - SendDiff: Bulk synchronization of historical data differences + - SendLive: Real-time streaming of new changes as they occur + - SendPing/SendPong: Keep-alive mechanism during live sync + - SendEOF: Graceful connection termination + - SendNone: Connection closed state + +# Synchronization Modes + +The protocol supports different synchronization modes via bitmask flags: + + SyncRead (1): Can receive data from peer + SyncWrite (2): Can send data to peer + SyncLive (4): Supports real-time live synchronization + +Common combinations: + - SyncRW = SyncRead | SyncWrite (bidirectional batch sync) + - SyncRL = SyncRead | SyncLive (read + live updates) + - SyncRWLive = SyncRead | SyncWrite | SyncLive (full bidirectional) + +# Protocol Flow + +## 1. Handshake Phase + +The synchronization begins with a handshake message: + + H(T{snapshot_id} M{mode} V{version_vector} S{trace_id}) + +Where: + - H: Handshake packet type + - T: Snapshot timestamp ID (current replica state) + - M: Sync mode bitmask (read/write/live capabilities) + - V: Version vector (TLV-encoded replica states) + - S: Session trace ID (for logging and debugging) + +The handshake establishes: + - Peer capabilities and sync mode + - Version vectors for differential calculation + - Session trace IDs for request tracking + +## 2. 
Diffsync Phase
+
+After handshake, the protocol sends block-based diffs:
+
+    D(T{snapshot_id} R{block_range} F{offset}+)
+
+Where:
+ - D: Diff packet type
+ - T: Reference snapshot ID
+ - R: Block range being synchronized
+ - F: Frame offset within block followed by operation data
+
+The diff phase:
+ - Compares version vectors to identify missing data
+ - Sends data in blocks with maximum size of 100MB (MaxParcelSize)
+ - Uses CRDT-specific diff algorithms (rdx.Xdiff) for efficiency
+ - Processes blocks sequentially until all differences are sent
+
+## 3. Version Vector Exchange
+
+After diff completion, version vectors are exchanged:
+
+    V(R{block_range} version_vector_data)
+
+This finalizes the differential sync by confirming the new state.
+
+## 4. Live Sync Phase (Optional)
+
+If SyncLive mode is enabled, the protocol enters real-time sync:
+ - Streams new operations as they occur via the outbound queue
+ - Maintains connection with periodic ping/pong messages
+ - Ping interval and timeout are configurable per connection
+
+Ping/Pong messages:
+
+    P("ping") - Keep-alive ping
+    P("pong") - Keep-alive response
+
+## 5. Termination
+
+Connections end with a bye message containing the reason:
+
+    B(T{final_snapshot_id} reason_text)
+
+# Versioning
+
+A key component of the protocol is the version vector. Each time we change data on any replica, this change
+is stamped by an rdx.ID (autoincrementing sequence number + source replica id). This rdx.ID will then help us to
+determine which things we need to sync.
+
+The version vector is maintained in VKey0, so every time we see a change from our own replica or from another replica,
+we update VKey0.
+The version vector (VV) layout is basically a map: src_id -> last seen rdx.ID. So, we always know which latest event we saw from each replica.
+
+There is a special src_id = 0, which is a bunch of objects created in Log0, so we could have some common system objects
+on each replica. 
So when those objects are updated, we also store them like: 0 -> last seen rdx.ID in the VV.
+
+# Handshake
+This is the first thing that happens when we connect to a new replica. We init the sync session.
+But most importantly we send our VV (we also send a trace_id for logging) and we also create a pebble snapshot that
+we will use during diffsync. When we have just connected, we start to accumulate live events, until we have completed diffsync.
+We will apply them later; while we do not guarantee that the created snapshot does not contain any live events from the queue,
+we do not care, as all operations in chotki are idempotent.
+
+# Diff sync
+As described above, when replicas first connect, after the handshake they enter the diff sync state. The idea is that
+before we start live sync, we need to equalize the state between the 2 snapshots we made in the previous step.
+In diff sync we sync data using blocks. A block is basically a range of rdx.ID, so when we update our VV,
+we then take this rdx.ID and apply SyncBlockMask to it (basically cutting the first N bits from it) and consider it a block.
+Then for each block we will also maintain a VV for all updates associated with this block, meaning that if a DB update occurs,
+we update 2 VVs:
+- global VV (VKey0)
+- block VV (VKey(block_id))
+
+The algorithm of diff sync is as follows:
+
+1. We take a block and look at its VV, then we look at the other replica's global VV
+2. If the other replica has not seen something from this block, we start syncing this block
+3. We start iterating all objects inside this block (with rdx.IDs that conform to this block range)
+4. If the other replica has not seen this object at all (meaning its VV for the src of this object is non-existent or smaller than this object's rdx.ID), then we just send this object
+5. Otherwise, we use XDiff, however in a majority of cases it will just send the whole object anyway for simplicity. It may change in the future.
+6. 
When we have synced all blocks, we then exchange the VVs of the blocks we synced, so the other replica can update them.
+7. Each replica accumulates updates in a pebble.Batch, and when it receives the V packet in the end, it applies the batch to the DB.
+
+An important note: during diff sync we also broadcast all 'D' and 'H' packets to all other replicas.
+Imagine there are 3 replicas: A <-> B <-> C.
+- Let's say A, B are live syncing and C has just connected to B.
+- If we do not broadcast 'D' packets, then B and C will sync, however if there is something in C that A hasn't seen, it will not be synced until the diff sync between A and B.
+- But if we broadcast 'D' and 'H' packets, we basically open a syncing session between all upstream replicas and C, so
+they will sync all the data from C
+
+Unfortunately, it can create a lot of excess traffic, especially when we roll out a new version and there are a lot of diffsyncs happening.
+But it simplified the protocol a lot.
+
+# Live sync
+
+After diff sync, we can now process all updates that were accumulated in the queue and continue processing them as we go.
+When we receive a bunch of records during live sync, we apply them to the DB immediately and also broadcast them to all other replicas.
+Due to the fact that currently chotki only allows tree replication structures, we know that we can safely send events to all connected replicas without fear of loops. 
+*/ +package replication diff --git a/op.go b/replication/op.go similarity index 86% rename from op.go rename to replication/op.go index aaf3d46..07dab50 100644 --- a/op.go +++ b/replication/op.go @@ -1,6 +1,7 @@ -package chotki +package replication import ( + "github.com/drpcorg/chotki/chotki_errors" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" ) @@ -40,12 +41,12 @@ func ParseHandshake(body []byte) (mode SyncMode, vv rdx.VV, trace_id []byte, err rest := body mbody, rest = protocol.Take('M', rest) if mbody == nil { - return 0, nil, nil, ErrBadHPacket + return 0, nil, nil, chotki_errors.ErrBadHPacket } vbody, rest = protocol.Take('V', rest) if vbody == nil { - return 0, nil, nil, ErrBadHPacket + return 0, nil, nil, chotki_errors.ErrBadHPacket } vv = make(rdx.VV) @@ -59,7 +60,7 @@ func ParseHandshake(body []byte) (mode SyncMode, vv rdx.VV, trace_id []byte, err trace_id, _ = protocol.Take('S', rest) if trace_id == nil { - return 0, nil, nil, ErrBadHPacket + return 0, nil, nil, chotki_errors.ErrBadHPacket } return mode, vv, trace_id, nil } diff --git a/sync.go b/replication/sync.go similarity index 86% rename from sync.go rename to replication/sync.go index 7d6252d..039bd6c 100644 --- a/sync.go +++ b/replication/sync.go @@ -1,4 +1,4 @@ -package chotki +package replication import ( "bytes" @@ -14,6 +14,8 @@ import ( "time" "github.com/cockroachdb/pebble" + "github.com/drpcorg/chotki/chotki_errors" + "github.com/drpcorg/chotki/host" "github.com/drpcorg/chotki/protocol" "github.com/drpcorg/chotki/rdx" "github.com/drpcorg/chotki/utils" @@ -21,8 +23,6 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -const SyncBlockBits = 28 -const SyncBlockMask = uint64((1 << SyncBlockBits) - 1) const MaxParcelSize = 100_000_000 var version string = fmt.Sprintf("%d", time.Now().Unix()) @@ -115,13 +115,13 @@ type Syncer struct { PingWait time.Duration WaitUntilNone time.Duration - log utils.Logger + Log utils.Logger vvit, ffit *pebble.Iterator snap 
pebble.Reader snaplast rdx.ID feedState SyncState drainState SyncState - oqueue protocol.FeedCloser + Oqueue protocol.FeedCloser hostvv, peervv rdx.VV vpack []byte @@ -140,7 +140,7 @@ func (sync *Syncer) withDefaultArgs(reset bool) context.Context { lctx := sync.lctx.Load() if lctx == nil || reset { - nlctx := sync.log.WithDefaultArgs(context.Background(), "name", sync.Name, "trace_id", sync.GetTraceId()) + nlctx := sync.Log.WithDefaultArgs(context.Background(), "name", sync.Name, "trace_id", sync.GetTraceId()) if !reset { sync.lctx.CompareAndSwap(lctx, &nlctx) } else { @@ -151,8 +151,8 @@ func (sync *Syncer) withDefaultArgs(reset bool) context.Context { return *lctx } -func (sync *Syncer) logCtx(ctx context.Context) context.Context { - return sync.log.WithArgsFromCtx(ctx, sync.withDefaultArgs(false)) +func (sync *Syncer) LogCtx(ctx context.Context) context.Context { + return sync.Log.WithArgsFromCtx(ctx, sync.withDefaultArgs(false)) } func (sync *Syncer) Close() error { @@ -167,7 +167,7 @@ func (sync *Syncer) Close() error { if sync.snap != nil { if err := sync.snap.Close(); err != nil { - sync.log.ErrorCtx(sync.logCtx(context.Background()), "failed closing snapshot", "err", err.Error()) + sync.Log.ErrorCtx(sync.LogCtx(context.Background()), "failed closing snapshot", "err", err.Error()) } else { OpenedSnapshots.WithLabelValues(sync.Name, version).Set(0) } @@ -179,7 +179,7 @@ func (sync *Syncer) Close() error { if sync.ffit != nil { if err := sync.ffit.Close(); err != nil { closediterators = false - sync.log.ErrorCtx(sync.logCtx(context.Background()), "failed closing ffit", "err", err) + sync.Log.ErrorCtx(sync.LogCtx(context.Background()), "failed closing ffit", "err", err) } sync.ffit = nil } @@ -187,7 +187,7 @@ func (sync *Syncer) Close() error { if sync.vvit != nil { if err := sync.vvit.Close(); err != nil { closediterators = false - sync.log.ErrorCtx(sync.logCtx(context.Background()), "failed closing vvit", "err", err) + 
sync.Log.ErrorCtx(sync.LogCtx(context.Background()), "failed closing vvit", "err", err) } sync.vvit = nil } @@ -195,7 +195,7 @@ func (sync *Syncer) Close() error { OpenedIterators.WithLabelValues(sync.Name, version).Set(0) } - sync.log.InfoCtx(sync.logCtx(context.Background()), fmt.Sprintf("sync: connection %s closed: %v\n", sync.Name, sync.reason)) + sync.Log.InfoCtx(sync.LogCtx(context.Background()), fmt.Sprintf("sync: connection %s closed: %v\n", sync.Name, sync.reason)) return nil } @@ -239,7 +239,7 @@ func (sync *Syncer) Feed(ctx context.Context) (recs protocol.Records, err error) defer cancel() select { case <-time.After(sync.PingWait): - sync.log.ErrorCtx(sync.logCtx(ctx), "sync: handshake took too long") + sync.Log.ErrorCtx(sync.LogCtx(ctx), "sync: handshake took too long") sync.SetFeedState(ctx, SendEOF) return case <-sync.WaitDrainState(ctx, SendDiff): @@ -258,7 +258,7 @@ func (sync *Syncer) Feed(ctx context.Context) (recs protocol.Records, err error) if sync.snap != nil { err = sync.snap.Close() if err != nil { - sync.log.ErrorCtx(sync.logCtx(ctx), "sync: failed closing snapshot", "err", err) + sync.Log.ErrorCtx(sync.LogCtx(ctx), "sync: failed closing snapshot", "err", err) } else { OpenedSnapshots.WithLabelValues(sync.Name, version).Set(0) } @@ -274,7 +274,7 @@ func (sync *Syncer) Feed(ctx context.Context) (recs protocol.Records, err error) sync.pingTimer.Stop() sync.pingTimer = time.AfterFunc(sync.PingWait, func() { sync.pingStage.Store(int32(PingBroken)) - sync.log.ErrorCtx(sync.logCtx(ctx), "sync: peer did not respond to ping") + sync.Log.ErrorCtx(sync.LogCtx(ctx), "sync: peer did not respond to ping") }) sync.pingStage.Store(int32(WaitingForPing)) case SendPong: @@ -284,9 +284,9 @@ func (sync *Syncer) Feed(ctx context.Context) (recs protocol.Records, err error) sync.pingStage.Store(int32(Inactive)) sync.SetFeedState(ctx, SendLive) case SendLive: - recs, err = sync.oqueue.Feed(ctx) + recs, err = sync.Oqueue.Feed(ctx) if err == utils.ErrClosed { - 
sync.log.InfoCtx(sync.logCtx(ctx), "sync: queue closed") + sync.Log.InfoCtx(sync.LogCtx(ctx), "sync: queue closed") sync.SetFeedState(ctx, SendEOF) err = nil } @@ -304,7 +304,7 @@ func (sync *Syncer) Feed(ctx context.Context) (recs protocol.Records, err error) if sync.snap != nil { err = sync.snap.Close() if err != nil { - sync.log.ErrorCtx(sync.logCtx(ctx), "sync: failed closing snapshot", "error", err.Error()) + sync.Log.ErrorCtx(sync.LogCtx(ctx), "sync: failed closing snapshot", "error", err.Error()) } else { OpenedSnapshots.WithLabelValues(sync.Name, version).Set(0) } @@ -343,8 +343,8 @@ func (sync *Syncer) FeedHandshake() (vv protocol.Records, err error) { OpenedIterators.WithLabelValues(sync.Name, version).Set(1) - ok := sync.vvit.SeekGE(VKey0) - if !ok || 0 != bytes.Compare(sync.vvit.Key(), VKey0) { + ok := sync.vvit.SeekGE(host.VKey0) + if !ok || 0 != bytes.Compare(sync.vvit.Key(), host.VKey0) { return nil, rdx.ErrBadV0Record } sync.hostvv = make(rdx.VV) @@ -402,10 +402,10 @@ func (sync *Syncer) getVVChanges() (hasChanges bool, sendvv rdx.VV, err error) { func (sync *Syncer) nextBlockDiff() (bool, rdx.VV, error) { if sync.ffit != nil { - block := VKeyId(sync.vvit.Key()).ZeroOff() - till := block.ProPlus(SyncBlockMask + 1) + block := host.VKeyId(sync.vvit.Key()).ZeroOff() + till := block.ProPlus(host.SyncBlockMask + 1) if sync.ffit.Valid() { - id, _ := OKeyIdRdt(sync.ffit.Key()) + id, _ := host.OKeyIdRdt(sync.ffit.Key()) if id != rdx.BadId && id.Less(till) { _, sendvv, err := sync.getVVChanges() if err != nil { @@ -426,8 +426,8 @@ func (sync *Syncer) nextBlockDiff() (bool, rdx.VV, error) { return false, nil, nil } - block := VKeyId(sync.vvit.Key()).ZeroOff() - key := OKey(block, 0) + block := host.VKeyId(sync.vvit.Key()).ZeroOff() + key := host.OKey(block, 0) sync.ffit.SeekGE(key) return true, sendvv, nil } @@ -441,13 +441,13 @@ func (sync *Syncer) FeedBlockDiff(ctx context.Context) (diff protocol.Records, e return protocol.Records{}, nil } - block := 
VKeyId(sync.vvit.Key()).ZeroOff() + block := host.VKeyId(sync.vvit.Key()).ZeroOff() bmark, parcel := protocol.OpenHeader(nil, 'D') parcel = append(parcel, protocol.Record('T', sync.snaplast.ZipBytes())...) parcel = append(parcel, protocol.Record('R', block.ZipBytes())...) - till := block.ProPlus(SyncBlockMask + 1) + till := block.ProPlus(host.SyncBlockMask + 1) for ; sync.ffit.Valid(); sync.ffit.Next() { - id, rdt := OKeyIdRdt(sync.ffit.Key()) + id, rdt := host.OKeyIdRdt(sync.ffit.Key()) if id == rdx.BadId || till.Less(id) { break } @@ -460,7 +460,7 @@ func (sync *Syncer) FeedBlockDiff(ctx context.Context) (diff protocol.Records, e val := sync.ffit.Value() parcel = append(parcel, protocol.Record(rdt, val)...) if len(val) > MaxParcelSize { - sync.log.WarnCtx(sync.logCtx(ctx), "too big key size", "size", len(val)) + sync.Log.WarnCtx(sync.LogCtx(ctx), "too big key size", "size", len(val)) } continue } @@ -469,7 +469,7 @@ func (sync *Syncer) FeedBlockDiff(ctx context.Context) (diff protocol.Records, e parcel = append(parcel, protocol.Record('F', rdx.ZipUint64(uint64(id.Pro()-block.Pro())))...) parcel = append(parcel, protocol.Record(rdt, diff)...) 
if len(diff) > MaxParcelSize { - sync.log.WarnCtx(sync.logCtx(ctx), "too big diff size", "size", len(diff)) + sync.Log.WarnCtx(sync.LogCtx(ctx), "too big diff size", "size", len(diff)) } } } @@ -490,7 +490,7 @@ func (sync *Syncer) FeedDiffVV(ctx context.Context) (vv protocol.Records, err er err = sync.ffit.Close() if err != nil { closediterators = false - sync.log.ErrorCtx(sync.logCtx(ctx), "failed closing ffit", "err", err) + sync.Log.ErrorCtx(sync.LogCtx(ctx), "failed closing ffit", "err", err) } sync.ffit = nil } @@ -498,7 +498,7 @@ func (sync *Syncer) FeedDiffVV(ctx context.Context) (vv protocol.Records, err er err = sync.vvit.Close() if err != nil { closediterators = false - sync.log.ErrorCtx(sync.logCtx(ctx), "failed closing vvit", "err", err) + sync.Log.ErrorCtx(sync.LogCtx(ctx), "failed closing vvit", "err", err) } sync.vvit = nil } @@ -510,14 +510,14 @@ func (sync *Syncer) FeedDiffVV(ctx context.Context) (vv protocol.Records, err er func (sync *Syncer) SetFeedState(ctx context.Context, state SyncState) { SessionsStates.WithLabelValues(sync.Name, "feed", version).Set(float64(state)) - sync.log.InfoCtx(sync.logCtx(ctx), "sync: feed state", "state", state.String()) + sync.Log.InfoCtx(sync.LogCtx(ctx), "sync: feed state", "state", state.String()) sync.lock.Lock() sync.feedState = state sync.lock.Unlock() } func (sync *Syncer) SetDrainState(ctx context.Context, state SyncState) { - sync.log.InfoCtx(sync.logCtx(ctx), "sync: drain state", "state", state.String()) + sync.Log.InfoCtx(sync.LogCtx(ctx), "sync: drain state", "state", state.String()) SessionsStates.WithLabelValues(sync.Name, "drain", version).Set(float64(state)) sync.lock.Lock() sync.drainState = state @@ -589,11 +589,11 @@ func (sync *Syncer) processPings(recs protocol.Records) protocol.Records { } switch rdx.Snative(body) { case PingVal: - sync.log.InfoCtx(sync.logCtx(context.Background()), "ping received") + sync.Log.InfoCtx(sync.LogCtx(context.Background()), "ping received") // go to pong state 
next time sync.pingStage.Store(int32(Pong)) case PongVal: - sync.log.InfoCtx(sync.logCtx(context.Background()), "pong received") + sync.Log.InfoCtx(sync.LogCtx(context.Background()), "pong received") } } } @@ -611,16 +611,16 @@ func (sync *Syncer) Drain(ctx context.Context, recs protocol.Records) (err error switch sync.GetDrainState() { case SendHandshake: if len(recs) == 0 { - return ErrBadHPacket + return chotki_errors.ErrBadHPacket } err = sync.DrainHandshake(recs[0:1]) if err == nil { - err = sync.Host.Drain(sync.logCtx(ctx), recs[0:1]) + err = sync.Host.Drain(sync.LogCtx(ctx), recs[0:1]) } if err != nil { return } - sync.Host.Broadcast(sync.logCtx(ctx), recs[0:1], sync.Name) + sync.Host.Broadcast(sync.LogCtx(ctx), recs[0:1], sync.Name) recs = recs[1:] sync.SetDrainState(ctx, SendDiff) if len(recs) == 0 { @@ -642,9 +642,9 @@ func (sync *Syncer) Drain(ctx context.Context, recs protocol.Records) (err error if sync.Mode&SyncLive != 0 { sync.resetPingTimer() } - err = sync.Host.Drain(sync.logCtx(ctx), recs) + err = sync.Host.Drain(sync.LogCtx(ctx), recs) if err == nil && broadcast { - sync.Host.Broadcast(sync.logCtx(ctx), recs, sync.Name) + sync.Host.Broadcast(sync.LogCtx(ctx), recs, sync.Name) } case SendLive: @@ -653,23 +653,23 @@ func (sync *Syncer) Drain(ctx context.Context, recs protocol.Records) (err error if lit == 'B' { sync.SetDrainState(ctx, SendNone) } - err = sync.Host.Drain(sync.logCtx(ctx), recs) + err = sync.Host.Drain(sync.LogCtx(ctx), recs) if err == nil && lit != 'B' { - sync.Host.Broadcast(sync.logCtx(ctx), recs, sync.Name) + sync.Host.Broadcast(sync.LogCtx(ctx), recs, sync.Name) } case SendPong, SendPing: panic("chotki: unacceptable sync-state") case SendEOF, SendNone: - return ErrClosed + return chotki_errors.ErrClosed default: panic("chotki: unacceptable sync-state") } if err != nil { // todo send the error msg - sync.log.ErrorCtx(sync.logCtx(ctx), "error happened while drain", "error", err.Error()) + sync.Log.ErrorCtx(sync.LogCtx(ctx), "error 
happened while drain", "error", err.Error()) sync.SetDrainState(ctx, SendEOF) } @@ -697,7 +697,7 @@ func (sync *Syncer) GetTraceId() string { func (sync *Syncer) DrainHandshake(recs protocol.Records) (err error) { lit, _, _, body, e := ParsePacket(recs[0]) if lit != 'H' || e != nil { - return ErrBadHPacket + return chotki_errors.ErrBadHPacket } var mode SyncMode var trace_id []byte @@ -705,7 +705,7 @@ func (sync *Syncer) DrainHandshake(recs protocol.Records) (err error) { sync.lock.Lock() if trace_id != nil { if len(trace_id) != TraceSize { - err = ErrBadHPacket + err = chotki_errors.ErrBadHPacket } else { traceId := [TraceSize]byte(trace_id) sync.theirsTraceid.Store(&traceId) diff --git a/test_utils/sync.go b/test_utils/sync.go new file mode 100644 index 0000000..61f21c6 --- /dev/null +++ b/test_utils/sync.go @@ -0,0 +1,43 @@ +package testutils + +import ( + "log/slog" + "time" + + "github.com/drpcorg/chotki/host" + "github.com/drpcorg/chotki/protocol" + "github.com/drpcorg/chotki/replication" + "github.com/drpcorg/chotki/utils" +) + +func SyncData(a, b host.Host) error { + synca := replication.Syncer{ + Host: a, + Mode: replication.SyncRW, + Name: "a", + WaitUntilNone: time.Millisecond, + Src: a.Source(), + Log: utils.NewDefaultLogger(slog.LevelError), + PingWait: time.Second, + } + syncb := replication.Syncer{ + Host: b, + Mode: replication.SyncRW, + WaitUntilNone: time.Millisecond, + Name: "b", + Src: b.Source(), + Log: utils.NewDefaultLogger(slog.LevelError), + PingWait: time.Second, + } + defer syncb.Close() + defer synca.Close() + // send handshake from b to a + err := protocol.Relay(&syncb, &synca) + if err != nil { + return err + } + go protocol.Pump(&syncb, &synca) + // send data a -> b + return protocol.Pump(&synca, &syncb) + +}