From 1ca166ff48d885fb3150ef88ebe83b122c885f27 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Nov 2023 08:19:15 +0000 Subject: [PATCH] Bump github.com/nats-io/nats-server/v2 from 2.10.4 to 2.10.5 Bumps [github.com/nats-io/nats-server/v2](https://github.com/nats-io/nats-server) from 2.10.4 to 2.10.5. - [Release notes](https://github.com/nats-io/nats-server/releases) - [Changelog](https://github.com/nats-io/nats-server/blob/main/.goreleaser.yml) - [Commits](https://github.com/nats-io/nats-server/compare/v2.10.4...v2.10.5) --- updated-dependencies: - dependency-name: github.com/nats-io/nats-server/v2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- go.mod | 6 +- go.sum | 11 +- .../nats-io/jwt/v2/operator_claims.go | 6 +- .../nats-io/nats-server/v2/server/auth.go | 2 +- .../nats-io/nats-server/v2/server/client.go | 47 +- .../nats-io/nats-server/v2/server/const.go | 2 +- .../nats-io/nats-server/v2/server/consumer.go | 5 +- .../nats-io/nats-server/v2/server/events.go | 20 +- .../nats-server/v2/server/filestore.go | 520 +++++++++++------- .../nats-server/v2/server/jetstream.go | 5 + .../v2/server/jetstream_cluster.go | 80 +-- .../nats-io/nats-server/v2/server/leafnode.go | 11 +- .../nats-io/nats-server/v2/server/memstore.go | 4 +- .../nats-io/nats-server/v2/server/mqtt.go | 464 ++++++++++------ .../nats-io/nats-server/v2/server/raft.go | 435 +++++++++------ .../nats-io/nats-server/v2/server/stream.go | 101 ++-- .../nats-server/v2/server/websocket.go | 4 +- vendor/modules.txt | 8 +- 18 files changed, 1066 insertions(+), 665 deletions(-) diff --git a/go.mod b/go.mod index d31dc81a4a..daeb87be15 100644 --- a/go.mod +++ b/go.mod @@ -60,7 +60,7 @@ require ( github.com/mitchellh/mapstructure v1.5.0 github.com/mna/pigeon v1.2.1 github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 - github.com/nats-io/nats-server/v2 v2.10.4 + github.com/nats-io/nats-server/v2 v2.10.5 github.com/oklog/run v1.1.0 github.com/olekukonko/tablewriter v0.0.5 github.com/onsi/ginkgo v1.16.5 @@ -272,7 +272,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mschoch/smat v0.2.0 // indirect - github.com/nats-io/jwt/v2 v2.5.2 // indirect + github.com/nats-io/jwt/v2 v2.5.3 // indirect github.com/nats-io/nats.go v1.31.0 // indirect github.com/nats-io/nkeys v0.4.6 // indirect github.com/nats-io/nuid v1.0.1 // indirect @@ -331,7 +331,7 @@ require ( golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect golang.org/x/mod v0.13.0 // indirect golang.org/x/sys v0.14.0 // indirect - golang.org/x/time v0.3.0 // indirect + golang.org/x/time v0.4.0 // indirect golang.org/x/tools v0.14.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.8 // indirect diff --git a/go.sum b/go.sum index be5bdb86bf..ab5bb319cd 100644 --- a/go.sum +++ b/go.sum @@ -1739,10 +1739,10 @@ github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOl github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/namedotcom/go v0.0.0-20180403034216-08470befbe04/go.mod h1:5sN+Lt1CaY4wsPvgQH/jsuJi4XO2ssZbdsIizr4CVC8= -github.com/nats-io/jwt/v2 v2.5.2 
h1:DhGH+nKt+wIkDxM6qnVSKjokq5t59AZV5HRcFW0zJwU= -github.com/nats-io/jwt/v2 v2.5.2/go.mod h1:24BeQtRwxRV8ruvC4CojXlx/WQ/VjuwlYiH+vu/+ibI= -github.com/nats-io/nats-server/v2 v2.10.4 h1:uB9xcwon3tPXWAdmTJqqqC6cie3yuPWHJjjTBgaPNus= -github.com/nats-io/nats-server/v2 v2.10.4/go.mod h1:eWm2JmHP9Lqm2oemB6/XGi0/GwsZwtWf8HIPUsh+9ns= +github.com/nats-io/jwt/v2 v2.5.3 h1:/9SWvzc6hTfamcgXJ3uYRpgj+QuY2aLNqRiqrKcrpEo= +github.com/nats-io/jwt/v2 v2.5.3/go.mod h1:iysuPemFcc7p4IoYots3IuELSI4EDe9Y0bQMe+I3Bf4= +github.com/nats-io/nats-server/v2 v2.10.5 h1:hhWt6m9ja/mNnm6ixc85jCthDaiUFPaeJI79K/MD980= +github.com/nats-io/nats-server/v2 v2.10.5/go.mod h1:xUMTU4kS//SDkJCSvFwN9SyJ9nUuLhSkzB/Qz0dvjjg= github.com/nats-io/nats.go v1.31.0 h1:/WFBHEc/dOKBF6qf1TZhrdEfTmOZ5JzdJ+Y3m6Y/p7E= github.com/nats-io/nats.go v1.31.0/go.mod h1:di3Bm5MLsoB4Bx61CBTsxuarI36WbhAwOm8QrW39+i8= github.com/nats-io/nkeys v0.4.6 h1:IzVe95ru2CT6ta874rt9saQRkWfe2nFj1NtvYSLqMzY= @@ -2548,8 +2548,9 @@ golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20220922220347-f3bd1da661af/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.1.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.4.0 h1:Z81tqI5ddIoXDPvVQ7/7CC9TnLM7ubaFG2qXYd5BbYY= +golang.org/x/time v0.4.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/vendor/github.com/nats-io/jwt/v2/operator_claims.go b/vendor/github.com/nats-io/jwt/v2/operator_claims.go index 3835b973e8..673225fa82 100644 --- a/vendor/github.com/nats-io/jwt/v2/operator_claims.go +++ b/vendor/github.com/nats-io/jwt/v2/operator_claims.go @@ -136,8 +136,12 @@ func ValidateOperatorServiceURL(v string) error { return nil case "tls": return nil + case "ws": + return nil + case "wss": + return nil default: - return fmt.Errorf("operator service url %q - protocol not supported (only 'nats' or 'tls' only)", v) + return fmt.Errorf("operator service url %q - protocol not supported (only 'nats', 'tls', 'ws', 'wss' only)", v) } } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/auth.go b/vendor/github.com/nats-io/nats-server/v2/server/auth.go index b8e82abe41..7a0f93e217 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/auth.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/auth.go @@ -525,7 +525,7 @@ func processUserPermissionsTemplate(lim jwt.UserPermissionLimits, ujwt *jwt.User for _, valueList := range nArrayCartesianProduct(tagValues...) 
{ b := strings.Builder{} for i, token := range newTokens { - if token == _EMPTY_ { + if token == _EMPTY_ && len(valueList) > 0 { b.WriteString(valueList[0]) valueList = valueList[1:] } else { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/client.go b/vendor/github.com/nats-io/nats-server/v2/server/client.go index 6aba4395ad..72346e6958 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/client.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/client.go @@ -1,4 +1,4 @@ -// Copyright 2012-2022 The NATS Authors +// Copyright 2012-2023 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -1340,6 +1340,7 @@ func (c *client) readLoop(pre []byte) { c.Errorf("read error: %v", err) } c.closeConnection(closedStateForErr(err)) + return } else if bufs == nil { continue } @@ -1498,15 +1499,6 @@ func (c *client) collapsePtoNB() (net.Buffers, int64) { return c.out.nb, c.out.pb } -// This will handle the fixup needed on a partial write. -// Assume pending has been already calculated correctly. -func (c *client) handlePartialWrite(pnb net.Buffers) { - if c.isWebsocket() { - c.ws.frames = append(pnb, c.ws.frames...) - return - } -} - // flushOutbound will flush outbound buffer to a client. // Will return true if data was attempted to be written. // Lock must be held @@ -1677,12 +1669,6 @@ func (c *client) flushOutbound() bool { c.ws.fs -= n } - // Check for partial writes - // TODO(dlc) - zero write with no error will cause lost message and the writeloop to spin. - if n != attempted && n > 0 { - c.handlePartialWrite(c.out.nb) - } - // Check that if there is still data to send and writeLoop is in wait, // then we need to signal. if c.out.pb > 0 { @@ -2755,7 +2741,7 @@ func (c *client) processSubEx(subject, queue, bsid []byte, cb msgHandler, noForw return sub, nil } - if err := c.addShadowSubscriptions(acc, sub); err != nil { + if err := c.addShadowSubscriptions(acc, sub, true); err != nil { c.Errorf(err.Error()) } @@ -2782,10 +2768,13 @@ type ime struct { dyn bool } -// If the client's account has stream imports and there are matches for -// this subscription's subject, then add shadow subscriptions in the -// other accounts that export this subject. -func (c *client) addShadowSubscriptions(acc *Account, sub *subscription) error { +// If the client's account has stream imports and there are matches for this +// subscription's subject, then add shadow subscriptions in the other accounts +// that export this subject. +// +// enact=false allows MQTT clients to get the list of shadow subscriptions +// without enacting them, in order to first obtain matching "retained" messages. +func (c *client) addShadowSubscriptions(acc *Account, sub *subscription, enact bool) error { if acc == nil { return ErrMissingAccount } @@ -2888,7 +2877,7 @@ func (c *client) addShadowSubscriptions(acc *Account, sub *subscription) error { for i := 0; i < len(ims); i++ { ime := &ims[i] // We will create a shadow subscription. - nsub, err := c.addShadowSub(sub, ime) + nsub, err := c.addShadowSub(sub, ime, enact) if err != nil { return err } @@ -2905,7 +2894,7 @@ func (c *client) addShadowSubscriptions(acc *Account, sub *subscription) error { } // Add in the shadow subscription. 
-func (c *client) addShadowSub(sub *subscription, ime *ime) (*subscription, error) { +func (c *client) addShadowSub(sub *subscription, ime *ime, enact bool) (*subscription, error) { im := ime.im nsub := *sub // copy nsub.im = im @@ -2929,6 +2918,11 @@ func (c *client) addShadowSub(sub *subscription, ime *ime) (*subscription, error } } // Else use original subject + + if !enact { + return &nsub, nil + } + c.Debugf("Creating import subscription on %q from account %q", nsub.subject, im.acc.Name) if err := im.acc.sl.Insert(&nsub); err != nil { @@ -3298,9 +3292,12 @@ func (c *client) stalledWait(producer *client) { c.mu.Unlock() defer c.mu.Lock() + delay := time.NewTimer(ttl) + defer delay.Stop() + select { case <-stall: - case <-time.After(ttl): + case <-delay.C: producer.Debugf("Timed out of fast producer stall (%v)", ttl) } } @@ -5045,7 +5042,7 @@ func (c *client) processSubsOnConfigReload(awcsti map[string]struct{}) { oldShadows := sub.shadow sub.shadow = nil c.mu.Unlock() - c.addShadowSubscriptions(acc, sub) + c.addShadowSubscriptions(acc, sub, true) for _, nsub := range oldShadows { nsub.im.acc.sl.Remove(nsub) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/const.go b/vendor/github.com/nats-io/nats-server/v2/server/const.go index 603a3ae44d..4295a4304c 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/const.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/const.go @@ -41,7 +41,7 @@ var ( const ( // VERSION is the current version for the server. - VERSION = "2.10.4" + VERSION = "2.10.5" // PROTO is the currently supported protocol. // 0 was the original diff --git a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go index 1f96e772e8..6953abf518 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go @@ -3731,7 +3731,8 @@ func (o *consumer) processInboundAcks(qch chan struct{}) { // How often we will check for ack floor drift. // Spread these out for large numbers on a server restart. delta := time.Duration(rand.Int63n(int64(time.Minute))) - var ackFloorCheck = time.Minute + delta + ticker := time.NewTicker(time.Minute + delta) + defer ticker.Stop() for { select { @@ -3746,7 +3747,7 @@ func (o *consumer) processInboundAcks(qch chan struct{}) { if hasInactiveThresh { o.suppressDeletion() } - case <-time.After(ackFloorCheck): + case <-ticker.C: o.checkAckFloor() case <-qch: return diff --git a/vendor/github.com/nats-io/nats-server/v2/server/events.go b/vendor/github.com/nats-io/nats-server/v2/server/events.go index 601ed85a0d..66d744d451 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/events.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/events.go @@ -810,8 +810,26 @@ func (s *Server) sendStatsz(subj string) { return } + shouldCheckInterest := func() bool { + opts := s.getOpts() + if opts.Cluster.Port != 0 || opts.Gateway.Port != 0 || opts.LeafNode.Port != 0 { + return false + } + // If we are here we have no clustering or gateways and are not a leafnode hub. + // Check for leafnode remotes that connect the system account. + if len(opts.LeafNode.Remotes) > 0 { + sysAcc := s.sys.account.GetName() + for _, r := range opts.LeafNode.Remotes { + if r.LocalAccount == sysAcc { + return false + } + } + } + return true + } + // if we are running standalone, check for interest. - if s.standAloneMode() { + if shouldCheckInterest() { // Check if we even have interest in this subject. 
sacc := s.sys.account rr := sacc.sl.Match(subj) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go index 3157e6784f..52af94fdb8 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go @@ -32,6 +32,7 @@ import ( "os" "path/filepath" "sort" + "strings" "sync" "sync/atomic" "time" @@ -156,6 +157,7 @@ type psi struct { total uint64 fblk uint32 lblk uint32 + subj string } type fileStore struct { @@ -177,6 +179,7 @@ type fileStore struct { bim map[uint32]*msgBlock psim map[string]*psi tsl int + adml int hh hash.Hash64 qch chan struct{} fch chan struct{} @@ -184,6 +187,7 @@ type fileStore struct { cfs []ConsumerStore sips int dirty int + closing bool closed bool fip bool receivedAny bool @@ -272,10 +276,6 @@ const ( newScan = "%d.new" // used to scan index file names. indexScan = "%d.idx" - // to look for orphans - indexScanAll = "*.idx" - // to look for orphans - fssScanAll = "*.fss" // used to store our block encryption key. keyScan = "%d.key" // to look for orphans @@ -456,10 +456,7 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim // Also make sure we get rid of old idx and fss files on return. // Do this in separate go routine vs inline and at end of processing. defer func() { - go func() { - os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, indexScanAll)) - os.RemoveAll(filepath.Join(fs.fcfg.StoreDir, msgDir, fssScanAll)) - }() + go fs.cleanupOldMeta() }() // Lock while do enforcements and removals. @@ -524,7 +521,8 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim } } - fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) + // Setup our sync timer. + fs.setSyncTimer() // Spin up the go routine that will write out or full state stream index. go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld) @@ -941,10 +939,10 @@ func (mb *msgBlock) ensureLastChecksumLoaded() { // Perform a recover but do not update PSIM. // Lock should be held. func (fs *fileStore) recoverMsgBlockNoSubjectUpdates(index uint32) (*msgBlock, error) { - psim := fs.psim + psim, tsl := fs.psim, fs.tsl fs.psim = nil mb, err := fs.recoverMsgBlock(index) - fs.psim = psim + fs.psim, fs.tsl = psim, tsl return mb, err } @@ -1098,11 +1096,12 @@ func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) { mb.mu.RLock() fs.state.Msgs += mb.msgs fs.state.Bytes += mb.bytes - if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq { - fs.state.FirstSeq = mb.first.seq + fseq := atomic.LoadUint64(&mb.first.seq) + if fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { + fs.state.FirstSeq = fseq fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() } - fs.state.LastSeq = mb.last.seq + fs.state.LastSeq = atomic.LoadUint64(&mb.last.seq) fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() mb.mu.RUnlock() } @@ -1223,7 +1222,7 @@ func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) { // Rebuild the state of the blk based on what we have on disk in the N.blk file. // Lock should be held. func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { - startLastSeq := mb.last.seq + startLastSeq := atomic.LoadUint64(&mb.last.seq) // Remove the .fss file and clear any cache we have set. mb.clearCacheAndOffset() @@ -1237,7 +1236,8 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { if mb.msgs > 0 { // We need to declare lost data here. 
ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes} - for seq := mb.first.seq; seq <= mb.last.seq; seq++ { + firstSeq, lastSeq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) + for seq := firstSeq; seq <= lastSeq; seq++ { if !mb.dmap.Exists(seq) { ld.Msgs = append(ld.Msgs, seq) } @@ -1245,14 +1245,15 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { // Clear invalid state. We will let this blk be added in here. mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil mb.dmap.Empty() - mb.first.seq = mb.last.seq + 1 + atomic.StoreUint64(&mb.first.seq, atomic.LoadUint64(&mb.last.seq)+1) } return ld, nil, err } // Clear state we need to rebuild. mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil - mb.last.seq, mb.last.ts = 0, 0 + atomic.StoreUint64(&mb.last.seq, 0) + mb.last.ts = 0 firstNeedsSet := true // Check if we need to decrypt. @@ -1307,7 +1308,7 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { gatherLost := func(lb uint32) *LostStreamData { var ld LostStreamData - for seq := mb.last.seq + 1; seq <= startLastSeq; seq++ { + for seq := atomic.LoadUint64(&mb.last.seq) + 1; seq <= startLastSeq; seq++ { ld.Msgs = append(ld.Msgs, seq) } ld.Bytes = uint64(lb) @@ -1375,18 +1376,20 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { continue } + fseq := atomic.LoadUint64(&mb.first.seq) // This is an old erased message, or a new one that we can track. - if seq == 0 || seq&ebit != 0 || seq < mb.first.seq { + if seq == 0 || seq&ebit != 0 || seq < fseq { seq = seq &^ ebit - if seq >= mb.first.seq { + if seq >= fseq { // Only add to dmap if past recorded first seq and non-zero. if seq != 0 { addToDmap(seq) } - mb.last.seq = seq + atomic.StoreUint64(&mb.last.seq, seq) mb.last.ts = ts if mb.msgs == 0 { - mb.first.seq, mb.first.ts = seq+1, 0 + atomic.StoreUint64(&mb.first.seq, seq+1) + mb.first.ts = 0 } } index += rl @@ -1396,8 +1399,9 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { // This is for when we have index info that adjusts for deleted messages // at the head. So the first.seq will be already set here. If this is larger // replace what we have with this seq. - if firstNeedsSet && seq >= mb.first.seq { - firstNeedsSet, mb.first.seq, mb.first.ts = false, seq, ts + if firstNeedsSet && seq >= fseq { + atomic.StoreUint64(&mb.first.seq, seq) + firstNeedsSet, mb.first.ts = false, ts } if !mb.dmap.Exists(seq) { @@ -1423,7 +1427,7 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { } // Always set last - mb.last.seq = seq + atomic.StoreUint64(&mb.last.seq, seq) mb.last.ts = ts // Advance to next record. @@ -1434,12 +1438,15 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { // Or if we seem to have no messages but had a tombstone, which we use to remember // sequences and timestamps now, use that to properly setup the first and last. 
if mb.msgs == 0 { - if mb.first.seq > 0 { - mb.last.seq = mb.first.seq - 1 - } else if mb.first.seq == 0 && minTombstoneSeq > 0 { - mb.first.seq, mb.first.ts = minTombstoneSeq+1, 0 + fseq := atomic.LoadUint64(&mb.first.seq) + if fseq > 0 { + atomic.StoreUint64(&mb.last.seq, fseq-1) + } else if fseq == 0 && minTombstoneSeq > 0 { + atomic.StoreUint64(&mb.first.seq, minTombstoneSeq+1) + mb.first.ts = 0 if mb.last.seq == 0 { - mb.last.seq, mb.last.ts = minTombstoneSeq, minTombstoneTs + atomic.StoreUint64(&mb.last.seq, minTombstoneSeq) + mb.last.ts = minTombstoneTs } } } @@ -1574,9 +1581,9 @@ func (fs *fileStore) recoverFullState() (rerr error) { fs.warn("Stream state bad subject len (%d)", lsubj) return errCorruptState } - subj := fs.subjString(buf[bi : bi+lsubj]) + subj := string(buf[bi : bi+lsubj]) bi += lsubj - psi := &psi{total: readU64(), fblk: uint32(readU64())} + psi := &psi{total: readU64(), fblk: uint32(readU64()), subj: subj} if psi.total > 1 { psi.lblk = uint32(readU64()) } else { @@ -1597,7 +1604,9 @@ func (fs *fileStore) recoverFullState() (rerr error) { break } mb := fs.initMsgBlock(index) - mb.first.seq, mb.last.seq, mb.msgs, mb.bytes = fseq, lseq, lseq-fseq+1, nbytes + atomic.StoreUint64(&mb.first.seq, fseq) + atomic.StoreUint64(&mb.last.seq, lseq) + mb.msgs, mb.bytes = lseq-fseq+1, nbytes mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime if numDeleted > 0 { dmap, n, err := avl.Decode(buf[bi:]) @@ -1682,13 +1691,13 @@ func (fs *fileStore) recoverFullState() (rerr error) { return err } if nmb != nil { - // Update top level accounting. - if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq { - fs.state.FirstSeq = nmb.first.seq + // Update top level accounting + if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { + fs.state.FirstSeq = fseq fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() } - if nmb.last.seq > fs.state.LastSeq { - fs.state.LastSeq = nmb.last.seq + if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq { + fs.state.LastSeq = lseq fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() } fs.state.Msgs += nmb.msgs @@ -1714,7 +1723,7 @@ func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) { // triggered limits exceeded will be handled after the recovery and prior to the stream // being available to the system. var smv StoreMsg - for seq := mb.last.seq + 1; seq <= nmb.last.seq; seq++ { + for seq, lseq := atomic.LoadUint64(&mb.last.seq)+1, atomic.LoadUint64(&nmb.last.seq); seq <= lseq; seq++ { // Lookup the message. If an error will be deleted, so can skip. sm, err := nmb.cacheLookup(seq, &smv) if err != nil { @@ -1730,25 +1739,25 @@ func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) { info.lblk = nmb.index } } else { - fs.psim[sm.subj] = &psi{total: 1, fblk: nmb.index, lblk: nmb.index} + fs.psim[sm.subj] = &psi{total: 1, fblk: nmb.index, lblk: nmb.index, subj: sm.subj} fs.tsl += len(sm.subj) } } } // Now check to see if we had a higher first for the recovered state mb vs nmb. - if nmb.first.seq < mb.first.seq { + if atomic.LoadUint64(&nmb.first.seq) < atomic.LoadUint64(&mb.first.seq) { // Now set first for nmb. - nmb.first = mb.first + atomic.StoreUint64(&nmb.first.seq, atomic.LoadUint64(&mb.first.seq)) } // Update top level accounting. 
- if fs.state.FirstSeq == 0 || nmb.first.seq < fs.state.FirstSeq { - fs.state.FirstSeq = nmb.first.seq + if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { + fs.state.FirstSeq = fseq fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() } - if nmb.last.seq > fs.state.LastSeq { - fs.state.LastSeq = nmb.last.seq + if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq { + fs.state.LastSeq = lseq fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() } } @@ -1792,6 +1801,32 @@ func (mb *msgBlock) lastChecksum() []byte { return lchk[:] } +// This will make sure we clean up old idx and fss files. +func (fs *fileStore) cleanupOldMeta() { + fs.mu.RLock() + mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) + fs.mu.RUnlock() + + f, err := os.Open(mdir) + if err != nil { + return + } + + dirs, _ := f.ReadDir(-1) + f.Close() + + const ( + minLen = 4 + idxSuffix = ".idx" + fssSuffix = ".fss" + ) + for _, fi := range dirs { + if name := fi.Name(); strings.HasSuffix(name, idxSuffix) || strings.HasSuffix(name, fssSuffix) { + os.Remove(filepath.Join(mdir, name)) + } + } +} + func (fs *fileStore) recoverMsgs() error { fs.mu.Lock() defer fs.mu.Unlock() @@ -1836,17 +1871,21 @@ func (fs *fileStore) recoverMsgs() error { fs.removeMsgBlockFromList(mb) continue } - if fs.state.FirstSeq == 0 || mb.first.seq < fs.state.FirstSeq { - fs.state.FirstSeq = mb.first.seq + if fseq := atomic.LoadUint64(&mb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { + fs.state.FirstSeq = fseq if mb.first.ts == 0 { fs.state.FirstTime = time.Time{} } else { fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() } } - if mb.last.seq > fs.state.LastSeq { - fs.state.LastSeq = mb.last.seq - fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() + if lseq := atomic.LoadUint64(&mb.last.seq); lseq > fs.state.LastSeq { + fs.state.LastSeq = lseq + if mb.last.ts == 0 { + fs.state.LastTime = time.Time{} + } else { + fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() + } } fs.state.Msgs += mb.msgs fs.state.Bytes += mb.bytes @@ -1924,7 +1963,8 @@ func (fs *fileStore) expireMsgsOnRecover() { // If we are the last keep state to remember first/last sequence. // Do this part by hand since not deleting one by one. if mb == fs.lmb { - last = mb.last + last.seq = atomic.LoadUint64(&mb.last.seq) + last.ts = mb.last.ts } // Make sure we do subject cleanup as well. mb.ensurePerSubjectInfoLoaded() @@ -1964,7 +2004,8 @@ func (fs *fileStore) expireMsgsOnRecover() { // Walk messages and remove if expired. mb.ensurePerSubjectInfoLoaded() - for seq := mb.first.seq; seq <= mb.last.seq; seq++ { + fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) + for seq := fseq; seq <= lseq; seq++ { sm, err := mb.cacheLookup(seq, &smv) // Process interior deleted msgs. if err == errDeletedMsg { @@ -1973,12 +2014,14 @@ func (fs *fileStore) expireMsgsOnRecover() { mb.dmap.Delete(seq) } // Keep this updated just in case since we are removing dmap entries. - mb.first.seq, needNextFirst = seq, true + atomic.StoreUint64(&mb.first.seq, seq) + needNextFirst = true continue } // Break on other errors. if err != nil || sm == nil { - mb.first.seq, needNextFirst = seq, true + atomic.StoreUint64(&mb.first.seq, seq) + needNextFirst = true break } @@ -1986,16 +2029,17 @@ func (fs *fileStore) expireMsgsOnRecover() { // Check for done. 
if minAge < sm.ts { - mb.first.seq, needNextFirst = sm.seq, false - mb.first.seq = sm.seq + atomic.StoreUint64(&mb.first.seq, sm.seq) mb.first.ts = sm.ts + needNextFirst = false nts = sm.ts break } // Delete the message here. if mb.msgs > 0 { - mb.first.seq, needNextFirst = seq, true + atomic.StoreUint64(&mb.first.seq, seq) + needNextFirst = true sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) if sz > mb.bytes { sz = mb.bytes @@ -2099,10 +2143,8 @@ func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 { return lastSeq + 1 } - mb.mu.RLock() - fseq := mb.first.seq - lseq := mb.last.seq - mb.mu.RUnlock() + fseq := atomic.LoadUint64(&mb.first.seq) + lseq := atomic.LoadUint64(&mb.last.seq) var smv StoreMsg @@ -2140,7 +2182,12 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor // Skip scan of mb.fss if number of messages in the block are less than // 1/2 the number of subjects in mb.fss. Or we have a wc and lots of fss entries. const linearScanMaxFSS = 32 - doLinearScan := isAll || 2*int(mb.last.seq-start) < len(mb.fss) || (wc && len(mb.fss) > linearScanMaxFSS) + // Make sure to start at mb.first.seq if fseq < mb.first.seq + if seq := atomic.LoadUint64(&mb.first.seq); seq > fseq { + fseq = seq + } + lseq := atomic.LoadUint64(&mb.last.seq) + doLinearScan := isAll || 2*int(lseq-fseq) < len(mb.fss) || (wc && len(mb.fss) > linearScanMaxFSS) if !doLinearScan { // If we have a wildcard match against all tracked subjects we know about. @@ -2152,7 +2199,7 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor } } } - fseq = mb.last.seq + 1 + fseq = lseq + 1 for _, subj := range subs { ss := mb.fss[subj] if ss != nil && ss.firstNeedsUpdate { @@ -2169,7 +2216,7 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor } } - if fseq > mb.last.seq { + if fseq > lseq { return nil, false, ErrStoreMsgNotFound } @@ -2177,13 +2224,13 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor sm = new(StoreMsg) } - for seq := fseq; seq <= mb.last.seq; seq++ { + for seq := fseq; seq <= lseq; seq++ { llseq := mb.llseq fsm, err := mb.cacheLookup(seq, sm) if err != nil { continue } - expireOk := seq == mb.last.seq && mb.llseq == seq + expireOk := seq == lseq && mb.llseq == seq if isAll { return fsm, expireOk, nil } @@ -2221,8 +2268,10 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( // First check if we can optimize this part. // This means we want all and the starting sequence was before this block. 
- if isAll && sseq <= mb.first.seq { - return mb.msgs, mb.first.seq, mb.last.seq + if isAll { + if fseq := atomic.LoadUint64(&mb.first.seq); sseq <= fseq { + return mb.msgs, fseq, atomic.LoadUint64(&mb.last.seq) + } } update := func(ss *SimpleState) { @@ -2287,7 +2336,7 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( } var smv StoreMsg - for seq := sseq; seq <= mb.last.seq; seq++ { + for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { sm, _ := mb.cacheLookup(seq, &smv) if sm == nil { continue @@ -2554,7 +2603,7 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) mb := fs.blks[i] mb.mu.Lock() var t uint64 - if isAll && sseq <= mb.first.seq { + if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) { if lastPerSubject { mb.ensurePerSubjectInfoLoaded() for subj := range mb.fss { @@ -2607,7 +2656,7 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) shouldExpire = true } var smv StoreMsg - for seq := sseq; seq <= mb.last.seq; seq++ { + for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && (isAll || isMatch(sm.subj)) { t++ } @@ -2679,7 +2728,7 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) var shouldExpire bool mb.mu.Lock() // Check if we should include all of this block in adjusting. If so work with metadata. - if sseq > mb.last.seq { + if sseq > atomic.LoadUint64(&mb.last.seq) { if isAll && !lastPerSubject { adjust += mb.msgs } else { @@ -2707,11 +2756,11 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) shouldExpire = true } - var last = mb.last.seq + var last = atomic.LoadUint64(&mb.last.seq) if sseq < last { last = sseq } - for seq := mb.first.seq; seq < last; seq++ { + for seq := atomic.LoadUint64(&mb.first.seq); seq < last; seq++ { sm, _ := mb.cacheLookup(seq, &smv) if sm == nil { continue @@ -2839,8 +2888,8 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { ts := time.Now().UnixNano() mb.llts, mb.lwts = 0, ts // Remember our last sequence number. - mb.first.seq = fs.state.LastSeq + 1 - mb.last.seq = fs.state.LastSeq + atomic.StoreUint64(&mb.first.seq, fs.state.LastSeq+1) + atomic.StoreUint64(&mb.last.seq, fs.state.LastSeq) mb.mu.Unlock() // Now do local hash. @@ -2938,8 +2987,8 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl { return ErrMaxMsgs } - if fs.cfg.MaxBytes > 0 && fs.state.Bytes+uint64(len(msg)+len(hdr)) >= uint64(fs.cfg.MaxBytes) { - if !asl || fs.sizeForSeq(fseq) <= len(msg)+len(hdr) { + if fs.cfg.MaxBytes > 0 && fs.state.Bytes+fileStoreMsgSize(subj, hdr, msg) >= uint64(fs.cfg.MaxBytes) { + if !asl || fs.sizeForSeq(fseq) <= int(fileStoreMsgSize(subj, hdr, msg)) { return ErrMaxBytes } } @@ -2960,7 +3009,7 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in } // Adjust top level tracking of per subject msg counts. 
- if len(subj) > 0 { + if len(subj) > 0 && fs.psim != nil { index := fs.lmb.index if info, ok := fs.psim[subj]; ok { info.total++ @@ -2968,7 +3017,7 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in info.lblk = index } } else { - fs.psim[subj] = &psi{total: 1, fblk: index, lblk: index} + fs.psim[subj] = &psi{total: 1, fblk: index, lblk: index, subj: subj} fs.tsl += len(subj) } } @@ -3083,9 +3132,9 @@ func (mb *msgBlock) skipMsg(seq uint64, now time.Time) { mb.mu.Lock() // If we are empty can just do meta. if mb.msgs == 0 { - mb.last.seq = seq + atomic.StoreUint64(&mb.last.seq, seq) mb.last.ts = nowts - mb.first.seq = seq + 1 + atomic.StoreUint64(&mb.first.seq, seq+1) mb.first.ts = nowts } else { needsRecord = true @@ -3343,7 +3392,7 @@ func (fs *fileStore) EraseMsg(seq uint64) (bool, error) { // Convenience function to remove per subject tracking at the filestore level. // Lock should be held. func (fs *fileStore) removePerSubject(subj string) { - if len(subj) == 0 { + if len(subj) == 0 || fs.psim == nil { return } // We do not update sense of fblk here but will do so when we resolve during lookup. @@ -3411,7 +3460,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( mb.mu.Lock() // See if we are closed or the sequence number is still relevant. - if mb.closed || seq < mb.first.seq { + if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) { mb.mu.Unlock() fsUnlock() return false, nil @@ -3443,7 +3492,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( return false, ErrStoreClosed } mb.mu.Lock() - if mb.closed || seq < mb.first.seq { + if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) { mb.mu.Unlock() fsUnlock() return false, nil @@ -3504,7 +3553,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( mb.eraseMsg(seq, int(ri), int(rl)) } - fifo := seq == mb.first.seq + fifo := seq == atomic.LoadUint64(&mb.first.seq) isLastBlock := mb == fs.lmb isEmpty := mb.msgs == 0 @@ -3513,7 +3562,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( if !isEmpty { // Can update this one in place. if seq == fs.state.FirstSeq { - fs.state.FirstSeq = mb.first.seq // new one. + fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one. if mb.first.ts == 0 { fs.state.FirstTime = time.Time{} } else { @@ -3611,8 +3660,9 @@ func (mb *msgBlock) compact() { var le = binary.LittleEndian var firstSet bool + fseq := atomic.LoadUint64(&mb.first.seq) isDeleted := func(seq uint64) bool { - return seq == 0 || seq&ebit != 0 || seq < mb.first.seq || mb.dmap.Exists(seq) + return seq == 0 || seq&ebit != 0 || mb.dmap.Exists(seq) || seq < fseq } for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { @@ -3635,21 +3685,21 @@ func (mb *msgBlock) compact() { // Check for tombstones. if seq&tbit != 0 { // If we are last mb we should consider to keep these unless the tombstone reflects a seq in this mb. - if mb == mb.fs.lmb && seq < mb.first.seq { + if mb == mb.fs.lmb && seq < fseq { nbuf = append(nbuf, buf[index:index+rl]...) } } else { // Normal message here. nbuf = append(nbuf, buf[index:index+rl]...) if !firstSet { - firstSet = true - mb.first.seq = seq + firstSet, fseq = true, seq + atomic.StoreUint64(&mb.first.seq, seq) } } } // Always set last as long as not a tombstone. if seq&tbit == 0 { - mb.last.seq = seq &^ ebit + atomic.StoreUint64(&mb.last.seq, seq&^ebit) } // Advance to next record. 
index += rl @@ -3905,7 +3955,7 @@ func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { checkDmap := mb.dmap.Size() > 0 var smv StoreMsg - for seq := mb.last.seq; seq > sm.seq; seq-- { + for seq := atomic.LoadUint64(&mb.last.seq); seq > sm.seq; seq-- { if checkDmap { if mb.dmap.Exists(seq) { // Delete and skip to next. @@ -3992,7 +4042,7 @@ func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { } // Update our last msg. - mb.last.seq = sm.seq + atomic.StoreUint64(&mb.last.seq, sm.seq) mb.last.ts = sm.ts // Clear our cache. @@ -4009,15 +4059,16 @@ func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { return purged, bytes, nil } -// Lock should be held. +// Helper to determine if the mb is empty. func (mb *msgBlock) isEmpty() bool { - return mb.first.seq > mb.last.seq + return atomic.LoadUint64(&mb.first.seq) > atomic.LoadUint64(&mb.last.seq) } // Lock should be held. func (mb *msgBlock) selectNextFirst() { var seq uint64 - for seq = mb.first.seq + 1; seq <= mb.last.seq; seq++ { + fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) + for seq = fseq + 1; seq <= lseq; seq++ { if mb.dmap.Exists(seq) { // We will move past this so we can delete the entry. mb.dmap.Delete(seq) @@ -4026,10 +4077,10 @@ func (mb *msgBlock) selectNextFirst() { } } // Set new first sequence. - mb.first.seq = seq + atomic.StoreUint64(&mb.first.seq, seq) // Check if we are empty.. - if mb.isEmpty() { + if seq > lseq { mb.first.ts = 0 return } @@ -4057,7 +4108,7 @@ func (fs *fileStore) selectNextFirst() { if len(fs.blks) > 0 { mb := fs.blks[0] mb.mu.RLock() - fs.state.FirstSeq = mb.first.seq + fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() mb.mu.RUnlock() } else { @@ -4307,7 +4358,7 @@ func (fs *fileStore) checkMsgs() *LostStreamData { // FIXME(dlc) - check tombstones here too? if ld, _, err := mb.rebuildState(); err != nil && ld != nil { // Rebuild fs state too. - mb.fs.rebuildStateLocked(ld) + fs.rebuildStateLocked(ld) } fs.populateGlobalPerSubjectInfo(mb) } @@ -4545,8 +4596,9 @@ func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { seq = seq &^ ebit } - if (mb.first.seq == 0 || mb.first.ts == 0) && seq >= mb.first.seq { - mb.first.seq = seq + fseq := atomic.LoadUint64(&mb.first.seq) + if (fseq == 0 || mb.first.ts == 0) && seq >= fseq { + atomic.StoreUint64(&mb.first.seq, seq) mb.first.ts = ts } // Need atomics here for selectMsgBlock speed. @@ -4785,8 +4837,9 @@ func (fs *fileStore) syncBlocks() { } // Check if we should compact here as well. // Do not compact last mb. + var needsCompact bool if mb != lmb && mb.ensureRawBytesLoaded() == nil && mb.rbytes > mb.bytes { - mb.compact() + needsCompact = true markDirty = true } @@ -4798,6 +4851,16 @@ func (fs *fileStore) syncBlocks() { } mb.mu.Unlock() + // Check if we should compact here. + // Need to hold fs lock in case we reference psim when loading in the mb. + if needsCompact { + fs.mu.RLock() + mb.mu.Lock() + mb.compact() + mb.mu.Unlock() + fs.mu.RUnlock() + } + // Check if we need to sync. // This is done not holding any locks. 
if needSync { @@ -4815,7 +4878,11 @@ func (fs *fileStore) syncBlocks() { } fs.mu.Lock() - fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) + if fs.closed { + fs.mu.Unlock() + return + } + fs.setSyncTimer() fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) syncAlways := fs.fcfg.SyncAlways if markDirty { @@ -4911,9 +4978,11 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { var idx []uint32 var index uint32 + mbFirstSeq := atomic.LoadUint64(&mb.first.seq) + if mb.cache == nil { // Approximation, may adjust below. - fseq = mb.first.seq + fseq = mbFirstSeq idx = make([]uint32, 0, mb.msgs) mb.cache = &cache{} } else { @@ -4962,11 +5031,11 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { // We defer checksum checks to individual msg cache lookups to amortorize costs and // not introduce latency for first message from a newly loaded block. - if seq >= mb.first.seq { + if seq >= mbFirstSeq { // Track that we do not have holes. - if slot := int(seq - mb.first.seq); slot != len(idx) { + if slot := int(seq - mbFirstSeq); slot != len(idx) { // If we have a hole fill it. - for dseq := mb.first.seq + uint64(len(idx)); dseq < seq; dseq++ { + for dseq := mbFirstSeq + uint64(len(idx)); dseq < seq; dseq++ { idx = append(idx, dbit) mb.dmap.Insert(dseq) } @@ -4991,8 +5060,7 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { ss.Msgs++ ss.Last = seq } else { - subj := mb.subjString(bsubj) - mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} + mb.fss[mb.subjString(bsubj)] = &SimpleState{Msgs: 1, First: seq, Last: seq} } } } @@ -5163,7 +5231,7 @@ func (mb *msgBlock) cacheAlreadyLoaded() bool { if mb.cache == nil || mb.cache.off != 0 || mb.cache.fseq == 0 || len(mb.cache.buf) == 0 { return false } - numEntries := mb.msgs + uint64(mb.dmap.Size()) + (mb.first.seq - mb.cache.fseq) + numEntries := mb.msgs + uint64(mb.dmap.Size()) + (atomic.LoadUint64(&mb.first.seq) - mb.cache.fseq) return numEntries == uint64(len(mb.cache.idx)) } @@ -5334,7 +5402,7 @@ func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) if err != nil { return nil, false, err } - expireOk := seq == mb.last.seq && mb.llseq == seq + expireOk := seq == atomic.LoadUint64(&mb.last.seq) && mb.llseq == seq return fsm, expireOk, err } @@ -5372,7 +5440,7 @@ const ( // Will do a lookup from cache. // Lock should be held. func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { - if seq < mb.first.seq || seq > mb.last.seq { + if seq < atomic.LoadUint64(&mb.first.seq) || seq > atomic.LoadUint64(&mb.last.seq) { return nil, ErrStoreMsgNotFound } @@ -5566,50 +5634,22 @@ func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*Store return sm, nil } -// Used to intern strings for subjects. -// Based on idea from https://github.com/josharian/intern/blob/master/intern.go -var subjPool = sync.Pool{ - New: func() any { - return make(map[string]string) - }, -} - -// Get an interned string from a byte slice. -func subjFromBytes(b []byte) string { - sm := subjPool.Get().(map[string]string) - defer subjPool.Put(sm) - subj, ok := sm[string(b)] - if ok { - return subj - } - s := string(b) - sm[s] = s - return s -} - // Given the `key` byte slice, this function will return the subject // as an interned string of `key` or a configured subject as to minimize memory allocations. +// We used to have a pool structure when we leaned on block fss, which could duplicate subjects. 
+// Now we have fs scoped PSIM that is always present and is already tracking all in-use subjects. // Lock should be held. func (fs *fileStore) subjString(skey []byte) string { if fs == nil || len(skey) == 0 { return _EMPTY_ } - - if lsubjs := len(fs.cfg.Subjects); lsubjs > 0 { - if lsubjs == 1 { - // The cast for the comparison does not make a copy - if string(skey) == fs.cfg.Subjects[0] { - return fs.cfg.Subjects[0] - } - } else { - for _, subj := range fs.cfg.Subjects { - if string(skey) == subj { - return subj - } - } + if len(fs.psim) > 0 { + // Cast in place below to avoid allocation for lookup. + if psi := fs.psim[string(skey)]; psi != nil { + return psi.subj } } - return subjFromBytes(skey) + return string(skey) } // Given the `key` byte slice, this function will return the subject @@ -5667,7 +5707,7 @@ func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err err } } if l == 0 { - _, _, l = mb.filteredPendingLocked(subj, wc, mb.first.seq) + _, _, l = mb.filteredPendingLocked(subj, wc, atomic.LoadUint64(&mb.first.seq)) } if l > 0 { if mb.cacheNotLoaded() { @@ -5774,14 +5814,14 @@ func (fs *fileStore) State() StreamState { for _, mb := range fs.blks { mb.mu.Lock() - fseq := mb.first.seq + fseq := atomic.LoadUint64(&mb.first.seq) // Account for messages missing from the head. if fseq > cur { for seq := cur; seq < fseq; seq++ { state.Deleted = append(state.Deleted, seq) } } - cur = mb.last.seq + 1 // Expected next first. + cur = atomic.LoadUint64(&mb.last.seq) + 1 // Expected next first. mb.dmap.Range(func(seq uint64) bool { if seq < fseq { @@ -5910,9 +5950,9 @@ func (mb *msgBlock) readIndexInfo() error { } mb.msgs = readCount() mb.bytes = readCount() - mb.first.seq = readSeq() + atomic.StoreUint64(&mb.first.seq, readSeq()) mb.first.ts = readTimeStamp() - mb.last.seq = readSeq() + atomic.StoreUint64(&mb.last.seq, readSeq()) mb.last.ts = readTimeStamp() dmapLen := readCount() @@ -5923,7 +5963,7 @@ func (mb *msgBlock) readIndexInfo() error { } // Check for consistency if accounting. If something is off bail and we will rebuild. - if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen { + if mb.msgs != (atomic.LoadUint64(&mb.last.seq)-atomic.LoadUint64(&mb.first.seq)+1)-dmapLen { os.Remove(ifn) return fmt.Errorf("accounting inconsistent") } @@ -5943,12 +5983,12 @@ func (mb *msgBlock) readIndexInfo() error { mb.dmap = *dmap } else { // This is the old version. - for i := 0; i < int(dmapLen); i++ { + for i, fseq := 0, atomic.LoadUint64(&mb.first.seq); i < int(dmapLen); i++ { seq := readSeq() if seq == 0 { break } - mb.dmap.Insert(seq + mb.first.seq) + mb.dmap.Insert(seq + fseq) } } } @@ -6050,7 +6090,7 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint mb.mu.Unlock() continue } - t, f, l := mb.filteredPendingLocked(subject, wc, mb.first.seq) + t, f, l := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq)) if t == 0 { mb.mu.Unlock() continue @@ -6092,7 +6132,7 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint fs.removePerSubject(sm.subj) // Check for first message. - if seq == mb.first.seq { + if seq == atomic.LoadUint64(&mb.first.seq) { mb.selectNextFirst() if mb.isEmpty() { fs.removeMsgBlock(mb) @@ -6100,7 +6140,7 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint // keep flag set, if set previously firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq } else if seq == fs.state.FirstSeq { - fs.state.FirstSeq = mb.first.seq // new one. 
+ fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one. fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() } } else { @@ -6205,14 +6245,14 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { } lmb := fs.lmb - lmb.first.seq = fs.state.FirstSeq - lmb.last.seq = fs.state.LastSeq + atomic.StoreUint64(&lmb.first.seq, fs.state.FirstSeq) + atomic.StoreUint64(&lmb.last.seq, fs.state.LastSeq) lmb.last.ts = fs.state.LastTime.UnixNano() - if fs.lmb.last.seq > 1 { + if lseq := atomic.LoadUint64(&lmb.last.seq); lseq > 1 { // Leave a tombstone so we can remember our starting sequence in case // full state becomes corrupted. - lmb.writeTombstone(fs.lmb.last.seq, fs.lmb.last.ts) + lmb.writeTombstone(lseq, lmb.last.ts) } cb := fs.scb @@ -6275,7 +6315,7 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { var isEmpty bool smb.mu.Lock() - if smb.first.seq == seq { + if atomic.LoadUint64(&smb.first.seq) == seq { goto SKIP } @@ -6285,7 +6325,7 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { goto SKIP } } - for mseq := smb.first.seq; mseq < seq; mseq++ { + for mseq := atomic.LoadUint64(&smb.first.seq); mseq < seq; mseq++ { sm, err := smb.cacheLookup(mseq, &smv) if err == errDeletedMsg { // Update dmap. @@ -6314,24 +6354,24 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { if isEmpty { smb.dirtyCloseWithRemove(true) // Update fs first here as well. - fs.state.FirstSeq = smb.last.seq + 1 + fs.state.FirstSeq = atomic.LoadUint64(&smb.last.seq) + 1 fs.state.FirstTime = time.Time{} deleted++ } else { // Make sure to sync changes. smb.needSync = true // Update fs first seq and time. - smb.first.seq = seq - 1 // Just for start condition for selectNextFirst. + atomic.StoreUint64(&smb.first.seq, seq-1) // Just for start condition for selectNextFirst. smb.selectNextFirst() - fs.state.FirstSeq = smb.first.seq + fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq) fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC() // Check if we should reclaim the head space from this block. // This will be optimistic only, so don't continue if we encounter any errors here. if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes { var moff uint32 - moff, _, _, err = smb.slotInfo(int(smb.first.seq - smb.cache.fseq)) + moff, _, _, err = smb.slotInfo(int(atomic.LoadUint64(&smb.first.seq) - smb.cache.fseq)) if err != nil || moff >= uint32(len(smb.cache.buf)) { goto SKIP } @@ -6595,12 +6635,12 @@ func (fs *fileStore) removeMsgBlock(mb *msgBlock) { fs.removeMsgBlockFromList(mb) // Check for us being last message block if mb == fs.lmb { - last := mb.last + lseq, lts := atomic.LoadUint64(&mb.last.seq), mb.last.ts // Creating a new message write block requires that the lmb lock is not held. mb.mu.Unlock() // Write the tombstone to remember since this was last block. if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { - lmb.writeTombstone(last.seq, last.ts) + lmb.writeTombstone(lseq, lts) } mb.mu.Lock() } @@ -6704,7 +6744,7 @@ func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *Si } var le = binary.LittleEndian - for slot := startSlot; slot < len(mb.cache.idx); slot++ { + for slot, fseq := startSlot, atomic.LoadUint64(&mb.first.seq); slot < len(mb.cache.idx); slot++ { bi := mb.cache.idx[slot] &^ hbit if bi == dbit { // delete marker so skip. 
@@ -6720,7 +6760,7 @@ func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *Si slen := int(le.Uint16(hdr[20:])) if subj == string(buf[msgHdrSize:msgHdrSize+slen]) { seq := le.Uint64(hdr[4:]) - if seq < mb.first.seq || seq&ebit != 0 || mb.dmap.Exists(seq) { + if seq < fseq || seq&ebit != 0 || mb.dmap.Exists(seq) { continue } ss.First = seq @@ -6766,7 +6806,7 @@ func (mb *msgBlock) generatePerSubjectInfo() error { mb.fss = make(map[string]*SimpleState) var smv StoreMsg - fseq, lseq := mb.first.seq, mb.last.seq + fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) for seq := fseq; seq <= lseq; seq++ { sm, err := mb.cacheLookup(seq, &smv) if err != nil { @@ -6829,7 +6869,7 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { info.lblk = mb.index } } else { - fs.psim[subj] = &psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index} + fs.psim[subj] = &psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index, subj: subj} fs.tsl += len(subj) } } @@ -6911,18 +6951,41 @@ func (fs *fileStore) Delete() error { return err } - err := os.RemoveAll(fs.fcfg.StoreDir) - if err == nil { - return nil + // Make sure we will not try to recover if killed before removal below completes. + if err := os.Remove(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)); err != nil { + return err } - ttl := time.Now().Add(time.Second) - for time.Now().Before(ttl) { - time.Sleep(10 * time.Millisecond) - if err = os.RemoveAll(fs.fcfg.StoreDir); err == nil { - return nil + // Now move into different directory with "." prefix. + ndir := filepath.Join(filepath.Dir(fs.fcfg.StoreDir), tsep+filepath.Base(fs.fcfg.StoreDir)) + if err := os.Rename(fs.fcfg.StoreDir, ndir); err != nil { + return err + } + // Do this in separate Go routine in case lots of blocks. + // Purge above protects us as does the removal of meta artifacts above. + go func() { + err := os.RemoveAll(ndir) + if err == nil { + return } + ttl := time.Now().Add(time.Second) + for time.Now().Before(ttl) { + time.Sleep(10 * time.Millisecond) + if err = os.RemoveAll(ndir); err == nil { + return + } + } + }() + + return nil +} + +// Lock should be held. +func (fs *fileStore) setSyncTimer() { + if fs.syncTmr != nil { + fs.syncTmr.Reset(fs.fcfg.SyncInterval) + } else { + fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) } - return err } // Lock should be held. @@ -6942,12 +7005,39 @@ const ( // This will get kicked when we create a new block or when we delete a block in general. // This is also called during Stop(). func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) { + // Make sure we do not try to write these out too fast. + const writeThreshold = time.Second * 10 + lastWrite := time.Time{} + + // We will use these to complete the full state write while not doing them too fast. 
+ var dt *time.Timer + var dtc <-chan time.Time + + defer close(done) + for { select { case <-fch: + if elapsed := time.Since(lastWrite); elapsed > writeThreshold { + fs.writeFullState() + lastWrite = time.Now() + if dt != nil { + dt.Stop() + dt, dtc = nil, nil + } + } else if dtc == nil { + fireIn := time.Until(lastWrite.Add(writeThreshold)) + if fireIn < 0 { + fireIn = 100 * time.Millisecond + } + dt = time.NewTimer(fireIn) + dtc = dt.C + } + case <-dtc: fs.writeFullState() + lastWrite = time.Now() + dt, dtc = nil, nil case <-qch: - close(done) return } } @@ -6981,6 +7071,13 @@ func (fs *fileStore) writeFullState() error { return nil } + // We track this through subsequent runs to get an avg per blk used for subsequent runs. + avgDmapLen := fs.adml + // If first time through could be 0 + if avgDmapLen == 0 && ((fs.state.LastSeq-fs.state.FirstSeq+1)-fs.state.Msgs) > 0 { + avgDmapLen = 1024 + } + // For calculating size. numSubjects := len(fs.psim) @@ -6989,10 +7086,20 @@ func (fs *fileStore) writeFullState() error { (binary.MaxVarintLen64 * 6) + // FS data binary.MaxVarintLen64 + fs.tsl + // NumSubjects + total subject length numSubjects*(binary.MaxVarintLen64*4) + // psi record - len(fs.blks)*((binary.MaxVarintLen64*6)+512) + // msg blocks, 512 is est for dmap - binary.MaxVarintLen64 + 8 // last index + checksum + binary.MaxVarintLen64 + // Num blocks. + len(fs.blks)*((binary.MaxVarintLen64*7)+avgDmapLen) + // msg blocks, avgDmapLen is est for dmaps + binary.MaxVarintLen64 + 8 + 8 // last index + record checksum + full state checksum + + // Do 4k on stack if possible. + var raw [4 * 1024]byte + var buf []byte + + if sz <= cap(raw) { + buf, sz = raw[0:2:cap(raw)], cap(raw) + } else { + buf = make([]byte, hdrLen, sz) + } - buf := make([]byte, hdrLen, sz) buf[0], buf[1] = fullStateMagic, fullStateVersion buf = binary.AppendUvarint(buf, fs.state.Msgs) buf = binary.AppendUvarint(buf, fs.state.Bytes) @@ -7026,19 +7133,21 @@ func (fs *fileStore) writeFullState() error { baseTime := timestampNormalized(fs.state.FirstTime) var scratch [8 * 1024]byte + var dmapTotalLen int for _, mb := range fs.blks { mb.mu.RLock() buf = binary.AppendUvarint(buf, uint64(mb.index)) buf = binary.AppendUvarint(buf, mb.bytes) - buf = binary.AppendUvarint(buf, mb.first.seq) + buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.first.seq)) buf = binary.AppendVarint(buf, mb.first.ts-baseTime) - buf = binary.AppendUvarint(buf, mb.last.seq) + buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.last.seq)) buf = binary.AppendVarint(buf, mb.last.ts-baseTime) numDeleted := mb.dmap.Size() buf = binary.AppendUvarint(buf, uint64(numDeleted)) if numDeleted > 0 { dmap, _ := mb.dmap.Encode(scratch[:0]) + dmapTotalLen += len(dmap) buf = append(buf, dmap...) } // If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index. @@ -7050,6 +7159,9 @@ func (fs *fileStore) writeFullState() error { } mb.mu.RUnlock() } + if dmapTotalLen > 0 { + fs.adml = dmapTotalLen / len(fs.blks) + } // Place block index and hash onto the end. buf = binary.AppendUvarint(buf, uint64(lbi)) @@ -7114,11 +7226,15 @@ func (fs *fileStore) writeFullState() error { // Stop the current filestore. func (fs *fileStore) Stop() error { fs.mu.Lock() - if fs.closed { + if fs.closed || fs.closing { fs.mu.Unlock() return ErrStoreClosed } + // Mark as closing. Do before releasing the lock to writeFullState + // so we don't end up with this function running more than once. 
+ fs.closing = true + fs.checkAndFlushAllBlocks() fs.closeAllMsgBlocks(false) @@ -7126,7 +7242,10 @@ func (fs *fileStore) Stop() error { fs.cancelAgeChk() // Release the state flusher loop. - close(fs.qch) + if fs.qch != nil { + close(fs.qch) + fs.qch = nil + } // Wait for the state flush loop to exit. fsld := fs.fsld @@ -7136,7 +7255,8 @@ func (fs *fileStore) Stop() error { fs.writeFullState() fs.mu.Lock() - // Mark as closed. + // Mark as closed. Last message block needs to be cleared after + // writeFullState has completed. fs.closed = true fs.lmb = nil @@ -7468,14 +7588,14 @@ func (fs *fileStore) deleteBlocks() DeleteBlocks { for _, mb := range fs.blks { // Detect if we have a gap between these blocks. - if prevLast > 0 && prevLast+1 != mb.first.seq { - gap := mb.first.seq - prevLast - 1 - dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: gap}) + fseq := atomic.LoadUint64(&mb.first.seq) + if prevLast > 0 && prevLast+1 != fseq { + dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: fseq - prevLast - 1}) } if mb.dmap.Size() > 0 { dbs = append(dbs, &mb.dmap) } - prevLast = mb.last.seq + prevLast = atomic.LoadUint64(&mb.last.seq) } return dbs } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go index e029f4cdf1..cc23fd4b22 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go @@ -1197,6 +1197,11 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits) erro fis, _ := os.ReadDir(sdir) for _, fi := range fis { mdir := filepath.Join(sdir, fi.Name()) + // Check for partially deleted streams. They are marked with "." prefix. + if strings.HasPrefix(fi.Name(), tsep) { + go os.RemoveAll(mdir) + continue + } key := sha256.Sum256([]byte(fi.Name())) hh, err := highwayhash.New64(key[:]) if err != nil { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go index b5436facb3..d55b3e0bcd 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go @@ -2143,7 +2143,7 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps // from underneath the one that is running since it will be the same raft node. 
defer n.Stop() - qch, lch, aq, uch, ourPeerId := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC(), meta.ID() + qch, mqch, lch, aq, uch, ourPeerId := n.QuitC(), mset.monitorQuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC(), meta.ID() s.Debugf("Starting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group()) defer s.Debugf("Exiting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group()) @@ -2249,7 +2249,7 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps startDirectAccessMonitoring := func() { if dat == nil { - dat = time.NewTicker(1 * time.Second) + dat = time.NewTicker(2 * time.Second) datc = dat.C } } @@ -2301,6 +2301,8 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps select { case <-s.quitCh: return + case <-mqch: + return case <-qch: return case <-aq.ch: @@ -2322,6 +2324,10 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps ne, nb = n.Applied(ce.Index) ce.ReturnToPool() } else { + // Our stream was closed out from underneath of us, simply return here. + if err == errStreamClosed { + return + } s.Warnf("Error applying entries to '%s > %s': %v", accName, sa.Config.Name, err) if isClusterResetErr(err) { if mset.isMirror() && mset.IsLeader() { @@ -2349,19 +2355,15 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps case isLeader = <-lch: if isLeader { - if mset != nil && n != nil { - // Send a snapshot if being asked or if we are tracking - // a failed state so that followers sync. - if clfs := mset.clearCLFS(); clfs > 0 || sendSnapshot { - n.SendSnapshot(mset.stateSnapshot()) - sendSnapshot = false - } + if mset != nil && n != nil && sendSnapshot { + n.SendSnapshot(mset.stateSnapshot()) + sendSnapshot = false } if isRestore { acc, _ := s.LookupAccount(sa.Client.serviceAccount()) restoreDoneCh = s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_) continue - } else if n.NeedSnapshot() { + } else if n != nil && n.NeedSnapshot() { doSnapshot() } // Always cancel if this was running. @@ -2388,17 +2390,22 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps // Here we are checking if we are not the leader but we have been asked to allow // direct access. We now allow non-leaders to participate in the queue group. if !isLeader && mset != nil { - startDirectAccessMonitoring() + mset.mu.RLock() + ad, md := mset.cfg.AllowDirect, mset.cfg.MirrorDirect + mset.mu.RUnlock() + if ad || md { + startDirectAccessMonitoring() + } } case <-datc: if mset == nil || isRecovering { - return + continue } // If we are leader we can stop, we know this is setup now. if isLeader { stopDirectMonitoring() - return + continue } mset.mu.Lock() @@ -2550,6 +2557,8 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps mset.setStreamAssignment(sa) // Make sure to update our updateC which would have been nil. uch = mset.updateC() + // Also update our mqch + mqch = mset.monitorQuitC() } } if err != nil { @@ -2782,6 +2791,7 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco // Grab last sequence and CLFS. last, clfs := mset.lastSeqAndCLFS() + // We can skip if we know this is less than what we already have. 
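// A sketch of the monitor loop shape after this change: besides the server
// and Raft quit channels, the loop also returns when the per-stream monitor
// quit channel (mqch) closes, so closing the stream reliably stops its
// monitor. Names and the apply callback are illustrative.
func monitorLoop(serverQuit, raftQuit, monitorQuit <-chan struct{}, applyC <-chan *CommittedEntry, apply func(*CommittedEntry)) {
	for {
		select {
		case <-serverQuit:
			return
		case <-monitorQuit:
			return
		case <-raftQuit:
			return
		case ce := <-applyC:
			apply(ce)
		}
	}
}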
if lseq-clfs < last { s.Debugf("Apply stream entries for '%s > %s' skipping message with sequence %d with last of %d", @@ -2812,13 +2822,14 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco // Process the actual message here. if err := mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts); err != nil { - // Only return in place if we are going to reset stream or we are out of space. - if isClusterResetErr(err) || isOutOfSpaceErr(err) { + // Only return in place if we are going to reset our stream or we are out of space, or we are closed. + if isClusterResetErr(err) || isOutOfSpaceErr(err) || err == errStreamClosed { return err } s.Debugf("Apply stream entries for '%s > %s' got error processing message: %v", mset.account(), mset.name(), err) } + case deleteMsgOp: md, err := decodeMsgDelete(buf[1:]) if err != nil { @@ -5950,10 +5961,13 @@ func sysRequest[T any](s *Server, subjFormat string, args ...interface{}) (*T, e } }() + ttl := time.NewTimer(2 * time.Second) + defer ttl.Stop() + select { case <-s.quitCh: return nil, errReqSrvExit - case <-time.After(2 * time.Second): + case <-ttl.C: return nil, errReqTimeout case data := <-results: return data, nil @@ -6086,6 +6100,12 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su if isReplicaChange { // We are adding new peers here. if newCfg.Replicas > len(rg.Peers) { + // Check that we have the allocation available. + if err := js.jsClusteredStreamLimitsCheck(acc, newCfg); err != nil { + resp.Error = err + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } // Check if we do not have a cluster assigned, and if we do not make sure we // try to pick one. This could happen with older streams that were assigned by // previous servers. @@ -6957,7 +6977,7 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec // Also short circuit if DeliverLastPerSubject is set with no FilterSubject. if cfg.DeliverPolicy == DeliverLastPerSubject { - if cfg.FilterSubject == _EMPTY_ { + if cfg.FilterSubject == _EMPTY_ && len(cfg.FilterSubjects) == 0 { resp.Error = NewJSConsumerInvalidPolicyError(fmt.Errorf("consumer delivery policy is deliver last per subject, but FilterSubject is not set")) s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) return @@ -7382,7 +7402,7 @@ func (mset *stream) stateSnapshotLocked() []byte { Bytes: state.Bytes, FirstSeq: state.FirstSeq, LastSeq: state.LastSeq, - Failed: mset.clfs, + Failed: mset.getCLFS(), Deleted: state.Deleted, } b, _ := json.Marshal(snap) @@ -7419,7 +7439,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ mset.mu.RLock() canRespond := !mset.cfg.NoAck && len(reply) > 0 - name, stype, store := mset.cfg.Name, mset.cfg.Storage, mset.store + name, stype := mset.cfg.Name, mset.cfg.Storage s, js, jsa, st, rf, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node maxMsgSize, lseq, clfs := int(mset.cfg.MaxMsgSize), mset.lseq, mset.clfs isLeader, isSealed := mset.isLeader(), mset.cfg.Sealed @@ -7519,26 +7539,6 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // Some header checks can be checked pre proposal. Most can not. if len(hdr) > 0 { - // For CAS operations, e.g. ExpectedLastSeqPerSubject, we can also check here and not have to go through. - // Can only precheck for seq != 0. 
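// A sketch of the timeout change made in sysRequest above: replacing
// time.After with an explicit timer plus a deferred Stop releases the timer
// as soon as another case wins, instead of leaving it allocated until it
// fires. The helper name and error values are assumptions.
func awaitResult[T any](results <-chan T, quit <-chan struct{}, d time.Duration) (T, error) {
	var zero T
	ttl := time.NewTimer(d)
	defer ttl.Stop() // release the timer promptly on early return
	select {
	case <-quit:
		return zero, errors.New("server exiting") // stand-in for errReqSrvExit
	case <-ttl.C:
		return zero, errors.New("request timed out") // stand-in for errReqTimeout
	case v := <-results:
		return v, nil
	}
}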
- if seq, exists := getExpectedLastSeqPerSubject(hdr); exists && store != nil && seq > 0 { - var smv StoreMsg - var fseq uint64 - sm, err := store.LoadLastMsg(subject, &smv) - if sm != nil { - fseq = sm.seq - } - if err != nil || fseq != seq { - if canRespond { - var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} - resp.PubAck = &PubAck{Stream: name} - resp.Error = NewJSStreamWrongLastSequenceError(fseq) - b, _ := json.Marshal(resp) - outq.sendMsg(reply, b) - } - return fmt.Errorf("last sequence by subject mismatch: %d vs %d", seq, fseq) - } - } // Expected stream name can also be pre-checked. if sname := getExpectedStream(hdr); sname != _EMPTY_ && sname != name { if canRespond { @@ -7746,8 +7746,8 @@ func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) { mset.mu.Lock() var state StreamState - mset.clfs = snap.Failed mset.store.FastState(&state) + mset.setCLFS(snap.Failed) sreq := mset.calculateSyncRequest(&state, snap) s, js, subject, n := mset.srv, mset.js, mset.sa.Sync, mset.node diff --git a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go index d5d41c5336..97fa8b4197 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go @@ -25,6 +25,7 @@ import ( "net/http" "net/url" "os" + "path" "reflect" "regexp" "runtime" @@ -2349,7 +2350,7 @@ func (c *client) processLeafSub(argo []byte) (err error) { // Only add in shadow subs if a new sub or qsub. if osub == nil { - if err := c.addShadowSubscriptions(acc, sub); err != nil { + if err := c.addShadowSubscriptions(acc, sub, true); err != nil { c.Errorf(err.Error()) } } @@ -2784,14 +2785,16 @@ func (c *client) leafNodeSolicitWSConnection(opts *Options, rURL *url.URL, remot // create a LEAF connection, not a CLIENT. // In case we use the user's URL path in the future, make sure we append the user's // path to our `/leafnode` path. 
- path := leafNodeWSPath + lpath := leafNodeWSPath if curPath := rURL.EscapedPath(); curPath != _EMPTY_ { if curPath[0] == '/' { curPath = curPath[1:] } - path += curPath + lpath = path.Join(curPath, lpath) + } else { + lpath = lpath[1:] } - ustr := fmt.Sprintf("%s://%s%s", scheme, rURL.Host, path) + ustr := fmt.Sprintf("%s://%s/%s", scheme, rURL.Host, lpath) u, _ := url.Parse(ustr) req := &http.Request{ Method: "GET", diff --git a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go index 0d037be673..adf660846c 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go @@ -129,7 +129,7 @@ func (ms *memStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int return ErrMaxMsgs } } - if ms.cfg.MaxBytes > 0 && ms.state.Bytes+uint64(len(msg)+len(hdr)) >= uint64(ms.cfg.MaxBytes) { + if ms.cfg.MaxBytes > 0 && ms.state.Bytes+memStoreMsgSize(subj, hdr, msg) >= uint64(ms.cfg.MaxBytes) { if !asl { return ErrMaxBytes } @@ -138,7 +138,7 @@ func (ms *memStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int ms.recalculateFirstForSubj(subj, ss.First, ss) } sm, ok := ms.msgs[ss.First] - if !ok || memStoreMsgSize(sm.subj, sm.hdr, sm.msg) < uint64(len(msg)+len(hdr)) { + if !ok || memStoreMsgSize(sm.subj, sm.hdr, sm.msg) < memStoreMsgSize(subj, hdr, msg) { return ErrMaxBytes } } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go b/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go index 7302722007..b347936f5d 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go @@ -155,6 +155,7 @@ const ( // while "$MQTT.JSA..SL." is for a stream lookup, etc... mqttJSAIdTokenPos = 3 mqttJSATokenPos = 4 + mqttJSAClientIDPos = 5 mqttJSAStreamCreate = "SC" mqttJSAStreamUpdate = "SU" mqttJSAStreamLookup = "SL" @@ -237,10 +238,9 @@ type mqttAccountSessionManager struct { sl *Sublist // sublist allowing to find retained messages for given subscription retmsgs map[string]*mqttRetainedMsgRef // retained messages jsa mqttJSA - rrmLastSeq uint64 // Restore retained messages expected last sequence - rrmDoneCh chan struct{} // To notify the caller that all retained messages have been loaded - sp *ipQueue[uint64] // Used for cluster-wide processing of session records being persisted - domainTk string // Domain (with trailing "."), or possibly empty. This is added to session subject. + rrmLastSeq uint64 // Restore retained messages expected last sequence + rrmDoneCh chan struct{} // To notify the caller that all retained messages have been loaded + domainTk string // Domain (with trailing "."), or possibly empty. This is added to session subject. } type mqttJSA struct { @@ -1109,7 +1109,6 @@ func (s *Server) mqttCreateAccountSessionManager(acc *Account, quitCh chan struc nuid: nuid.New(), quitCh: quitCh, }, - sp: newIPQueue[uint64](s, qname+"sp"), } // TODO record domain name in as here @@ -1170,14 +1169,15 @@ func (s *Server) mqttCreateAccountSessionManager(acc *Account, quitCh chan struc // This is a subscription that will process all JS API replies. We could split to // individual subscriptions if needed, but since there is a bit of common code, // that seemed like a good idea to be all in one place. 
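// A sketch of the accounting fix in the memstore.go hunk above: the MaxBytes
// check and the "can evicting the oldest message make room" check now use
// the same per-message size estimate (subject + headers + payload + fixed
// record overhead) on both sides of the comparison. The overhead constant
// here is an assumption for illustration.
func estMsgSize(subj string, hdr, msg []byte) uint64 {
	const recordOverhead = 16 // assumed fixed per-record bookkeeping
	return uint64(len(subj)+len(hdr)+len(msg)) + recordOverhead
}

func wouldExceed(curBytes, maxBytes uint64, subj string, hdr, msg []byte) bool {
	return maxBytes > 0 && curBytes+estMsgSize(subj, hdr, msg) >= maxBytes
}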
- if err := as.createSubscription(jsa.rplyr+"*.*", + if err := as.createSubscription(jsa.rplyr+">", as.processJSAPIReplies, &sid, &subs); err != nil { return nil, err } // We will listen for replies to session persist requests so that we can // detect the use of a session with the same client ID anywhere in the cluster. - if err := as.createSubscription(mqttJSARepliesPrefix+"*."+mqttJSASessPersist+".*", + // `$MQTT.JSA.{js-id}.SP.{client-id-hash}.{uuid}` + if err := as.createSubscription(mqttJSARepliesPrefix+"*."+mqttJSASessPersist+".*.*", as.processSessionPersist, &sid, &subs); err != nil { return nil, err } @@ -1203,12 +1203,6 @@ func (s *Server) mqttCreateAccountSessionManager(acc *Account, quitCh chan struc as.sendJSAPIrequests(s, c, accName, closeCh) }) - // Start the go routine that will handle network updates regarding sessions - s.startGoRoutine(func() { - defer s.grWG.Done() - as.sessPersistProcessing(closeCh) - }) - lookupStream := func(stream, txt string) (*StreamInfo, error) { si, err := jsa.lookupStream(stream) if err != nil { @@ -1407,9 +1401,12 @@ func (s *Server) mqttCreateAccountSessionManager(acc *Account, quitCh chan struc } if lastSeq > 0 { + ttl := time.NewTimer(mqttJSAPITimeout) + defer ttl.Stop() + select { case <-rmDoneCh: - case <-time.After(mqttJSAPITimeout): + case <-ttl.C: s.Warnf("Timing out waiting to load %v retained messages", st.Msgs) case <-quitCh: return nil, ErrServerNotRunning @@ -1454,7 +1451,7 @@ func (s *Server) mqttDetermineReplicas() int { ////////////////////////////////////////////////////////////////////////////// func (jsa *mqttJSA) newRequest(kind, subject string, hdr int, msg []byte) (interface{}, error) { - return jsa.newRequestEx(kind, subject, hdr, msg, mqttJSAPITimeout) + return jsa.newRequestEx(kind, subject, _EMPTY_, hdr, msg, mqttJSAPITimeout) } func (jsa *mqttJSA) prefixDomain(subject string) string { @@ -1467,19 +1464,24 @@ func (jsa *mqttJSA) prefixDomain(subject string) string { return subject } -func (jsa *mqttJSA) newRequestEx(kind, subject string, hdr int, msg []byte, timeout time.Duration) (interface{}, error) { +func (jsa *mqttJSA) newRequestEx(kind, subject, cidHash string, hdr int, msg []byte, timeout time.Duration) (interface{}, error) { + var sb strings.Builder jsa.mu.Lock() // Either we use nuid.Next() which uses a global lock, or our own nuid object, but // then it needs to be "write" protected. This approach will reduce across account // contention since we won't use the global nuid's lock. - var sb strings.Builder sb.WriteString(jsa.rplyr) sb.WriteString(kind) sb.WriteByte(btsep) + if cidHash != _EMPTY_ { + sb.WriteString(cidHash) + sb.WriteByte(btsep) + } sb.WriteString(jsa.nuid.Next()) - reply := sb.String() jsa.mu.Unlock() + reply := sb.String() + ch := make(chan interface{}, 1) jsa.replies.Store(reply, ch) @@ -1646,6 +1648,25 @@ func (jsa *mqttJSA) storeMsgWithKind(kind, subject string, headers int, msg []by return smr, smr.ToError() } +func (jsa *mqttJSA) storeSessionMsg(domainTk, cidHash string, hdr int, msg []byte) (*JSPubAckResponse, error) { + // Compute subject where the session is being stored + subject := mqttSessStreamSubjectPrefix + domainTk + cidHash + + // Passing cidHash will add it to the JS reply subject, so that we can use + // it in processSessionPersist. 
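// A sketch of the reply-subject layout used by newRequestEx above: for
// session persist requests an extra client-ID hash token is placed between
// the request kind and the unique suffix, e.g.
// "$MQTT.JSA.{js-id}.SP.{client-id-hash}.{uuid}", which lets
// processSessionPersist recover the client ID from the subject alone.
// Parameter names are illustrative.
func buildReply(replyPrefix, kind, cidHash, unique string) string {
	var sb strings.Builder
	sb.WriteString(replyPrefix) // e.g. "$MQTT.JSA.{js-id}."
	sb.WriteString(kind)        // e.g. "SP"
	sb.WriteByte('.')
	if cidHash != "" {
		sb.WriteString(cidHash)
		sb.WriteByte('.')
	}
	sb.WriteString(unique)
	return sb.String()
}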
+ smri, err := jsa.newRequestEx(mqttJSASessPersist, subject, cidHash, hdr, msg, mqttJSAPITimeout) + if err != nil { + return nil, err + } + smr := smri.(*JSPubAckResponse) + return smr, smr.ToError() +} + +func (jsa *mqttJSA) loadSessionMsg(domainTk, cidHash string) (*StoredMsg, error) { + subject := mqttSessStreamSubjectPrefix + domainTk + cidHash + return jsa.loadLastMsgFor(mqttSessStreamName, subject) +} + func (jsa *mqttJSA) deleteMsg(stream string, seq uint64, wait bool) error { dreq := JSApiMsgDeleteRequest{Seq: seq, NoErase: true} req, _ := json.Marshal(dreq) @@ -1817,6 +1838,7 @@ func (as *mqttAccountSessionManager) processSessionPersist(_ *subscription, pc * if tokenAt(subject, mqttJSAIdTokenPos) == as.jsa.id { return } + cIDHash := tokenAt(subject, mqttJSAClientIDPos) _, msg := pc.msgParts(rmsg) if len(msg) < LEN_CR_LF { return @@ -1839,18 +1861,6 @@ func (as *mqttAccountSessionManager) processSessionPersist(_ *subscription, pc * if ignore { return } - // We would need to lookup the message and that would be a request/reply, - // which we can't do in place here. So move that to a long-running routine - // that will process the session persist record. - as.sp.push(par.Sequence) -} - -func (as *mqttAccountSessionManager) processSessPersistRecord(seq uint64) { - smsg, err := as.jsa.loadMsg(mqttSessStreamName, seq) - if err != nil { - return - } - cIDHash := strings.TrimPrefix(smsg.Subject, mqttSessStreamSubjectPrefix+as.domainTk) as.mu.Lock() defer as.mu.Unlock() @@ -1861,7 +1871,7 @@ func (as *mqttAccountSessionManager) processSessPersistRecord(seq uint64) { // If our current session's stream sequence is higher, it means that this // update is stale, so we don't do anything here. sess.mu.Lock() - ignore := seq < sess.seq + ignore = par.Sequence < sess.seq sess.mu.Unlock() if ignore { return @@ -1881,28 +1891,6 @@ func (as *mqttAccountSessionManager) processSessPersistRecord(seq uint64) { sess.mu.Unlock() } -func (as *mqttAccountSessionManager) sessPersistProcessing(closeCh chan struct{}) { - as.mu.RLock() - sp := as.sp - quitCh := as.jsa.quitCh - as.mu.RUnlock() - - for { - select { - case <-sp.ch: - seqs := sp.pop() - for _, seq := range seqs { - as.processSessPersistRecord(seq) - } - sp.recycle(&seqs) - case <-closeCh: - return - case <-quitCh: - return - } - } -} - // Adds this client ID to the flappers map, and if needed start the timer // for map cleanup. // @@ -2176,6 +2164,30 @@ func (as *mqttAccountSessionManager) removeSession(sess *mqttSession, lock bool) } } +// Helpers that sets the sub's mqtt fields and possibly serialize +// (pre-loaded) retained messages. +// Session lock held on entry. +func (sess *mqttSession) processSub(c *client, subject, sid []byte, isReserved bool, qos byte, jsDurName string, h msgHandler, initShadow bool) (*subscription, error) { + sub, err := c.processSub(subject, nil, sid, h, false) + if err != nil { + // c.processSub already called c.Errorf(), so no need here. + return nil, err + } + subs := []*subscription{sub} + if initShadow { + subs = append(subs, sub.shadow...) + } + for _, ss := range subs { + if ss.mqtt == nil { + ss.mqtt = &mqttSub{} + } + ss.mqtt.qos = qos + ss.mqtt.reserved = isReserved + ss.mqtt.jsDur = jsDurName + } + return sub, nil +} + // Process subscriptions for the given session/client. 
// // When `fromSubProto` is false, it means that this is invoked from the CONNECT @@ -2193,14 +2205,85 @@ func (as *mqttAccountSessionManager) removeSession(sess *mqttSession, lock bool) func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, filters []*mqttFilter, fromSubProto, trace bool) ([]*subscription, error) { - // Helpers to lock/unlock both account manager and session. - asAndSessLock := func() { - as.mu.Lock() - sess.mu.Lock() + // Helper to determine if we need to create a separate top-level + // subscription for a wildcard. + fwc := func(subject string) (bool, string, string) { + if !mqttNeedSubForLevelUp(subject) { + return false, _EMPTY_, _EMPTY_ + } + // Say subject is "foo.>", remove the ".>" so that it becomes "foo" + fwcsubject := subject[:len(subject)-2] + // Change the sid to "foo fwc" + fwcsid := fwcsubject + mqttMultiLevelSidSuffix + + return true, fwcsubject, fwcsid } - asAndSessUnlock := func() { - sess.mu.Unlock() - as.mu.Unlock() + + // Cache and a helper to load retained messages for a given subject. + rms := make(map[string]*mqttRetainedMsg) + loadRMS := func(subject []byte) error { + sub := &subscription{ + client: c, + subject: subject, + sid: subject, + } + c.mu.Lock() + acc := c.acc + c.mu.Unlock() + if err := c.addShadowSubscriptions(acc, sub, false); err != nil { + return err + } + // Best-effort loading the messages, logs on errors (to c.srv), loads + // once for subject. + as.loadRetainedMessagesForSubject(rms, subject, c.srv) + for _, ss := range sub.shadow { + as.loadRetainedMessagesForSubject(rms, ss.subject, c.srv) + } + return nil + } + + // Preload retained messages for all requested subscriptions. Also, since + // it's the first iteration over the filter list, do some cleanup. + for _, f := range filters { + if f.qos > 2 { + f.qos = 2 + } + if c.mqtt.downgradeQoS2Sub && f.qos == 2 { + c.Warnf("Downgrading subscription QoS2 to QoS1 for %q, as configured", f.filter) + f.qos = 1 + } + + // Do not allow subscribing to our internal subjects. + // + // TODO: (levb: not sure why since one can subscribe to `#` and it'll + // include everything; I guess this would discourage? Otherwise another + // candidate for DO NOT DELIVER prefix list). + if strings.HasPrefix(f.filter, mqttSubPrefix) { + f.qos = mqttSubAckFailure + continue + } + + if f.qos == 2 { + if err := sess.ensurePubRelConsumerSubscription(c); err != nil { + c.Errorf("failed to initialize PUBREL processing: %v", err) + f.qos = mqttSubAckFailure + continue + } + } + + // Load retained messages. + if fromSubProto { + if err := loadRMS([]byte(f.filter)); err != nil { + f.qos = mqttSubAckFailure + continue + } + if need, subject, _ := fwc(f.filter); need { + if err := loadRMS([]byte(subject)); err != nil { + f.qos = mqttSubAckFailure + continue + } + } + } } // Small helper to add the consumer config to the session. @@ -2214,90 +2297,80 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, sess.cons[sid] = cc } - // Helper that sets the sub's mqtt fields and possibly serialize retained messages. - // Assumes account manager and session lock held. - setupSub := func(sub *subscription, qos byte) { - subs := []*subscription{sub} - if len(sub.shadow) > 0 { - subs = append(subs, sub.shadow...) 
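// A sketch of the "level up" handling behind the fwc helper above: an MQTT
// filter such as "foo/#" (mapped to "foo.>") also matches "foo" itself, so a
// second subscription on the parent subject is needed, with a distinct sid
// suffix. The suffix constant and the suffix-based check are assumptions
// standing in for mqttMultiLevelSidSuffix and mqttNeedSubForLevelUp.
func levelUpSubject(subject string) (need bool, fwcSubject, fwcSid string) {
	const multiLevelSidSuffix = " fwc" // assumed suffix value
	if !strings.HasSuffix(subject, ".>") {
		return false, "", ""
	}
	fwcSubject = strings.TrimSuffix(subject, ".>")
	return true, fwcSubject, fwcSubject + multiLevelSidSuffix
}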
- } - for _, sub := range subs { - if sub.mqtt == nil { - sub.mqtt = &mqttSub{} - } - sub.mqtt.qos = qos - sub.mqtt.reserved = isMQTTReservedSubscription(string(sub.subject)) - if fromSubProto { - as.serializeRetainedMsgsForSub(sess, c, sub, trace) - } + serializeRMS := func(sub *subscription) { + for _, ss := range append([]*subscription{sub}, sub.shadow...) { + as.serializeRetainedMsgsForSub(rms, sess, c, ss, trace) } } var err error subs := make([]*subscription, 0, len(filters)) for _, f := range filters { - if f.qos > 2 { - f.qos = 2 - } - if c.mqtt.downgradeQoS2Sub && f.qos == 2 { - c.Warnf("Downgrading subscription QoS2 to QoS1 for %q, as configured", f.filter) - f.qos = 1 - } - subject := f.filter - sid := subject - if strings.HasPrefix(subject, mqttSubPrefix) { - f.qos = mqttSubAckFailure + // Skip what's already been identified as a failure. + if f.qos == mqttSubAckFailure { continue } + subject := f.filter + bsubject := []byte(subject) + sid := subject + bsid := bsubject var jscons *ConsumerConfig var jssub *subscription - // Note that if a subscription already exists on this subject, - // the existing sub is returned. Need to update the qos. - asAndSessLock() - sub, err := c.processSub([]byte(subject), nil, []byte(sid), mqttDeliverMsgCbQoS0, false) - if err == nil { - setupSub(sub, f.qos) - } - if f.qos == 2 { - err = sess.ensurePubRelConsumerSubscription(c) - } - asAndSessUnlock() - - if err == nil { - // This will create (if not already exist) a JS consumer for subscriptions - // of QoS >= 1. But if a JS consumer already exists and the subscription - // for same subject is now a QoS==0, then the JS consumer will be deleted. - jscons, jssub, err = sess.processJSConsumer(c, subject, sid, f.qos, fromSubProto) + // Note that if a subscription already exists on this subject, the + // existing sub is returned. Need to update the qos. + as.mu.Lock() + sess.mu.Lock() + sub, err := sess.processSub(c, bsubject, bsid, + isMQTTReservedSubscription(subject), f.qos, _EMPTY_, mqttDeliverMsgCbQoS0, true) + if err == nil && fromSubProto { + serializeRMS(sub) } + sess.mu.Unlock() + as.mu.Unlock() if err != nil { - // c.processSub already called c.Errorf(), so no need here. f.qos = mqttSubAckFailure sess.cleanupFailedSub(c, sub, jscons, jssub) continue } - if mqttNeedSubForLevelUp(subject) { + // This will create (if not already exist) a JS consumer for + // subscriptions of QoS >= 1. But if a JS consumer already exists and + // the subscription for same subject is now a QoS==0, then the JS + // consumer will be deleted. + jscons, jssub, err = sess.processJSConsumer(c, subject, sid, f.qos, fromSubProto) + if err != nil { + f.qos = mqttSubAckFailure + sess.cleanupFailedSub(c, sub, jscons, jssub) + continue + } + + // Process the wildcard subject if needed. + if need, fwcsubject, fwcsid := fwc(subject); need { var fwjscons *ConsumerConfig var fwjssub *subscription var fwcsub *subscription - // Say subject is "foo.>", remove the ".>" so that it becomes "foo" - fwcsubject := subject[:len(subject)-2] - // Change the sid to "foo fwc" - fwcsid := fwcsubject + mqttMultiLevelSidSuffix // See note above about existing subscription. 
- asAndSessLock() - fwcsub, err = c.processSub([]byte(fwcsubject), nil, []byte(fwcsid), mqttDeliverMsgCbQoS0, false) - if err == nil { - setupSub(fwcsub, f.qos) + as.mu.Lock() + sess.mu.Lock() + fwcsub, err = sess.processSub(c, []byte(fwcsubject), []byte(fwcsid), + isMQTTReservedSubscription(subject), f.qos, _EMPTY_, mqttDeliverMsgCbQoS0, true) + if err == nil && fromSubProto { + serializeRMS(fwcsub) } - asAndSessUnlock() - if err == nil { - fwjscons, fwjssub, err = sess.processJSConsumer(c, fwcsubject, fwcsid, f.qos, fromSubProto) + sess.mu.Unlock() + as.mu.Unlock() + if err != nil { + // c.processSub already called c.Errorf(), so no need here. + f.qos = mqttSubAckFailure + sess.cleanupFailedSub(c, sub, jscons, jssub) + continue } + + fwjscons, fwjssub, err = sess.processJSConsumer(c, fwcsubject, fwcsid, f.qos, fromSubProto) if err != nil { // c.processSub already called c.Errorf(), so no need here. f.qos = mqttSubAckFailure @@ -2305,6 +2378,7 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, sess.cleanupFailedSub(c, fwcsub, fwjscons, fwjssub) continue } + subs = append(subs, fwcsub) addJSConsToSess(fwcsid, fwjscons) } @@ -2328,15 +2402,19 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, // Runs from the client's readLoop. // Account session manager lock held on entry. // Session lock held on entry. -func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(sess *mqttSession, c *client, sub *subscription, trace bool) { - if len(as.retmsgs) == 0 { +func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(rms map[string]*mqttRetainedMsg, sess *mqttSession, c *client, sub *subscription, trace bool) { + if len(as.retmsgs) == 0 || len(rms) == 0 { return } - var rmsa [64]*mqttRetainedMsg - rms := rmsa[:0] - - as.getRetainedPublishMsgs(string(sub.subject), &rms) - for _, rm := range rms { + result := as.sl.ReverseMatch(string(sub.subject)) + if len(result.psubs) == 0 { + return + } + for _, psub := range result.psubs { + rm, ok := rms[string(psub.subject)] + if !ok { + continue + } if sub.mqtt.prm == nil { sub.mqtt.prm = &mqttWriter{} } @@ -2379,23 +2457,36 @@ func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(sess *mqttSessi // Returns in the provided slice all publish retained message records that // match the given subscription's `subject` (which could have wildcards). // -// Account session manager lock held on entry. -func (as *mqttAccountSessionManager) getRetainedPublishMsgs(subject string, rms *[]*mqttRetainedMsg) { - result := as.sl.ReverseMatch(subject) +// Account session manager NOT lock held on entry. 
+func (as *mqttAccountSessionManager) loadRetainedMessagesForSubject(rms map[string]*mqttRetainedMsg, topSubject []byte, log Logger) { + as.mu.RLock() + if len(as.retmsgs) == 0 { + as.mu.RUnlock() + return + } + result := as.sl.ReverseMatch(string(topSubject)) + as.mu.RUnlock() + if len(result.psubs) == 0 { return } for _, sub := range result.psubs { - subj := mqttRetainedMsgsStreamSubject + string(sub.subject) - jsm, err := as.jsa.loadLastMsgFor(mqttRetainedMsgsStreamName, subj) + subject := string(sub.subject) + if rms[subject] != nil { + continue // already loaded + } + loadSubject := mqttRetainedMsgsStreamSubject + subject + jsm, err := as.jsa.loadLastMsgFor(mqttRetainedMsgsStreamName, loadSubject) if err != nil || jsm == nil { + log.Warnf("failed to load retained message for subject %q: %v", loadSubject, err) continue } var rm mqttRetainedMsg if err := json.Unmarshal(jsm.Data, &rm); err != nil { + log.Warnf("failed to decode retained message for subject %q: %v", loadSubject, err) continue } - *rms = append(*rms, &rm) + rms[subject] = &rm } } @@ -2417,8 +2508,7 @@ func (as *mqttAccountSessionManager) createOrRestoreSession(clientID string, opt } hash := getHash(clientID) - subject := mqttSessStreamSubjectPrefix + as.domainTk + hash - smsg, err := jsa.loadLastMsgFor(mqttSessStreamName, subject) + smsg, err := jsa.loadSessionMsg(as.domainTk, hash) if err != nil { if isErrorOtherThan(err, JSNoMessageFoundErr) { return formatError("loading session record", err) @@ -2434,6 +2524,7 @@ func (as *mqttAccountSessionManager) createOrRestoreSession(clientID string, opt if err := json.Unmarshal(smsg.Data, ps); err != nil { return formatError(fmt.Sprintf("unmarshal of session record at sequence %v", smsg.Sequence), err) } + // Restore this session (even if we don't own it), the caller will do the right thing. sess := mqttSessionCreate(jsa, clientID, hash, smsg.Sequence, opts) sess.domainTk = as.domainTk @@ -2479,7 +2570,7 @@ func (as *mqttAccountSessionManager) transferUniqueSessStreamsToMuxed(log *Serve }() jsa := &as.jsa - sni, err := jsa.newRequestEx(mqttJSAStreamNames, JSApiStreams, 0, nil, 5*time.Second) + sni, err := jsa.newRequestEx(mqttJSAStreamNames, JSApiStreams, _EMPTY_, 0, nil, 5*time.Second) if err != nil { log.Errorf("Unable to transfer MQTT session streams: %v", err) return @@ -2514,10 +2605,8 @@ func (as *mqttAccountSessionManager) transferUniqueSessStreamsToMuxed(log *Serve log.Warnf(" Unable to unmarshal the content of this stream, may not be a legitimate MQTT session stream, skipping") continue } - // Compute subject where the session is being stored - subject := mqttSessStreamSubjectPrefix + as.domainTk + getHash(ps.ID) // Store record to MQTT session stream - if _, err := jsa.storeMsgWithKind(mqttJSASessPersist, subject, 0, smsg.Data); err != nil { + if _, err := jsa.storeSessionMsg(as.domainTk, getHash(ps.ID), 0, smsg.Data); err != nil { log.Errorf(" Unable to transfer the session record: %v", err) return } @@ -2553,7 +2642,8 @@ func (as *mqttAccountSessionManager) transferRetainedToPerKeySubjectStream(log * } // Store the message again, this time with the new per-key subject. 
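// A sketch of the load-once behaviour of loadRetainedMessagesForSubject
// above: within a single SUBSCRIBE packet, each retained-message subject is
// fetched from JetStream at most once and then reused from the rms map for
// every matching subscription. The loader signature is an assumption.
func loadOnce(rms map[string]*mqttRetainedMsg, subject string, load func(string) (*mqttRetainedMsg, error)) *mqttRetainedMsg {
	if rm, ok := rms[subject]; ok {
		return rm // already loaded for this packet
	}
	rm, err := load(subject)
	if err != nil || rm == nil {
		return nil // best effort; caller logs on error
	}
	rms[subject] = rm
	return rm
}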
subject := mqttRetainedMsgsStreamSubject + rmsg.Subject - if _, err := jsa.storeMsgWithKind(mqttJSASessPersist, subject, 0, smsg.Data); err != nil { + + if _, err := jsa.storeMsg(subject, 0, smsg.Data); err != nil { log.Errorf(" Unable to transfer the retained message with sequence %d: %v", smsg.Sequence, err) errors++ continue @@ -2619,7 +2709,7 @@ func (sess *mqttSession) save() error { } b, _ := json.Marshal(&ps) - subject := mqttSessStreamSubjectPrefix + sess.domainTk + sess.idHash + domainTk, cidHash := sess.domainTk, sess.idHash seq := sess.seq sess.mu.Unlock() @@ -2637,7 +2727,7 @@ func (sess *mqttSession) save() error { b = bb.Bytes() } - resp, err := sess.jsa.storeMsgWithKind(mqttJSASessPersist, subject, hdr, b) + resp, err := sess.jsa.storeSessionMsg(domainTk, cidHash, hdr, b) if err != nil { return fmt.Errorf("unable to persist session %q (seq=%v): %v", ps.ID, seq, err) } @@ -2691,8 +2781,13 @@ func (sess *mqttSession) clear() error { } if seq > 0 { - if err := sess.jsa.deleteMsg(mqttSessStreamName, seq, true); err != nil { - return fmt.Errorf("unable to delete session %q record at sequence %v", id, seq) + err := sess.jsa.deleteMsg(mqttSessStreamName, seq, true) + // Ignore the various errors indicating that the message (or sequence) + // is already deleted, can happen in a cluster. + if isErrorOtherThan(err, JSSequenceNotFoundErrF) { + if isErrorOtherThan(err, JSStreamMsgDeleteFailedF) || !strings.Contains(err.Error(), ErrStoreMsgNotFound.Error()) { + return fmt.Errorf("unable to delete session %q record at sequence %v: %v", id, seq, err) + } } } return nil @@ -3149,6 +3244,9 @@ func (c *client) mqttConnectTrace(cp *mqttConnectProto) string { trace += fmt.Sprintf(" will=(topic=%s QoS=%v retain=%v)", cp.will.topic, cp.will.qos, cp.will.retain) } + if cp.flags&mqttConnFlagCleanSession != 0 { + trace += " clean" + } if c.opts.Username != _EMPTY_ { trace += fmt.Sprintf(" username=%s", c.opts.Username) } @@ -4349,11 +4447,11 @@ func mqttIsReservedSub(sub *subscription, subject string) bool { // Check if a sub is a reserved wildcard. E.g. '#', '*', or '*/" prefix. func isMQTTReservedSubscription(subject string) bool { - if len(subject) == 1 && subject[0] == fwc || subject[0] == pwc { + if len(subject) == 1 && (subject[0] == fwc || subject[0] == pwc) { return true } // Match "*.<>" - if len(subject) > 1 && subject[0] == pwc && subject[1] == btsep { + if len(subject) > 1 && (subject[0] == pwc && subject[1] == btsep) { return true } return false @@ -4467,9 +4565,6 @@ func (sess *mqttSession) cleanupFailedSub(c *client, sub *subscription, cc *Cons // Make sure we are set up to deliver PUBREL messages to this QoS2-subscribed // session. -// -// Session lock held on entry. Need to make sure no other subscribe packet races -// to do the same. func (sess *mqttSession) ensurePubRelConsumerSubscription(c *client) error { opts := c.srv.getOpts() ackWait := opts.MQTT.AckWait @@ -4481,21 +4576,32 @@ func (sess *mqttSession) ensurePubRelConsumerSubscription(c *client) error { maxAckPending = mqttDefaultMaxAckPending } + sess.mu.Lock() + pubRelSubscribed := sess.pubRelSubscribed + pubRelSubject := sess.pubRelSubject + pubRelDeliverySubjectB := sess.pubRelDeliverySubjectB + pubRelDeliverySubject := sess.pubRelDeliverySubject + pubRelConsumer := sess.pubRelConsumer + tmaxack := sess.tmaxack + idHash := sess.idHash + id := sess.id + sess.mu.Unlock() + // Subscribe before the consumer is created so we don't loose any messages. 
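// The added parentheses in isMQTTReservedSubscription above change the
// meaning: in Go, && binds tighter than ||, so without them the length guard
// only applied to the first comparison. A minimal illustration with
// placeholder wildcard characters:
func firstByteIsWildcard(s string) bool {
	// Buggy form: parsed as (len(s) == 1 && s[0] == '#') || s[0] == '+',
	// which tests s[0] == '+' even when len(s) != 1.
	// return len(s) == 1 && s[0] == '#' || s[0] == '+'

	// Intended form: the length guard covers both comparisons.
	return len(s) == 1 && (s[0] == '#' || s[0] == '+')
}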
- if !sess.pubRelSubscribed { - _, err := c.processSub(sess.pubRelDeliverySubjectB, nil, sess.pubRelDeliverySubjectB, + if !pubRelSubscribed { + _, err := c.processSub(pubRelDeliverySubjectB, nil, pubRelDeliverySubjectB, mqttDeliverPubRelCb, false) if err != nil { - c.Errorf("Unable to create subscription for JetStream consumer on %q: %v", sess.pubRelDeliverySubject, err) + c.Errorf("Unable to create subscription for JetStream consumer on %q: %v", pubRelDeliverySubject, err) return err } - sess.pubRelSubscribed = true + pubRelSubscribed = true } // Create the consumer if needed. - if sess.pubRelConsumer == nil { + if pubRelConsumer == nil { // Check that the limit of subs' maxAckPending are not going over the limit - if after := sess.tmaxack + maxAckPending; after > mqttMaxAckTotalLimit { + if after := tmaxack + maxAckPending; after > mqttMaxAckTotalLimit { return fmt.Errorf("max_ack_pending for all consumers would be %v which exceeds the limit of %v", after, mqttMaxAckTotalLimit) } @@ -4503,11 +4609,11 @@ func (sess *mqttSession) ensurePubRelConsumerSubscription(c *client) error { ccr := &CreateConsumerRequest{ Stream: mqttOutStreamName, Config: ConsumerConfig{ - DeliverSubject: sess.pubRelDeliverySubject, - Durable: mqttPubRelConsumerDurablePrefix + sess.idHash, + DeliverSubject: pubRelDeliverySubject, + Durable: mqttPubRelConsumerDurablePrefix + idHash, AckPolicy: AckExplicit, DeliverPolicy: DeliverNew, - FilterSubject: sess.pubRelSubject, + FilterSubject: pubRelSubject, AckWait: ackWait, MaxAckPending: maxAckPending, MemoryStorage: opts.MQTT.ConsumerMemoryStorage, @@ -4517,28 +4623,41 @@ func (sess *mqttSession) ensurePubRelConsumerSubscription(c *client) error { ccr.Config.InactiveThreshold = opts.MQTT.ConsumerInactiveThreshold } if _, err := sess.jsa.createConsumer(ccr); err != nil { - c.Errorf("Unable to add JetStream consumer for PUBREL for client %q: err=%v", sess.id, err) + c.Errorf("Unable to add JetStream consumer for PUBREL for client %q: err=%v", id, err) return err } - sess.pubRelConsumer = &ccr.Config - sess.tmaxack += maxAckPending + pubRelConsumer = &ccr.Config + tmaxack += maxAckPending } + sess.mu.Lock() + sess.pubRelSubscribed = pubRelSubscribed + sess.pubRelConsumer = pubRelConsumer + sess.tmaxack = tmaxack + sess.mu.Unlock() + return nil } // When invoked with a QoS of 0, looks for an existing JS durable consumer for // the given sid and if one is found, delete the JS durable consumer and unsub // the NATS subscription on the delivery subject. +// // With a QoS > 0, creates or update the existing JS durable consumer along with // its NATS subscription on a delivery subject. // -// Lock not held on entry, but session is in the locked map. +// Session lock is acquired and released as needed. Session is in the locked +// map. func (sess *mqttSession) processJSConsumer(c *client, subject, sid string, qos byte, fromSubProto bool) (*ConsumerConfig, *subscription, error) { - // Check if we are already a JS consumer for this SID. + sess.mu.Lock() cc, exists := sess.cons[sid] + tmaxack := sess.tmaxack + idHash := sess.idHash + sess.mu.Unlock() + + // Check if we are already a JS consumer for this SID. if exists { // If current QoS is 0, it means that we need to delete the existing // one (that was QoS > 0) @@ -4547,7 +4666,11 @@ func (sess *mqttSession) processJSConsumer(c *client, subject, sid string, // the form: mqttSubPrefix + . It is also used as the sid // for the NATS subscription, so use that for the lookup. 
sub := c.subs[cc.DeliverSubject] + + sess.mu.Lock() delete(sess.cons, sid) + sess.mu.Unlock() + sess.deleteConsumer(cc) if sub != nil { c.processUnsub(sub.sid) @@ -4583,12 +4706,12 @@ func (sess *mqttSession) processJSConsumer(c *client, subject, sid string, } // Check that the limit of subs' maxAckPending are not going over the limit - if after := sess.tmaxack + maxAckPending; after > mqttMaxAckTotalLimit { + if after := tmaxack + maxAckPending; after > mqttMaxAckTotalLimit { return nil, nil, fmt.Errorf("max_ack_pending for all consumers would be %v which exceeds the limit of %v", after, mqttMaxAckTotalLimit) } - durName := sess.idHash + "_" + nuid.Next() + durName := idHash + "_" + nuid.Next() ccr := &CreateConsumerRequest{ Stream: mqttStreamName, Config: ConsumerConfig{ @@ -4610,25 +4733,22 @@ func (sess *mqttSession) processJSConsumer(c *client, subject, sid string, return nil, nil, err } cc = &ccr.Config - sess.tmaxack += maxAckPending + tmaxack += maxAckPending } + // This is an internal subscription on subject like "$MQTT.sub." that is setup // for the JS durable's deliver subject. sess.mu.Lock() - sub, err := c.processSub([]byte(inbox), nil, []byte(inbox), mqttDeliverMsgCbQoS12, false) + sess.tmaxack = tmaxack + sub, err := sess.processSub(c, []byte(inbox), []byte(inbox), + isMQTTReservedSubscription(subject), qos, cc.Durable, mqttDeliverMsgCbQoS12, false) + sess.mu.Unlock() + if err != nil { - sess.mu.Unlock() sess.deleteConsumer(cc) c.Errorf("Unable to create subscription for JetStream consumer on %q: %v", subject, err) return nil, nil, err } - if sub.mqtt == nil { - sub.mqtt = &mqttSub{} - } - sub.mqtt.qos = qos - sub.mqtt.jsDur = cc.Durable - sub.mqtt.reserved = isMQTTReservedSubscription(subject) - sess.mu.Unlock() return cc, sub, nil } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/raft.go b/vendor/github.com/nats-io/nats-server/v2/server/raft.go index b0f30786be..87bd00f94e 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/raft.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/raft.go @@ -103,7 +103,6 @@ const ( Follower RaftState = iota Leader Candidate - Observer Closed ) @@ -115,8 +114,6 @@ func (state RaftState) String() string { return "CANDIDATE" case Leader: return "LEADER" - case Observer: - return "OBSERVER" case Closed: return "CLOSED" } @@ -125,108 +122,105 @@ func (state RaftState) String() string { type raft struct { sync.RWMutex - created time.Time - group string - sd string - id string - wal WAL - wtype StorageType - track bool - werr error + + created time.Time // Time that the group was created + accName string // Account name of the asset this raft group is for + group string // Raft group + sd string // Store directory + id string // Node ID + + wal WAL // WAL store (filestore or memstore) + wtype StorageType // WAL type, e.g. 
FileStorage or MemoryStorage + track bool // + werr error // Last write error + state atomic.Int32 // RaftState - hh hash.Hash64 - snapfile string - csz int - qn int - peers map[string]*lps - removed map[string]struct{} - acks map[uint64]map[string]struct{} - pae map[uint64]*appendEntry - elect *time.Timer - active time.Time - llqrt time.Time - lsut time.Time - term uint64 // The current vote term - pterm uint64 // Previous term from the last snapshot - pindex uint64 // Previous index from the last snapshot - commit uint64 // Sequence number of the most recent commit - applied uint64 // Sequence number of the most recently applied commit - leader string // The ID of the leader - vote string - hash string - s *Server - c *client - js *jetStream - dflag bool - pleader bool - observer bool - extSt extensionState + hh hash.Hash64 // Highwayhash, used for snapshots + snapfile string // Snapshot filename - // Subjects for votes, updates, replays. - psubj string - rpsubj string - vsubj string - vreply string - asubj string - areply string + csz int // Cluster size + qn int // Number of nodes needed to establish quorum + peers map[string]*lps // Other peers in the Raft group - sq *sendq - aesub *subscription + removed map[string]struct{} // Peers that were removed from the group + acks map[uint64]map[string]struct{} // Append entry responses/acks, map of entry index -> peer ID + pae map[uint64]*appendEntry // Pending append entries - // Are we doing a leadership transfer. - lxfer bool + elect *time.Timer // Election timer, normally accessed via electTimer + active time.Time // Last activity time, i.e. for heartbeats + llqrt time.Time // Last quorum lost time + lsut time.Time // Last scale-up time - // For holding term and vote and peerstate to be written. - wtv []byte - wps []byte - wtvch chan struct{} - wpsch chan struct{} + term uint64 // The current vote term + pterm uint64 // Previous term from the last snapshot + pindex uint64 // Previous index from the last snapshot + commit uint64 // Sequence number of the most recent commit + applied uint64 // Sequence number of the most recently applied commit - // For when we need to catch up as a follower. - catchup *catchupState + leader string // The ID of the leader + vote string // Our current vote state + lxfer bool // Are we doing a leadership transfer? - // For leader or server catching up a follower. - progress map[string]*ipQueue[uint64] + s *Server // Reference to top-level server + c *client // Internal client for subscriptions + js *jetStream // JetStream, if running, to see if we are out of resources - // For when we have paused our applyC. - paused bool - hcommit uint64 - pobserver bool + dflag bool // Debug flag + pleader bool // Has the group ever had a leader? + observer bool // The node is observing, i.e. 
not participating in voting + extSt extensionState // Extension state - // Queues and Channels - prop *ipQueue[*Entry] - entry *ipQueue[*appendEntry] - resp *ipQueue[*appendEntryResponse] - apply *ipQueue[*CommittedEntry] - reqs *ipQueue[*voteRequest] - votes *ipQueue[*voteResponse] - stepdown *ipQueue[string] - leadc chan bool - quit chan struct{} + psubj string // Proposals subject + rpsubj string // Remove peers subject + vsubj string // Vote requests subject + vreply string // Vote responses subject + asubj string // Append entries subject + areply string // Append entries responses subject - // Account name of the asset this raft group is for - accName string + sq *sendq // Send queue for outbound RPC messages + aesub *subscription // Subscription for handleAppendEntry callbacks - // Random generator, used to generate inboxes for instance - prand *rand.Rand + wtv []byte // Term and vote to be written + wps []byte // Peer state to be written + wtvch chan struct{} // Signals when a term vote was just written, to kick file writer + wpsch chan struct{} // Signals when a peer state was just written, to kick file writer + + catchup *catchupState // For when we need to catch up as a follower. + progress map[string]*ipQueue[uint64] // For leader or server catching up a follower. + + paused bool // Whether or not applies are paused + hcommit uint64 // The commit at the time that applies were paused + pobserver bool // Whether we were an observer at the time that applies were paused + + prop *ipQueue[*Entry] // Proposals + entry *ipQueue[*appendEntry] // Append entries + resp *ipQueue[*appendEntryResponse] // Append entries responses + apply *ipQueue[*CommittedEntry] // Apply queue (committed entries to be passed to upper layer) + reqs *ipQueue[*voteRequest] // Vote requests + votes *ipQueue[*voteResponse] // Vote responses + stepdown *ipQueue[string] // Stepdown requests + leadc chan bool // Leader changes + quit chan struct{} // Raft group shutdown + + prand *rand.Rand // Random generator, used to generate inboxes for instance } // cacthupState structure that holds our subscription, and catchup term and index // as well as starting term and index and how many updates we have seen. type catchupState struct { - sub *subscription - cterm uint64 - cindex uint64 - pterm uint64 - pindex uint64 - active time.Time + sub *subscription // Subscription that catchup messages will arrive on + cterm uint64 // Catchup term + cindex uint64 // Catchup index + pterm uint64 // Starting term + pindex uint64 // Starting index + active time.Time // Last time we received a message for this catchup } // lps holds peer state of last time and last index replicated. type lps struct { - ts int64 - li uint64 - kp bool // marks as known peer. + ts int64 // Last timestamp + li uint64 // Last index replicated + kp bool // Known peer } const ( @@ -237,7 +231,6 @@ const ( hbIntervalDefault = 1 * time.Second lostQuorumIntervalDefault = hbIntervalDefault * 10 // 10 seconds lostQuorumCheckIntervalDefault = hbIntervalDefault * 10 // 10 seconds - ) var ( @@ -383,7 +376,6 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe track: cfg.Track, csz: ps.clusterSize, qn: ps.clusterSize/2 + 1, - hash: hash, peers: make(map[string]*lps), acks: make(map[uint64]map[string]struct{}), pae: make(map[uint64]*appendEntry), @@ -413,14 +405,19 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe n.dflag = true } + // Set up the highwayhash for the snapshots. 
key := sha256.Sum256([]byte(n.group)) n.hh, _ = highwayhash.New64(key[:]) + // If we have a term and vote file (tav.idx on the filesystem) then read in + // what we think the term and vote was. It's possible these are out of date + // so a catch-up may be required. if term, vote, err := n.readTermVote(); err == nil && term > 0 { n.term = term n.vote = vote } + // Make sure that the snapshots directory exists. if err := os.MkdirAll(filepath.Join(n.sd, snapshotsDir), 0750); err != nil { return nil, fmt.Errorf("could not create snapshots directory - %v", err) } @@ -433,6 +430,9 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe n.setupLastSnapshot() } + // Retrieve the stream state from the WAL. If there are pending append + // entries that were committed but not applied before we last shut down, + // we will try to replay them and process them here. var state StreamState n.wal.FastState(&state) if state.Msgs > 0 { @@ -444,6 +444,8 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe } } + // It looks like there are entries we have committed but not applied + // yet. Replay them. for index := state.FirstSeq; index <= state.LastSeq; index++ { ae, err := n.loadEntry(index) if err != nil { @@ -469,15 +471,18 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe // Make sure to track ourselves. n.peers[n.id] = &lps{time.Now().UnixNano(), 0, true} + // Track known peers for _, peer := range ps.knownPeers { - // Set these to 0 to start but mark as known peer. if peer != n.id { + // Set these to 0 to start but mark as known peer. n.peers[peer] = &lps{0, 0, true} } } - // Setup our internal subscriptions. + // Setup our internal subscriptions for proposals, votes and append entries. + // If we fail to do this for some reason then this is fatal — we cannot + // continue setting up or the Raft node may be partially/totally isolated. if err := n.createInternalSubs(); err != nil { n.shutdown(true) return nil, err @@ -486,18 +491,26 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe n.debug("Started") // Check if we need to start in observer mode due to lame duck status. + // This will stop us from taking on the leader role when we're about to + // shutdown anyway. if s.isLameDuckMode() { n.debug("Will start in observer mode due to lame duck status") n.SetObserver(true) } + // Set the election timer and lost quorum timers to now, so that we + // won't accidentally trigger either state without knowing the real state + // of the other nodes. n.Lock() n.resetElectionTimeout() n.llqrt = time.Now() n.Unlock() + // Register the Raft group. labels["group"] = n.group s.registerRaftNode(n.group, n) + + // Start the goroutines for the Raft state machine and the file writer. s.startGoRoutine(n.run, labels) s.startGoRoutine(n.fileWriter) @@ -529,7 +542,8 @@ func (s *Server) clusterNameForNode(node string) string { return _EMPTY_ } -// Server will track all raft nodes. +// Registers the Raft node with the server, as it will track all of the Raft +// nodes. func (s *Server) registerRaftNode(group string, n RaftNode) { s.rnMu.Lock() defer s.rnMu.Unlock() @@ -539,6 +553,7 @@ func (s *Server) registerRaftNode(group string, n RaftNode) { s.raftNodes[group] = n } +// Unregisters the Raft node from the server, i.e. at shutdown. 
func (s *Server) unregisterRaftNode(group string) { s.rnMu.Lock() defer s.rnMu.Unlock() @@ -547,12 +562,15 @@ func (s *Server) unregisterRaftNode(group string) { } } +// Returns how many Raft nodes are running in this server instance. func (s *Server) numRaftNodes() int { s.rnMu.Lock() defer s.rnMu.Unlock() return len(s.raftNodes) } +// Finds the Raft node for a given Raft group, if any. If there is no Raft node +// running for this group then it can return nil. func (s *Server) lookupRaftNode(group string) RaftNode { s.rnMu.RLock() defer s.rnMu.RUnlock() @@ -563,6 +581,8 @@ func (s *Server) lookupRaftNode(group string) RaftNode { return n } +// Reloads the debug state for all running Raft nodes. This is necessary when +// the configuration has been reloaded and the debug log level has changed. func (s *Server) reloadDebugRaftNodes(debug bool) { if s == nil { return @@ -577,15 +597,19 @@ func (s *Server) reloadDebugRaftNodes(debug bool) { s.rnMu.RUnlock() } +// Requests that all Raft nodes on this server step down and place them into +// observer mode. This is called when the server is shutting down. func (s *Server) stepdownRaftNodes() { if s == nil { return } - var nodes []RaftNode s.rnMu.RLock() - if len(s.raftNodes) > 0 { - s.Debugf("Stepping down all leader raft nodes") + if len(s.raftNodes) == 0 { + s.rnMu.RUnlock() + return } + s.Debugf("Stepping down all leader raft nodes") + nodes := make([]RaftNode, 0, len(s.raftNodes)) for _, n := range s.raftNodes { nodes = append(nodes, n) } @@ -599,15 +623,20 @@ func (s *Server) stepdownRaftNodes() { } } +// Shuts down all Raft nodes on this server. This is called either when the +// server is either entering lame duck mode, shutting down or when JetStream +// has been disabled. func (s *Server) shutdownRaftNodes() { if s == nil { return } - var nodes []RaftNode s.rnMu.RLock() - if len(s.raftNodes) > 0 { - s.Debugf("Shutting down all raft nodes") + if len(s.raftNodes) == 0 { + s.rnMu.RUnlock() + return } + nodes := make([]RaftNode, 0, len(s.raftNodes)) + s.Debugf("Shutting down all raft nodes") for _, n := range s.raftNodes { nodes = append(nodes, n) } @@ -625,11 +654,12 @@ func (s *Server) transferRaftLeaders() bool { if s == nil { return false } - var nodes []RaftNode s.rnMu.RLock() - if len(s.raftNodes) > 0 { - s.Debugf("Transferring any raft leaders") + if len(s.raftNodes) == 0 { + s.rnMu.RUnlock() + return false } + nodes := make([]RaftNode, 0, len(s.raftNodes)) for _, n := range s.raftNodes { nodes = append(nodes, n) } @@ -668,7 +698,8 @@ func (n *raft) Propose(data []byte) error { return nil } -// ProposeDirect will propose entries directly. +// ProposeDirect will propose entries directly by skipping the Raft state +// machine and sending them straight to the wire instead. // This should only be called on the leader. func (n *raft) ProposeDirect(entries []*Entry) error { if state := n.State(); state != Leader { @@ -746,13 +777,16 @@ func (n *raft) ProposeRemovePeer(peer string) error { return werr } + // If we are the leader then we are responsible for processing the + // peer remove and then notifying the rest of the group that the + // peer was removed. if isLeader { prop.push(newEntry(EntryRemovePeer, []byte(peer))) n.doRemovePeerAsLeader(peer) return nil } - // Need to forward. + // Otherwise we need to forward the proposal to the leader. n.sendRPC(subj, _EMPTY_, []byte(peer)) return nil } @@ -779,7 +813,8 @@ func (n *raft) AdjustBootClusterSize(csz int) error { if csz < 2 { csz = 2 } - // Adjust. 
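// A sketch of the quorum arithmetic used by AdjustBootClusterSize above and
// AdjustClusterSize just below: the number of nodes needed for quorum is a
// simple majority of the cluster size.
func quorumNeeded(clusterSize int) int {
	return clusterSize/2 + 1
}

// quorumNeeded(3) == 2, quorumNeeded(5) == 3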
+ // Adjust the cluster size and the number of nodes needed to establish + // a quorum. n.csz = csz n.qn = n.csz/2 + 1 @@ -798,7 +833,8 @@ func (n *raft) AdjustClusterSize(csz int) error { csz = 2 } - // Adjust. + // Adjust the cluster size and the number of nodes needed to establish + // a quorum. n.csz = csz n.qn = n.csz/2 + 1 n.Unlock() @@ -808,7 +844,8 @@ func (n *raft) AdjustClusterSize(csz int) error { } // PauseApply will allow us to pause processing of append entries onto our -// external apply chan. +// external apply queue. In effect this means that the upper layer will no longer +// receive any new entries from the Raft group. func (n *raft) PauseApply() error { if n.State() == Leader { return errAlreadyLeader @@ -832,6 +869,8 @@ func (n *raft) PauseApply() error { return nil } +// ResumeApply will resume sending applies to the external apply queue. This +// means that we will start sending new entries to the upper layer. func (n *raft) ResumeApply() { n.Lock() defer n.Unlock() @@ -862,8 +901,9 @@ func (n *raft) ResumeApply() { } } -// Applied is to be called when the FSM has applied the committed entries. -// Applied will return the number of entries and an estimation of the +// Applied is a callback that must be be called by the upper layer when it +// has successfully applied the committed entries that it received from the +// apply queue. It will return the number of entries and an estimation of the // byte size that could be removed with a snapshot/compact. func (n *raft) Applied(index uint64) (entries uint64, bytes uint64) { n.Lock() @@ -878,6 +918,9 @@ func (n *raft) Applied(index uint64) (entries uint64, bytes uint64) { if index > n.applied { n.applied = index } + + // Calculate the number of entries and estimate the byte size that + // we can now remove with a compaction/snapshot. var state StreamState n.wal.FastState(&state) if n.applied > state.FirstSeq { @@ -945,11 +988,14 @@ func (n *raft) InstallSnapshot(data []byte) error { n.Lock() + // If a write error has occurred already then stop here. if werr := n.werr; werr != nil { n.Unlock() return werr } + // Check that a catchup isn't already taking place. If it is then we won't + // allow installing snapshots until it is done. if len(n.progress) > 0 { n.Unlock() return errCatchupsRunning @@ -967,10 +1013,13 @@ func (n *raft) InstallSnapshot(data []byte) error { var term uint64 if ae, _ := n.loadEntry(n.applied); ae != nil { + // Use the term from the most recently applied entry if possible. term = ae.term } else if ae, _ = n.loadFirstEntry(); ae != nil { + // Otherwise see if we can find the term from the first entry. term = ae.term } else { + // Last resort is to use the last pterm that we knew of. term = n.pterm } @@ -1013,6 +1062,9 @@ func (n *raft) InstallSnapshot(data []byte) error { return nil } +// NeedSnapshot returns true if it is necessary to try to install a snapshot, i.e. +// after we have finished recovering/replaying at startup, on a regular interval or +// as a part of cleaning up when shutting down. func (n *raft) NeedSnapshot() bool { n.RLock() defer n.RUnlock() @@ -1024,6 +1076,8 @@ const ( snapFileT = "snap.%d.%d" ) +// termAndIndexFromSnapfile tries to load the snapshot file and returns the term +// and index from that snapshot. 
func termAndIndexFromSnapFile(sn string) (term, index uint64, err error) { if sn == _EMPTY_ { return 0, 0, errBadSnapName @@ -1035,6 +1089,9 @@ func termAndIndexFromSnapFile(sn string) (term, index uint64, err error) { return term, index, nil } +// setupLastSnapshot is called at startup to try and recover the last snapshot from +// the disk if possible. We will try to recover the term, index and commit/applied +// indices and then notify the upper layer what we found. Compacts the WAL if needed. func (n *raft) setupLastSnapshot() { snapDir := filepath.Join(n.sd, snapshotsDir) psnaps, err := os.ReadDir(snapDir) @@ -1084,19 +1141,25 @@ func (n *raft) setupLastSnapshot() { n.snapfile = latest snap, err := n.loadLastSnapshot() if err != nil { + // We failed to recover the last snapshot for some reason, so we will + // assume it has been corrupted and will try to delete it. if n.snapfile != _EMPTY_ { os.Remove(n.snapfile) n.snapfile = _EMPTY_ } - } else { - n.pindex = snap.lastIndex - n.pterm = snap.lastTerm - n.commit = snap.lastIndex - n.applied = snap.lastIndex - n.apply.push(newCommittedEntry(n.commit, []*Entry{{EntrySnapshot, snap.data}})) - if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil { - n.setWriteErrLocked(err) - } + return + } + + // We successfully recovered the last snapshot from the disk. + // Recover state from the snapshot and then notify the upper layer. + // Compact the WAL when we're done if needed. + n.pindex = snap.lastIndex + n.pterm = snap.lastTerm + n.commit = snap.lastIndex + n.applied = snap.lastIndex + n.apply.push(newCommittedEntry(n.commit, []*Entry{{EntrySnapshot, snap.data}})) + if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil { + n.setWriteErrLocked(err) } } @@ -1162,14 +1225,18 @@ func (n *raft) Leader() bool { return n.State() == Leader } +// isCatchingUp returns true if a catchup is currently taking place. func (n *raft) isCatchingUp() bool { n.RLock() defer n.RUnlock() return n.catchup != nil } -// This function may block for up to ~10ms to check -// forward progress in some cases. +// isCurrent is called from the healthchecks and returns true if we believe +// that the upper layer is current with the Raft layer, i.e. that it has applied +// all of the commits that we have given it. +// Optionally we can also check whether or not we're making forward progress if we +// aren't current, in which case this function may block for up to ~10ms to find out. // Lock should be held. func (n *raft) isCurrent(includeForwardProgress bool) bool { // Check if we are closed. @@ -1220,7 +1287,7 @@ func (n *raft) isCurrent(includeForwardProgress bool) bool { // Otherwise, wait for a short period of time and see if we are making any // forward progress. if startDelta := n.commit - n.applied; startDelta > 0 { - for i := 0; i < 10; i++ { // 5ms, in 0.5ms increments + for i := 0; i < 10; i++ { // 10ms, in 1ms increments n.Unlock() time.Sleep(time.Millisecond) n.Lock() @@ -1480,9 +1547,16 @@ func (n *raft) UpdateKnownPeers(knownPeers []string) { } } +// ApplyQ returns the apply queue that new commits will be sent to for the +// upper layer to apply. func (n *raft) ApplyQ() *ipQueue[*CommittedEntry] { return n.apply } -func (n *raft) LeadChangeC() <-chan bool { return n.leadc } -func (n *raft) QuitC() <-chan struct{} { return n.quit } + +// LeadChangeC returns the leader change channel, notifying when the Raft +// leader role has moved. 
+func (n *raft) LeadChangeC() <-chan bool { return n.leadc } + +// QuitC returns the quit channel, notifying when the Raft group has shut down. +func (n *raft) QuitC() <-chan struct{} { return n.quit } func (n *raft) Created() time.Time { n.RLock() @@ -1679,6 +1753,10 @@ func (n *raft) resetElectWithLock(et time.Duration) { n.Unlock() } +// run is the top-level runner for the Raft state machine. Depending on the +// state of the node (leader, follower, candidate, observer), this will call +// through to other functions. It is expected that this function will run for +// the entire life of the Raft node once started. func (n *raft) run() { s := n.s defer s.grWG.Done() @@ -1715,9 +1793,6 @@ func (n *raft) run() { n.runAsCandidate() case Leader: n.runAsLeader() - case Observer: - // TODO(dlc) - fix. - n.runAsFollower() case Closed: return } @@ -1765,7 +1840,8 @@ func (n *raft) setObserver(isObserver bool, extSt extensionState) { n.extSt = extSt } -// Invoked when being notified that there is something in the entryc's queue +// processAppendEntries is called by the Raft state machine when there are +// new append entries to be committed and sent to the upper state machine. func (n *raft) processAppendEntries() { canProcess := true if n.isClosed() { @@ -1776,7 +1852,8 @@ func (n *raft) processAppendEntries() { n.debug("AppendEntry not processing inbound, no resources") canProcess = false } - // Always pop the entries, but check if we can process them. + // Always pop the entries, but check if we can process them. If we can't + // then the entries are effectively dropped. aes := n.entry.pop() if canProcess { for _, ae := range aes { @@ -1786,19 +1863,25 @@ func (n *raft) processAppendEntries() { n.entry.recycle(&aes) } +// runAsFollower is called by run and will block for as long as the node is +// running in the follower state. func (n *raft) runAsFollower() { - for n.State() == Follower { + for { elect := n.electTimer() select { case <-n.entry.ch: + // New append entries have arrived over the network. n.processAppendEntries() case <-n.s.quitCh: + // The server is shutting down. n.shutdown(false) return case <-n.quit: + // The Raft node is shutting down. return case <-elect.C: + // The election timer has fired so we think it's time to call an election. // If we are out of resources we just want to stay in this state for the moment. if n.outOfResources() { n.resetElectionTimeoutWithLock() @@ -1820,17 +1903,23 @@ func (n *raft) runAsFollower() { return } case <-n.votes.ch: + // We're receiving votes from the network, probably because we have only + // just stepped down and they were already in flight. Ignore them. n.debug("Ignoring old vote response, we have stepped down") n.votes.popOne() case <-n.resp.ch: - // Ignore + // We're receiving append entry responses from the network, probably because + // we have only just stepped down and they were already in flight. Ignore them. n.resp.popOne() case <-n.reqs.ch: + // We've just received a vote request from the network. // Because of drain() it is possible that we get nil from popOne(). if voteReq, ok := n.reqs.popOne(); ok { n.processVoteRequest(voteReq) } case <-n.stepdown.ch: + // We've received a stepdown request, start following the new leader if + // we can. if newLeader, ok := n.stepdown.popOne(); ok { n.switchToFollower(newLeader) return @@ -1839,26 +1928,29 @@ func (n *raft) runAsFollower() { } } -// Pool for CommitedEntry re-use. +// Pool for CommittedEntry re-use. 
var cePool = sync.Pool{ New: func() any { return &CommittedEntry{} }, } -// CommitEntry is handed back to the user to apply a commit to their FSM. +// CommittedEntry is handed back to the user to apply a commit to their upper layer. type CommittedEntry struct { Index uint64 Entries []*Entry } -// Create a new ComittedEntry. +// Create a new CommittedEntry. When the returned entry is no longer needed, it +// should be returned to the pool by calling ReturnToPool. func newCommittedEntry(index uint64, entries []*Entry) *CommittedEntry { ce := cePool.Get().(*CommittedEntry) ce.Index, ce.Entries = index, entries return ce } +// ReturnToPool returns the CommittedEntry to the pool, after which point it is +// no longer safe to reuse. func (ce *CommittedEntry) ReturnToPool() { if ce == nil { return @@ -1879,7 +1971,8 @@ var entryPool = sync.Pool{ }, } -// Helper to create new entries. +// Helper to create new entries. When the returned entry is no longer needed, it +// should be returned to the entryPool pool. func newEntry(t EntryType, data []byte) *Entry { entry := entryPool.Get().(*Entry) entry.Type, entry.Data = t, data @@ -1895,15 +1988,15 @@ var aePool = sync.Pool{ // appendEntry is the main struct that is used to sync raft peers. type appendEntry struct { - leader string - term uint64 - commit uint64 - pterm uint64 - pindex uint64 - entries []*Entry - // internal use only. - reply string - sub *subscription + leader string // The leader that this append entry came from. + term uint64 // The current term, as the leader understands it. + commit uint64 // The commit index, as the leader understands it. + pterm uint64 // The previous term, for checking consistency. + pindex uint64 // The previous commit index, for checking consistency. + entries []*Entry // Entries to process. + // Below fields are for internal use only: + reply string // Reply subject to respond to once committed. + sub *subscription // The subscription that the append entry came in on. buf []byte } @@ -2166,12 +2259,15 @@ func (n *raft) runAsLeader() { // For forwarded proposals, both normal and remove peer proposals. fsub, err := n.subscribe(psubj, n.handleForwardedProposal) if err != nil { - n.debug("Error subscribing to forwarded proposals: %v", err) + n.warn("Error subscribing to forwarded proposals: %v", err) + n.stepdown.push(noLeader) return } rpsub, err := n.subscribe(rpsubj, n.handleForwardedRemovePeerProposal) if err != nil { - n.debug("Error subscribing to forwarded proposals: %v", err) + n.warn("Error subscribing to forwarded remove peer proposals: %v", err) + n.unsubscribe(fsub) + n.stepdown.push(noLeader) return } @@ -2537,7 +2633,7 @@ func (n *raft) loadEntry(index uint64) (*appendEntry, error) { return n.decodeAppendEntry(sm.msg, nil, _EMPTY_) } -// applyCommit will update our commit index and apply the entry to the apply chan. +// applyCommit will update our commit index and apply the entry to the apply queue. // lock should be held. func (n *raft) applyCommit(index uint64) error { if n.State() == Closed { @@ -2779,7 +2875,7 @@ func (n *raft) runAsCandidate() { // We vote for ourselves. votes := 1 - for n.State() == Candidate { + for { elect := n.electTimer() select { case <-n.entry.ch: @@ -2840,16 +2936,22 @@ func (n *raft) runAsCandidate() { } } -// handleAppendEntry handles an append entry from the wire. +// handleAppendEntry handles an append entry from the wire. This function +// is an internal callback from the "asubj" append entry subscription. 
func (n *raft) handleAppendEntry(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { msg = copyBytes(msg) if ae, err := n.decodeAppendEntry(msg, sub, reply); err == nil { + // Push to the new entry channel. From here one of the worker + // goroutines (runAsLeader, runAsFollower, runAsCandidate) will + // pick it up. n.entry.push(ae) } else { n.warn("AppendEntry failed to be placed on internal channel: corrupt entry") } } +// cancelCatchup will stop an in-flight catchup by unsubscribing from the +// catchup subscription. // Lock should be held. func (n *raft) cancelCatchup() { n.debug("Canceling catchup subscription since we are now up to date") @@ -2875,6 +2977,9 @@ func (n *raft) catchupStalled() bool { return false } +// createCatchup will create the state needed to track a catchup as it +// runs. It then creates a unique inbox for this catchup and subscribes +// to it. The remote side will stream entries to that subject. // Lock should be held. func (n *raft) createCatchup(ae *appendEntry) string { // Cleanup any old ones. @@ -2938,7 +3043,7 @@ func (n *raft) truncateWAL(term, index uint64) { n.term, n.pterm, n.pindex = term, term, index } -// Reset our WAL. +// Reset our WAL. This is equivalent to truncating all data from the log. // Lock should be held. func (n *raft) resetWAL() { n.truncateWAL(0, 0) @@ -2952,7 +3057,9 @@ func (n *raft) updateLeader(newLeader string) { } } -// processAppendEntry will process an appendEntry. +// processAppendEntry will process an appendEntry. This is called either +// during recovery or from processAppendEntries when there are new entries +// to be committed. func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { n.Lock() // Don't reset here if we have been asked to assume leader position. @@ -2991,7 +3098,9 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { return } - // If we received an append entry as a candidate we should convert to a follower. + // If we received an append entry as a candidate then it would appear that + // another node has taken on the leader role already, so we should convert + // to a follower of that node instead. if n.State() == Candidate { n.debug("Received append entry in candidate state from %q, converting to follower", ae.leader) if n.term < ae.term { @@ -3004,7 +3113,8 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { // Catching up state. catchingUp := n.catchup != nil - // Is this a new entry? + // Is this a new entry? New entries will be delivered on the append entry + // sub, rather than a catch-up sub. isNew := sub != nil && sub == n.aesub // Track leader directly @@ -3204,7 +3314,7 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { n.lxfer = true n.xferCampaign() } else if n.paused && !n.pobserver { - // Here we can become a leader but need to wait for resume of the apply channel. + // Here we can become a leader but need to wait for resume of the apply queue. n.lxfer = true } } else { @@ -3254,9 +3364,12 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { } } +// processPeerState is called when a peer state entry is received +// over the wire or when we're updating known peers. // Lock should be held. func (n *raft) processPeerState(ps *peerState) { - // Update our version of peers to that of the leader. + // Update our version of peers to that of the leader. Calculate + // the number of nodes needed to establish a quorum. 
n.csz = ps.clusterSize n.qn = n.csz/2 + 1 @@ -3274,15 +3387,19 @@ func (n *raft) processPeerState(ps *peerState) { n.writePeerState(ps) } -// Process a response. +// processAppendEntryResponse is called when we receive an append entry +// response from another node. They will send a confirmation to tell us +// whether they successfully committed the entry or not. func (n *raft) processAppendEntryResponse(ar *appendEntryResponse) { n.trackPeer(ar.peer) if ar.success { + // The remote node successfully committed the append entry. n.trackResponse(ar) arPool.Put(ar) } else if ar.term > n.term { - // False here and they have a higher term. + // The remote node didn't commit the append entry, it looks like + // they are on a newer term than we are. Step down. n.Lock() n.term = ar.term n.vote = noVote @@ -3293,6 +3410,8 @@ func (n *raft) processAppendEntryResponse(ar *appendEntryResponse) { n.Unlock() arPool.Put(ar) } else if ar.reply != _EMPTY_ { + // The remote node didn't commit the append entry and they are + // still on the same term, so let's try to catch them up. n.catchupFollower(ar) } } @@ -3308,7 +3427,8 @@ func (n *raft) buildAppendEntry(entries []*Entry) *appendEntry { return newAppendEntry(n.id, n.term, n.commit, n.pterm, n.pindex, entries) } -// Determine if we should store an entry. +// Determine if we should store an entry. This stops us from storing +// heartbeat messages. func (ae *appendEntry) shouldStore() bool { return ae != nil && len(ae.entries) > 0 } @@ -3621,6 +3741,7 @@ func (n *raft) fileWriter() { case <-n.quit: return case <-n.wtvch: + // We've been asked to write out the term-and-vote file. var buf [termVoteLen]byte n.RLock() copy(buf[0:], n.wtv) @@ -3633,6 +3754,7 @@ func (n *raft) fileWriter() { n.warn("Error writing term and vote file for %q: %v", n.group, err) } case <-n.wpsch: + // We've been asked to write out the peer state file. n.RLock() buf := copyBytes(n.wps) n.RUnlock() @@ -3656,7 +3778,7 @@ func (n *raft) writeTermVote() { copy(buf[8:], n.vote) b := buf[:8+len(n.vote)] - // If same as what we have we can ignore. + // If the term and vote hasn't changed then don't rewrite to disk. if bytes.Equal(n.wtv, b) { return } @@ -3734,7 +3856,8 @@ func (n *raft) processVoteRequest(vr *voteRequest) error { vresp := &voteResponse{n.term, n.id, false} defer n.debug("Sending a voteResponse %+v -> %q", vresp, vr.reply) - // Ignore if we are newer. + // Ignore if we are newer. This is important so that we don't accidentally process + // votes from a previous term if they were still in flight somewhere. 
if vr.term < n.term { n.Unlock() n.sendReply(vr.reply, vresp.encode()) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stream.go b/vendor/github.com/nats-io/nats-server/v2/server/stream.go index ea34bf55e7..118e8a95ed 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stream.go @@ -239,6 +239,7 @@ type stream struct { ddindex int ddtmr *time.Timer qch chan struct{} + mqch chan struct{} active bool ddloaded bool closed bool @@ -558,6 +559,7 @@ func (a *Account) addStreamWithAssignment(config *StreamConfig, fsConfig *FileSt msgs: newIPQueue[*inMsg](s, qpfx+"messages"), gets: newIPQueue[*directGetReq](s, qpfx+"direct gets"), qch: make(chan struct{}), + mqch: make(chan struct{}), uch: make(chan struct{}, 4), sch: make(chan struct{}, 1), } @@ -785,6 +787,15 @@ func (mset *stream) setStreamAssignment(sa *streamAssignment) { } } +func (mset *stream) monitorQuitC() <-chan struct{} { + if mset == nil { + return nil + } + mset.mu.RLock() + defer mset.mu.RUnlock() + return mset.mqch +} + func (mset *stream) updateC() <-chan struct{} { if mset == nil { return nil @@ -985,14 +996,6 @@ func (mset *stream) lastSeqAndCLFS() (uint64, uint64) { return mset.lseq, mset.getCLFS() } -func (mset *stream) clearCLFS() uint64 { - mset.clMu.Lock() - defer mset.clMu.Unlock() - clfs := mset.clfs - mset.clfs, mset.clseq = 0, 0 - return clfs -} - func (mset *stream) getCLFS() uint64 { mset.clMu.Lock() defer mset.clMu.Unlock() @@ -4077,6 +4080,7 @@ func (mset *stream) processInboundJetStreamMsg(_ *subscription, c *client, _ *Ac var ( errLastSeqMismatch = errors.New("last sequence mismatch") errMsgIdDuplicate = errors.New("msgid is duplicate") + errStreamClosed = errors.New("stream closed") ) // processJetStreamMsg is where we try to actually process the stream msg. @@ -4085,7 +4089,7 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, c, s, store := mset.client, mset.srv, mset.store if mset.closed || c == nil { mset.mu.Unlock() - return nil + return errStreamClosed } // Apply the input subject transform if any @@ -4415,7 +4419,6 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, // Make sure to take into account any message assignments that we had to skip (clfs). seq = lseq + 1 - clfs // Check for preAcks and the need to skip vs store. - if mset.hasAllPreAcks(seq, subject) { mset.clearAllPreAcks(seq) store.SkipMsg() @@ -4907,9 +4910,28 @@ func (mset *stream) stop(deleteFlag, advisory bool) error { accName := jsa.account.Name jsa.mu.Unlock() - // Clean up consumers. + // Mark as closed, kick monitor and collect consumers first. mset.mu.Lock() mset.closed = true + // Signal to the monitor loop. + // Can't use qch here. + if mset.mqch != nil { + close(mset.mqch) + mset.mqch = nil + } + + // Stop responding to sync requests. + mset.stopClusterSubs() + // Unsubscribe from direct stream. + mset.unsubscribeToStream(true) + + // Our info sub if we spun it up. + if mset.infoSub != nil { + mset.srv.sysUnsubscribe(mset.infoSub) + mset.infoSub = nil + } + + // Clean up consumers. var obs []*consumer for _, o := range mset.consumers { obs = append(obs, o) @@ -4930,21 +4952,6 @@ func (mset *stream) stop(deleteFlag, advisory bool) error { mset.cancelSourceConsumer(si.iname) } } - - // Cluster cleanup - var sa *streamAssignment - if n := mset.node; n != nil { - if deleteFlag { - n.Delete() - sa = mset.sa - } else { - if n.NeedSnapshot() { - // Attempt snapshot on clean exit. 
-				n.InstallSnapshot(mset.stateSnapshotLocked())
-			}
-			n.Stop()
-		}
-	}
 	mset.mu.Unlock()
 
 	isShuttingDown := js.isShuttingDown()
@@ -4961,17 +4968,6 @@ func (mset *stream) stop(deleteFlag, advisory bool) error {
 	}
 
 	mset.mu.Lock()
-	// Stop responding to sync requests.
-	mset.stopClusterSubs()
-	// Unsubscribe from direct stream.
-	mset.unsubscribeToStream(true)
-
-	// Our info sub if we spun it up.
-	if mset.infoSub != nil {
-		mset.srv.sysUnsubscribe(mset.infoSub)
-		mset.infoSub = nil
-	}
-
 	// Send stream delete advisory after the consumers.
 	if deleteFlag && advisory {
 		mset.sendDeleteAdvisoryLocked()
@@ -4983,11 +4979,17 @@ func (mset *stream) stop(deleteFlag, advisory bool) error {
 		mset.qch = nil
 	}
 
-	c := mset.client
-	mset.client = nil
-	if c == nil {
-		mset.mu.Unlock()
-		return nil
+	// Cluster cleanup
+	var sa *streamAssignment
+	if n := mset.node; n != nil {
+		if deleteFlag {
+			n.Delete()
+			sa = mset.sa
+		} else {
+			// Always attempt snapshot on clean exit.
+			n.InstallSnapshot(mset.stateSnapshotLocked())
+			n.Stop()
+		}
 	}
 
 	// Cleanup duplicate timer if running.
@@ -5013,6 +5015,8 @@ func (mset *stream) stop(deleteFlag, advisory bool) error {
 	// Snapshot store.
 	store := mset.store
+	c := mset.client
+	mset.client = nil
 
 	// Clustered cleanup.
 	mset.mu.Unlock()
@@ -5027,7 +5031,9 @@ func (mset *stream) stop(deleteFlag, advisory bool) error {
 		js.mu.Unlock()
 	}
 
-	c.closeConnection(ClientClosed)
+	if c != nil {
+		c.closeConnection(ClientClosed)
+	}
 
 	if sysc != nil {
 		sysc.closeConnection(ClientClosed)
@@ -5042,9 +5048,12 @@ func (mset *stream) stop(deleteFlag, advisory bool) error {
 	js.releaseStreamResources(&mset.cfg)
 	// cleanup directories after the stream
 	accDir := filepath.Join(js.config.StoreDir, accName)
-	// no op if not empty
-	os.Remove(filepath.Join(accDir, streamsDir))
-	os.Remove(accDir)
+	// Do cleanup in a separate goroutine, similar to how fs will use purge here.
+	go func() {
+		// no op if not empty
+		os.Remove(filepath.Join(accDir, streamsDir))
+		os.Remove(accDir)
+	}()
 	} else if store != nil {
 		// Ignore errors.
store.Stop() diff --git a/vendor/github.com/nats-io/nats-server/v2/server/websocket.go b/vendor/github.com/nats-io/nats-server/v2/server/websocket.go index 014a1d72fc..0f45f91e33 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/websocket.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/websocket.go @@ -693,9 +693,9 @@ func (s *Server) wsUpgrade(w http.ResponseWriter, r *http.Request) (*wsUpgradeRe kind := CLIENT if r.URL != nil { ep := r.URL.EscapedPath() - if strings.HasPrefix(ep, leafNodeWSPath) { + if strings.HasSuffix(ep, leafNodeWSPath) { kind = LEAF - } else if strings.HasPrefix(ep, mqttWSPath) { + } else if strings.HasSuffix(ep, mqttWSPath) { kind = MQTT } } diff --git a/vendor/modules.txt b/vendor/modules.txt index 67fd16bb81..130c2d0c5d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1371,10 +1371,10 @@ github.com/mohae/deepcopy # github.com/mschoch/smat v0.2.0 ## explicit; go 1.13 github.com/mschoch/smat -# github.com/nats-io/jwt/v2 v2.5.2 +# github.com/nats-io/jwt/v2 v2.5.3 ## explicit; go 1.18 github.com/nats-io/jwt/v2 -# github.com/nats-io/nats-server/v2 v2.10.4 +# github.com/nats-io/nats-server/v2 v2.10.5 ## explicit; go 1.20 github.com/nats-io/nats-server/v2/conf github.com/nats-io/nats-server/v2/internal/ldap @@ -2079,8 +2079,8 @@ golang.org/x/text/transform golang.org/x/text/unicode/bidi golang.org/x/text/unicode/norm golang.org/x/text/width -# golang.org/x/time v0.3.0 -## explicit +# golang.org/x/time v0.4.0 +## explicit; go 1.18 golang.org/x/time/rate # golang.org/x/tools v0.14.0 ## explicit; go 1.18