diff --git a/go.mod b/go.mod index cd9ef3452c..95019e1b5f 100644 --- a/go.mod +++ b/go.mod @@ -60,7 +60,7 @@ require ( github.com/mitchellh/mapstructure v1.5.0 github.com/mna/pigeon v1.2.1 github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 - github.com/nats-io/nats-server/v2 v2.10.7 + github.com/nats-io/nats-server/v2 v2.10.9 github.com/nats-io/nats.go v1.31.0 github.com/oklog/run v1.1.0 github.com/olekukonko/tablewriter v0.0.5 @@ -277,7 +277,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mschoch/smat v0.2.0 // indirect github.com/nats-io/jwt/v2 v2.5.3 // indirect - github.com/nats-io/nkeys v0.4.6 // indirect + github.com/nats-io/nkeys v0.4.7 // indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/nxadm/tail v1.4.8 // indirect github.com/opencontainers/runtime-spec v1.1.0-rc.1 // indirect diff --git a/go.sum b/go.sum index 92dcb25c9d..50d6d455c9 100644 --- a/go.sum +++ b/go.sum @@ -1742,12 +1742,12 @@ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRW github.com/namedotcom/go v0.0.0-20180403034216-08470befbe04/go.mod h1:5sN+Lt1CaY4wsPvgQH/jsuJi4XO2ssZbdsIizr4CVC8= github.com/nats-io/jwt/v2 v2.5.3 h1:/9SWvzc6hTfamcgXJ3uYRpgj+QuY2aLNqRiqrKcrpEo= github.com/nats-io/jwt/v2 v2.5.3/go.mod h1:iysuPemFcc7p4IoYots3IuELSI4EDe9Y0bQMe+I3Bf4= -github.com/nats-io/nats-server/v2 v2.10.7 h1:f5VDy+GMu7JyuFA0Fef+6TfulfCs5nBTgq7MMkFJx5Y= -github.com/nats-io/nats-server/v2 v2.10.7/go.mod h1:V2JHOvPiPdtfDXTuEUsthUnCvSDeFrK4Xn9hRo6du7c= +github.com/nats-io/nats-server/v2 v2.10.9 h1:VEW43Zz+p+9lARtiPM9ctd6ckun+92ZT2T17HWtwiFI= +github.com/nats-io/nats-server/v2 v2.10.9/go.mod h1:oorGiV9j3BOLLO3ejQe+U7pfAGyPo+ppD7rpgNF6KTQ= github.com/nats-io/nats.go v1.31.0 h1:/WFBHEc/dOKBF6qf1TZhrdEfTmOZ5JzdJ+Y3m6Y/p7E= github.com/nats-io/nats.go v1.31.0/go.mod h1:di3Bm5MLsoB4Bx61CBTsxuarI36WbhAwOm8QrW39+i8= -github.com/nats-io/nkeys v0.4.6 h1:IzVe95ru2CT6ta874rt9saQRkWfe2nFj1NtvYSLqMzY= -github.com/nats-io/nkeys v0.4.6/go.mod 
h1:4DxZNzenSVd1cYQoAa8948QY3QDjrHfcfVADymtkpts= +github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI= +github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32/go.mod h1:9wM+0iRr9ahx58uYLpLIr5fm8diHn0JbqRycJi6w0Ms= diff --git a/vendor/github.com/nats-io/nats-server/v2/internal/fastrand/LICENSE b/vendor/github.com/nats-io/nats-server/v2/internal/fastrand/LICENSE new file mode 100644 index 0000000000..c12aa07bc1 --- /dev/null +++ b/vendor/github.com/nats-io/nats-server/v2/internal/fastrand/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The LevelDB-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/vendor/github.com/nats-io/nats-server/v2/internal/fastrand/fastrand.go b/vendor/github.com/nats-io/nats-server/v2/internal/fastrand/fastrand.go new file mode 100644 index 0000000000..4795381292 --- /dev/null +++ b/vendor/github.com/nats-io/nats-server/v2/internal/fastrand/fastrand.go @@ -0,0 +1,23 @@ +// Copyright 2020-2023 The LevelDB-Go, Pebble and NATS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package fastrand + +import _ "unsafe" // required by go:linkname + +// Uint32 returns a lock free uint32 value. +// +//go:linkname Uint32 runtime.fastrand +func Uint32() uint32 + +// Uint32n returns a lock free uint32 value in the interval [0, n). +// +//go:linkname Uint32n runtime.fastrandn +func Uint32n(n uint32) uint32 + +// Uint64 returns a lock free uint64 value. 
+func Uint64() uint64 { + v := uint64(Uint32()) + return v<<32 | uint64(Uint32()) +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/accounts.go b/vendor/github.com/nats-io/nats-server/v2/server/accounts.go index c26c09a108..b45cec6f91 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/accounts.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/accounts.go @@ -18,7 +18,6 @@ import ( "encoding/hex" "errors" "fmt" - "hash/maphash" "io" "io/fs" "math" @@ -34,6 +33,7 @@ import ( "time" "github.com/nats-io/jwt/v2" + "github.com/nats-io/nats-server/v2/internal/fastrand" "github.com/nats-io/nkeys" "github.com/nats-io/nuid" ) @@ -89,7 +89,6 @@ type Account struct { srv *Server // server this account is registered with (possibly nil) lds string // loop detection subject for leaf nodes siReply []byte // service reply prefix, will form wildcard subscription. - prand *rand.Rand eventIds *nuid.NUID eventIdsMu sync.Mutex defaultPerms *Permissions @@ -290,9 +289,6 @@ func (a *Account) shallowCopy(na *Account) { } } na.mappings = a.mappings - if len(na.mappings) > 0 && na.prand == nil { - na.prand = rand.New(rand.NewSource(time.Now().UnixNano())) - } na.hasMapped.Store(len(na.mappings) > 0) // JetStream @@ -605,11 +601,6 @@ func (a *Account) AddWeightedMappings(src string, dests ...*MapDest) error { a.mu.Lock() defer a.mu.Unlock() - // We use this for selecting between multiple weighted destinations. - if a.prand == nil { - a.prand = rand.New(rand.NewSource(time.Now().UnixNano())) - } - if !IsValidSubject(src) { return ErrBadSubject } @@ -735,6 +726,18 @@ func (a *Account) RemoveMapping(src string) bool { a.mappings[len(a.mappings)-1] = nil // gc a.mappings = a.mappings[:len(a.mappings)-1] a.hasMapped.Store(len(a.mappings) > 0) + // If we have connected leafnodes make sure to update. + if a.nleafs > 0 { + // Need to release because lock ordering is client -> account + a.mu.Unlock() + // Now grab the leaf list lock. 
We can hold client lock under this one. + a.lmu.RLock() + for _, lc := range a.lleafs { + lc.forceRemoveFromSmap(src) + } + a.lmu.RUnlock() + a.mu.Lock() + } return true } } @@ -756,7 +759,7 @@ func (a *Account) selectMappedSubject(dest string) (string, bool) { return dest, false } - a.mu.RLock() + a.mu.Lock() // In case we have to tokenize for subset matching. tsa := [32]string{} tts := tsa[:0] @@ -787,7 +790,7 @@ func (a *Account) selectMappedSubject(dest string) (string, bool) { } if m == nil { - a.mu.RUnlock() + a.mu.Unlock() return dest, false } @@ -809,7 +812,7 @@ func (a *Account) selectMappedSubject(dest string) (string, bool) { if len(dests) == 1 && dests[0].weight == 100 { d = dests[0] } else { - w := uint8(a.prand.Int31n(100)) + w := uint8(fastrand.Uint32n(100)) for _, rm := range dests { if w < rm.weight { d = rm @@ -826,7 +829,7 @@ func (a *Account) selectMappedSubject(dest string) (string, bool) { } } - a.mu.RUnlock() + a.mu.Unlock() return ndest, true } @@ -2193,7 +2196,7 @@ func (a *Account) processServiceImportResponse(sub *subscription, c *client, _ * // Lock should be held. func (a *Account) createRespWildcard() { var b = [baseServerLen]byte{'_', 'R', '_', '.'} - rn := a.prand.Uint64() + rn := fastrand.Uint64() for i, l := replyPrefixLen, rn; i < len(b); i++ { b[i] = digits[l%base] l /= base @@ -2212,12 +2215,7 @@ func isTrackedReply(reply []byte) bool { func (a *Account) newServiceReply(tracking bool) []byte { a.mu.Lock() s := a.srv - if a.prand == nil { - var h maphash.Hash - h.WriteString(nuid.Next()) - a.prand = rand.New(rand.NewSource(int64(h.Sum64()))) - } - rn := a.prand.Uint64() + rn := fastrand.Uint64() // Check if we need to create the reply here. 
var createdSiReply bool diff --git a/vendor/github.com/nats-io/nats-server/v2/server/auth.go b/vendor/github.com/nats-io/nats-server/v2/server/auth.go index 7a0f93e217..b37d245ec4 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/auth.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/auth.go @@ -1,4 +1,4 @@ -// Copyright 2012-2023 The NATS Authors +// Copyright 2012-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -743,13 +743,24 @@ func (s *Server) processClientOrLeafAuthentication(c *client, opts *Options) (au // Check if we have nkeys or users for client. hasNkeys := len(s.nkeys) > 0 hasUsers := len(s.users) > 0 - if hasNkeys && c.opts.Nkey != _EMPTY_ { - nkey, ok = s.nkeys[c.opts.Nkey] - if !ok || !c.connectionTypeAllowed(nkey.AllowedConnectionTypes) { - s.mu.Unlock() - return false + if hasNkeys { + if (c.kind == CLIENT || c.kind == LEAF) && noAuthUser != _EMPTY_ && + c.opts.Username == _EMPTY_ && c.opts.Password == _EMPTY_ && c.opts.Token == _EMPTY_ && c.opts.Nkey == _EMPTY_ { + if _, exists := s.nkeys[noAuthUser]; exists { + c.mu.Lock() + c.opts.Nkey = noAuthUser + c.mu.Unlock() + } } - } else if hasUsers { + if c.opts.Nkey != _EMPTY_ { + nkey, ok = s.nkeys[c.opts.Nkey] + if !ok || !c.connectionTypeAllowed(nkey.AllowedConnectionTypes) { + s.mu.Unlock() + return false + } + } + } + if hasUsers && nkey == nil { // Check if we are tls verify and are mapping users from the client_certificate. 
if tlsMap { authorized := checkClientTLSCertSubject(c, func(u string, certDN *ldap.DN, _ bool) (string, bool) { @@ -989,27 +1000,30 @@ func (s *Server) processClientOrLeafAuthentication(c *client, opts *Options) (au } if nkey != nil { - if c.opts.Sig == _EMPTY_ { - c.Debugf("Signature missing") - return false - } - sig, err := base64.RawURLEncoding.DecodeString(c.opts.Sig) - if err != nil { - // Allow fallback to normal base64. - sig, err = base64.StdEncoding.DecodeString(c.opts.Sig) - if err != nil { - c.Debugf("Signature not valid base64") + // If we did not match noAuthUser check signature which is required. + if nkey.Nkey != noAuthUser { + if c.opts.Sig == _EMPTY_ { + c.Debugf("Signature missing") + return false + } + sig, err := base64.RawURLEncoding.DecodeString(c.opts.Sig) + if err != nil { + // Allow fallback to normal base64. + sig, err = base64.StdEncoding.DecodeString(c.opts.Sig) + if err != nil { + c.Debugf("Signature not valid base64") + return false + } + } + pub, err := nkeys.FromPublicKey(c.opts.Nkey) + if err != nil { + c.Debugf("User nkey not valid: %v", err) + return false + } + if err := pub.Verify(c.nonce, sig); err != nil { + c.Debugf("Signature not verified") return false } - } - pub, err := nkeys.FromPublicKey(c.opts.Nkey) - if err != nil { - c.Debugf("User nkey not valid: %v", err) - return false - } - if err := pub.Verify(c.nonce, sig); err != nil { - c.Debugf("Signature not verified") - return false } if err := c.RegisterNkeyUser(nkey); err != nil { return false @@ -1308,6 +1322,33 @@ func (s *Server) isLeafNodeAuthorized(c *client) bool { // with that user (from the leafnode's authorization{} config). 
if opts.LeafNode.Username != _EMPTY_ { return isAuthorized(opts.LeafNode.Username, opts.LeafNode.Password, opts.LeafNode.Account) + } else if opts.LeafNode.Nkey != _EMPTY_ { + if c.opts.Nkey != opts.LeafNode.Nkey { + return false + } + if c.opts.Sig == _EMPTY_ { + c.Debugf("Signature missing") + return false + } + sig, err := base64.RawURLEncoding.DecodeString(c.opts.Sig) + if err != nil { + // Allow fallback to normal base64. + sig, err = base64.StdEncoding.DecodeString(c.opts.Sig) + if err != nil { + c.Debugf("Signature not valid base64") + return false + } + } + pub, err := nkeys.FromPublicKey(c.opts.Nkey) + if err != nil { + c.Debugf("User nkey not valid: %v", err) + return false + } + if err := pub.Verify(c.nonce, sig); err != nil { + c.Debugf("Signature not verified") + return false + } + return s.registerLeafWithAccount(c, opts.LeafNode.Account) } else if len(opts.LeafNode.Users) > 0 { if opts.LeafNode.TLSMap { var user *User @@ -1425,15 +1466,21 @@ func validateNoAuthUser(o *Options, noAuthUser string) error { if len(o.TrustedOperators) > 0 { return fmt.Errorf("no_auth_user not compatible with Trusted Operator") } - if o.Users == nil { - return fmt.Errorf(`no_auth_user: "%s" present, but users are not defined`, noAuthUser) + + if o.Nkeys == nil && o.Users == nil { + return fmt.Errorf(`no_auth_user: "%s" present, but users/nkeys are not defined`, noAuthUser) } for _, u := range o.Users { if u.Username == noAuthUser { return nil } } + for _, u := range o.Nkeys { + if u.Nkey == noAuthUser { + return nil + } + } return fmt.Errorf( - `no_auth_user: "%s" not present as user in authorization block or account configuration`, + `no_auth_user: "%s" not present as user or nkey in authorization block or account configuration`, noAuthUser) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/certidp/certidp.go b/vendor/github.com/nats-io/nats-server/v2/server/certidp/certidp.go index f7b660dffa..e066ed43f0 100644 --- 
a/vendor/github.com/nats-io/nats-server/v2/server/certidp/certidp.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/certidp/certidp.go @@ -52,11 +52,24 @@ var ( } ) +// GetStatusAssertionStr returns the corresponding string representation of the StatusAssertion. func GetStatusAssertionStr(sa int) string { - return StatusAssertionValToStr[StatusAssertionIntToVal[sa]] + // If the provided status assertion value is not found in the map (StatusAssertionIntToVal), + // the function defaults to "unknown" to avoid defaulting to "good," which is the default iota value + // for the ocsp.StatusAssertion enumeration (https://pkg.go.dev/golang.org/x/crypto/ocsp#pkg-constants). + // This ensures that we don't unintentionally default to "good" when there's no map entry. + v, ok := StatusAssertionIntToVal[sa] + if !ok { + // set unknown as fallback + v = ocsp.Unknown + } + + return StatusAssertionValToStr[v] } func (sa StatusAssertion) MarshalJSON() ([]byte, error) { + // This ensures that we don't unintentionally default to "good" when there's no map entry. + // (see more details in the GetStatusAssertionStr() comment) str, ok := StatusAssertionValToStr[sa] if !ok { // set unknown as fallback @@ -66,6 +79,8 @@ func (sa StatusAssertion) MarshalJSON() ([]byte, error) { } func (sa *StatusAssertion) UnmarshalJSON(in []byte) error { + // This ensures that we don't unintentionally default to "good" when there's no map entry. 
+ // (see more details in the GetStatusAssertionStr() comment) v, ok := StatusAssertionStrToVal[strings.ReplaceAll(string(in), "\"", "")] if !ok { // set unknown as fallback diff --git a/vendor/github.com/nats-io/nats-server/v2/server/client.go b/vendor/github.com/nats-io/nats-server/v2/server/client.go index 5fabe77c41..4ef424cbb0 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/client.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/client.go @@ -35,6 +35,7 @@ import ( "github.com/klauspost/compress/s2" "github.com/nats-io/jwt/v2" + "github.com/nats-io/nats-server/v2/internal/fastrand" ) // Type of client connection. @@ -442,8 +443,6 @@ type readCache struct { // to make sure to only send one message and properly scope to queues as needed. rts []routeTarget - prand *rand.Rand - // These are all temporary totals for an invocation of a read in readloop. msgs int32 bytes int32 @@ -4505,12 +4504,6 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, goto sendToRoutesOrLeafs } - // Check to see if we have our own rand yet. Global rand - // has contention with lots of clients, etc. - if c.in.prand == nil { - c.in.prand = rand.New(rand.NewSource(time.Now().UnixNano())) - } - // Process queue subs for i := 0; i < len(r.qsubs); i++ { qsubs := r.qsubs[i] @@ -4558,7 +4551,7 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, sindex := 0 lqs := len(qsubs) if lqs > 1 { - sindex = c.in.prand.Int() % lqs + sindex = int(fastrand.Uint32()) % lqs } // Find a subscription that is able to deliver this message starting at a random index. 
diff --git a/vendor/github.com/nats-io/nats-server/v2/server/const.go b/vendor/github.com/nats-io/nats-server/v2/server/const.go index 39cc294cb7..dea5c9e9b7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/const.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/const.go @@ -41,7 +41,7 @@ var ( const ( // VERSION is the current version for the server. - VERSION = "2.10.7" + VERSION = "2.10.9" // PROTO is the currently supported protocol. // 0 was the original diff --git a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go index f71184bcfa..02a8443e7c 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go @@ -1,4 +1,4 @@ -// Copyright 2019-2023 The NATS Authors +// Copyright 2019-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -128,30 +128,37 @@ const ( ) const ( - actionUpdateString = "update" - actionCreateString = "create" - actionCreateOrUpdateString = "" + actionUpdateJSONString = `"update"` + actionCreateJSONString = `"create"` + actionCreateOrUpdateJSONString = `""` +) + +var ( + actionUpdateJSONBytes = []byte(actionUpdateJSONString) + actionCreateJSONBytes = []byte(actionCreateJSONString) + actionCreateOrUpdateJSONBytes = []byte(actionCreateOrUpdateJSONString) ) func (a ConsumerAction) String() string { switch a { case ActionCreateOrUpdate: - return actionCreateOrUpdateString + return actionCreateOrUpdateJSONString case ActionCreate: - return actionCreateString + return actionCreateJSONString case ActionUpdate: - return actionUpdateString + return actionUpdateJSONString } - return actionCreateOrUpdateString + return actionCreateOrUpdateJSONString } + func (a ConsumerAction) MarshalJSON() ([]byte, error) { switch a { case ActionCreate: - return json.Marshal(actionCreateString) + return actionCreateJSONBytes, nil case ActionUpdate: - return json.Marshal(actionUpdateString) + return actionUpdateJSONBytes, nil case ActionCreateOrUpdate: - return json.Marshal(actionCreateOrUpdateString) + return actionCreateOrUpdateJSONBytes, nil default: return nil, fmt.Errorf("can not marshal %v", a) } @@ -159,11 +166,11 @@ func (a ConsumerAction) MarshalJSON() ([]byte, error) { func (a *ConsumerAction) UnmarshalJSON(data []byte) error { switch string(data) { - case jsonString("create"): + case actionCreateJSONString: *a = ActionCreate - case jsonString("update"): + case actionUpdateJSONString: *a = ActionUpdate - case jsonString(""): + case actionCreateOrUpdateJSONString: *a = ActionCreateOrUpdate default: return fmt.Errorf("unknown consumer action: %v", string(data)) @@ -249,9 +256,9 @@ const ( func (r ReplayPolicy) String() string { switch r { case ReplayInstant: - return "instant" + return replayInstantPolicyJSONString default: - return "original" + 
return replayOriginalPolicyJSONString } } @@ -386,12 +393,13 @@ type consumer struct { // A single subject filter. type subjectFilter struct { - subject string - nextSeq uint64 - currentSeq uint64 - pmsg *jsPubMsg - err error - hasWildcard bool + subject string + nextSeq uint64 + currentSeq uint64 + pmsg *jsPubMsg + err error + hasWildcard bool + tokenizedSubject []string } type subjectFilters []*subjectFilter @@ -699,16 +707,16 @@ func (mset *stream) addConsumer(config *ConsumerConfig) (*consumer, error) { } func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname string, ca *consumerAssignment, isRecovering bool, action ConsumerAction) (*consumer, error) { - mset.mu.RLock() - s, jsa, tierName, cfg, acc, closed := mset.srv, mset.jsa, mset.tier, mset.cfg, mset.acc, mset.closed - retention := cfg.Retention - mset.mu.RUnlock() - // Check if this stream has closed. - if closed { + if mset.closed.Load() { return nil, NewJSStreamInvalidError() } + mset.mu.RLock() + s, jsa, tierName, cfg, acc := mset.srv, mset.jsa, mset.tier, mset.cfg, mset.acc + retention := cfg.Retention + mset.mu.RUnlock() + // If we do not have the consumer currently assigned to us in cluster mode we will proceed but warn. // This can happen on startup with restored state where on meta replay we still do not have // the assignment. Running in single server mode this always returns true. 
@@ -936,8 +944,9 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri subjects := gatherSubjectFilters(o.cfg.FilterSubject, o.cfg.FilterSubjects) for _, filter := range subjects { sub := &subjectFilter{ - subject: filter, - hasWildcard: subjectHasWildcard(filter), + subject: filter, + hasWildcard: subjectHasWildcard(filter), + tokenizedSubject: tokenizeSubjectIntoSlice(nil, filter), } o.subjf = append(o.subjf, sub) } @@ -1858,8 +1867,9 @@ func (o *consumer) updateConfig(cfg *ConsumerConfig) error { newSubjf := make(subjectFilters, 0, len(newSubjects)) for _, newFilter := range newSubjects { fs := &subjectFilter{ - subject: newFilter, - hasWildcard: subjectHasWildcard(newFilter), + subject: newFilter, + hasWildcard: subjectHasWildcard(newFilter), + tokenizedSubject: tokenizeSubjectIntoSlice(nil, newFilter), } // If given subject was present, we will retain its fields values // so `getNextMgs` can take advantage of already buffered `pmsgs`. @@ -3347,7 +3357,7 @@ func (o *consumer) notifyDeliveryExceeded(sseq, dc uint64) { o.sendAdvisory(o.deliveryExcEventT, j) } -// Check to see if the candidate subject matches a filter if its present. +// Check if the candidate subject matches a filter if its present. // Lock should be held. func (o *consumer) isFilteredMatch(subj string) bool { // No filter is automatic match. @@ -3361,9 +3371,29 @@ func (o *consumer) isFilteredMatch(subj string) bool { } // It's quicker to first check for non-wildcard filters, then // iterate again to check for subset match. - // TODO(dlc) at speed might be better to just do a sublist with L2 and/or possibly L1. 
+ tsa := [32]string{} + tts := tokenizeSubjectIntoSlice(tsa[:0], subj) for _, filter := range o.subjf { - if subjectIsSubsetMatch(subj, filter.subject) { + if isSubsetMatchTokenized(tts, filter.tokenizedSubject) { + return true + } + } + return false +} + +// Check if the candidate filter subject is equal to or a subset match +// of one of the filter subjects. +// Lock should be held. +func (o *consumer) isEqualOrSubsetMatch(subj string) bool { + for _, filter := range o.subjf { + if !filter.hasWildcard && subj == filter.subject { + return true + } + } + tsa := [32]string{} + tts := tokenizeSubjectIntoSlice(tsa[:0], subj) + for _, filter := range o.subjf { + if isSubsetMatchTokenized(filter.tokenizedSubject, tts) { return true } } @@ -3945,8 +3975,10 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { } } else { if o.subjf != nil { + tsa := [32]string{} + tts := tokenizeSubjectIntoSlice(tsa[:0], pmsg.subj) for i, filter := range o.subjf { - if subjectIsSubsetMatch(pmsg.subj, filter.subject) { + if isSubsetMatchTokenized(tts, filter.tokenizedSubject) { o.subjf[i].currentSeq-- o.subjf[i].nextSeq-- break diff --git a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go index 573edb29d8..8b73d353de 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go @@ -1,4 +1,4 @@ -// Copyright 2019-2023 The NATS Authors +// Copyright 2019-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -290,7 +290,7 @@ const ( // Maximum size of a write buffer we may consider for re-use. 
maxBufReuse = 2 * 1024 * 1024 // default cache buffer expiration - defaultCacheBufferExpiration = 5 * time.Second + defaultCacheBufferExpiration = 2 * time.Second // default sync interval defaultSyncInterval = 2 * time.Minute // default idle timeout to close FDs. @@ -795,7 +795,7 @@ var blkPoolSmall sync.Pool // 2MB // Get a new msg block based on sz estimate. func getMsgBlockBuf(sz int) (buf []byte) { - var pb interface{} + var pb any if sz <= defaultSmallBlockSize { pb = blkPoolSmall.Get() } else if sz <= defaultMediumBlockSize { @@ -1481,7 +1481,6 @@ func (fs *fileStore) recoverFullState() (rerr error) { if _, err := os.Stat(pdir); err == nil { os.RemoveAll(pdir) } - // Grab our stream state file and load it in. fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) buf, err := os.ReadFile(fn) @@ -1590,6 +1589,13 @@ func (fs *fileStore) recoverFullState() (rerr error) { // We could reference the underlying buffer, but we could guess wrong if // number of blocks is large and subjects is low, since we would reference buf. subj := string(buf[bi : bi+lsubj]) + // We had a bug that could cause memory corruption in the PSIM that could have gotten stored to disk. + // Only would affect subjects, so do quick check. + if !isValidSubject(subj, true) { + os.Remove(fn) + fs.warn("Stream state corrupt subject detected") + return errCorruptState + } bi += lsubj psi := &psi{total: readU64(), fblk: uint32(readU64())} if psi.total > 1 { @@ -2024,6 +2030,7 @@ func (fs *fileStore) expireMsgsOnRecover() { } // If we are here we have to process the interior messages of this blk. + // This will load fss as well. if err := mb.loadMsgsWithLock(); err != nil { mb.mu.Unlock() break @@ -2033,7 +2040,6 @@ func (fs *fileStore) expireMsgsOnRecover() { var needNextFirst bool // Walk messages and remove if expired. 
- mb.ensurePerSubjectInfoLoaded() fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) for seq := fseq; seq <= lseq; seq++ { sm, err := mb.cacheLookup(seq, &smv) @@ -2196,13 +2202,8 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter} - if mb.cacheNotLoaded() { - if err := mb.loadMsgsWithLock(); err != nil { - return nil, false, err - } - if err := mb.ensurePerSubjectInfoLoaded(); err != nil { - return nil, false, err - } + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + return nil, false, err } // If we only have 1 subject currently and it matches our filter we can also set isAll. @@ -2250,6 +2251,15 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor return nil, false, ErrStoreMsgNotFound } + var didLoad bool + // Need messages loaded from here on out. + if mb.cacheNotLoaded() { + if err := mb.loadMsgsWithLock(); err != nil { + return nil, false, err + } + didLoad = true + } + if sm == nil { sm = new(StoreMsg) } @@ -2281,7 +2291,7 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor mb.llseq = llseq } - return nil, false, ErrStoreMsgNotFound + return nil, didLoad, ErrStoreMsgNotFound } // This will traverse a message block and generate the filtered pending. @@ -2533,7 +2543,7 @@ func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState { mb.mu.Lock() var shouldExpire bool - if mb.fss == nil { + if mb.fssNotLoaded() { // Make sure we have fss loaded. mb.loadMsgsWithLock() shouldExpire = true @@ -2581,8 +2591,7 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) } // Track starting for both block for the sseq and staring block that matches any subject. - var seqStart, subjStart int - + var seqStart int // See if we need to figure out starting block per sseq. 
if sseq > fs.state.FirstSeq { // This should not, but can return -1, so make sure we check to avoid panic below. @@ -2591,8 +2600,6 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) } } - var tsa, fsa [32]string - fts := tokenizeSubjectIntoSlice(fsa[:0], filter) isAll := filter == _EMPTY_ || filter == fwcs wc := subjectHasWildcard(filter) @@ -2602,13 +2609,16 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) } // If we are isAll and have no deleted we can do a simpler calculation. - if isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs { + if !lastPerSubject && isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs { if sseq == 0 { return fs.state.Msgs, validThrough } return fs.state.LastSeq - sseq + 1, validThrough } + var tsa, fsa [32]string + fts := tokenizeSubjectIntoSlice(fsa[:0], filter) + isMatch := func(subj string) bool { if isAll { return true @@ -2620,81 +2630,134 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) return isSubsetMatchTokenized(tts, fts) } + // Handle last by subject a bit differently. + // We will scan PSIM since we accurately track the last block we have seen the subject in. This + // allows us to only need to load at most one block now. + // For the last block, we need to track the subjects that we know are in that block, and track seen + // while in the block itself, but complexity there worth it. + if lastPerSubject { + // If we want all and our start sequence is equal or less than first return number of subjects. + if isAll && sseq <= fs.state.FirstSeq { + return uint64(len(fs.psim)), validThrough + } + // If we are here we need to scan. We are going to scan the PSIM looking for lblks that are >= seqStart. + // This will build up a list of all subjects from the selected block onward. 
+ lbm := make(map[string]bool) + mb := fs.blks[seqStart] + bi := mb.index + + for subj, psi := range fs.psim { + // If the select blk start is greater than entry's last blk skip. + if bi > psi.lblk { + continue + } + if isMatch(subj) { + total++ + // We will track the subjects that are an exact match to the last block. + // This is needed for last block processing. + if psi.lblk == bi { + lbm[subj] = true + } + } + } + // Now check if we need to inspect the seqStart block. + // Grab write lock in case we need to load in msgs. + mb.mu.Lock() + var shouldExpire bool + // We need to walk this block to correct accounting from above. + if sseq > mb.first.seq { + // Track the ones we add back in case more than one. + seen := make(map[string]bool) + // We need to discount the total by subjects seen before sseq, but also add them right back in if they are >= sseq for this blk. + // This only should be subjects we know have the last blk in this block. + if mb.cacheNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } + var smv StoreMsg + for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { + sm, _ := mb.cacheLookup(seq, &smv) + if sm == nil || sm.subj == _EMPTY_ || !lbm[sm.subj] { + continue + } + if isMatch(sm.subj) { + // If less than sseq adjust off of total as long as this subject matched the last block. + if seq < sseq { + if !seen[sm.subj] { + total-- + seen[sm.subj] = true + } + } else if seen[sm.subj] { + // This is equal or more than sseq, so add back in. + total++ + // Make sure to not process anymore. + delete(seen, sm.subj) + } + } + } + } + // If we loaded the block try to force expire. + if shouldExpire { + mb.tryForceExpireCacheLocked() + } + mb.mu.Unlock() + return total, validThrough + } + // If we would need to scan more from the beginning, revert back to calculating directly here. // TODO(dlc) - Redo properly with sublists etc for subject-based filtering. 
- if lastPerSubject || seqStart >= (len(fs.blks)/2) { - // If we need to track seen for last per subject. - var seen map[string]bool - if lastPerSubject { - seen = make(map[string]bool) - } - + if seqStart >= (len(fs.blks) / 2) { for i := seqStart; i < len(fs.blks); i++ { + var shouldExpire bool mb := fs.blks[i] + // Hold write lock in case we need to load cache. mb.mu.Lock() var t uint64 if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) { - if lastPerSubject { - mb.ensurePerSubjectInfoLoaded() - for subj := range mb.fss { - if !seen[subj] { - total++ - seen[subj] = true - } - } - } else { - total += mb.msgs - } + total += mb.msgs mb.mu.Unlock() continue } - // If we are here we need to at least scan the subject fss. // Make sure we have fss loaded. - mb.ensurePerSubjectInfoLoaded() + if mb.fssNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } var havePartial bool for subj, ss := range mb.fss { - if !seen[subj] && isMatch(subj) { - if lastPerSubject { - // Can't have a partials with last by subject. - if sseq <= ss.Last { - t++ - seen[subj] = true - } - } else { - if ss.firstNeedsUpdate { - mb.recalculateFirstForSubj(subj, ss.First, ss) - } - if sseq <= ss.First { - t += ss.Msgs - } else if sseq <= ss.Last { - // We matched but its a partial. - havePartial = true - break - } + if isMatch(subj) { + if ss.firstNeedsUpdate { + mb.recalculateFirstForSubj(subj, ss.First, ss) + } + if sseq <= ss.First { + t += ss.Msgs + } else if sseq <= ss.Last { + // We matched but its a partial. + havePartial = true + break } } } // See if we need to scan msgs here. if havePartial { - // Clear on partial. - t = 0 - // If we load the cache for a linear scan we want to expire that cache upon exit. - var shouldExpire bool + // Make sure we have the cache loaded. if mb.cacheNotLoaded() { mb.loadMsgsWithLock() shouldExpire = true } + // Clear on partial. 
+ t = 0 var smv StoreMsg for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { - if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && (isAll || isMatch(sm.subj)) { + if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && isMatch(sm.subj) { t++ } } - // If we loaded this block for this operation go ahead and expire it here. - if shouldExpire { - mb.tryForceExpireCacheLocked() - } + } + // If we loaded this block for this operation go ahead and expire it here. + if shouldExpire { + mb.tryForceExpireCacheLocked() } mb.mu.Unlock() total += t @@ -2702,24 +2765,15 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) return total, validThrough } - // If we are here its better to calculate totals from psim and adjust downward by scanning less blocks. + // If we are here it's better to calculate totals from psim and adjust downward by scanning less blocks. // TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead. start := uint32(math.MaxUint32) for subj, psi := range fs.psim { if isMatch(subj) { - if lastPerSubject { - total++ - // Keep track of start index for this subject. - // Use last block in this case. - if psi.lblk < start { - start = psi.lblk - } - } else { - total += psi.total - // Keep track of start index for this subject. - if psi.fblk < start { - start = psi.fblk - } + total += psi.total + // Keep track of start index for this subject. + if psi.fblk < start { + start = psi.fblk } } } @@ -2729,11 +2783,8 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) } // If we are here we need to calculate partials for the first blocks. - subjStart = int(start) - firstSubjBlk := fs.bim[uint32(subjStart)] + firstSubjBlk := fs.bim[start] var firstSubjBlkFound bool - var smv StoreMsg - // Adjust in case not found. 
if firstSubjBlk == nil { firstSubjBlkFound = true @@ -2741,62 +2792,53 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) // Track how many we need to adjust against the total. var adjust uint64 - for i := 0; i <= seqStart; i++ { mb := fs.blks[i] - // We can skip blks if we know they are below the first one that has any subject matches. if !firstSubjBlkFound { - if mb == firstSubjBlk { - firstSubjBlkFound = true - } else { + if firstSubjBlkFound = (mb == firstSubjBlk); !firstSubjBlkFound { continue } } - // We need to scan this block. var shouldExpire bool mb.mu.Lock() // Check if we should include all of this block in adjusting. If so work with metadata. if sseq > atomic.LoadUint64(&mb.last.seq) { - if isAll && !lastPerSubject { + if isAll { adjust += mb.msgs } else { // We need to adjust for all matches in this block. - // We will scan fss state vs messages themselves. - // Make sure we have fss loaded. - mb.ensurePerSubjectInfoLoaded() + // Make sure we have fss loaded. This loads whole block now. + if mb.cacheNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } for subj, ss := range mb.fss { if isMatch(subj) { - if lastPerSubject { - adjust++ - } else { - adjust += ss.Msgs - } + adjust += ss.Msgs } } } } else { // This is the last block. We need to scan per message here. if mb.cacheNotLoaded() { - if err := mb.loadMsgsWithLock(); err != nil { - mb.mu.Unlock() - return 0, 0 - } + mb.loadMsgsWithLock() shouldExpire = true } - var last = atomic.LoadUint64(&mb.last.seq) if sseq < last { last = sseq } + // We need to walk all messages in this block + var smv StoreMsg for seq := atomic.LoadUint64(&mb.first.seq); seq < last; seq++ { sm, _ := mb.cacheLookup(seq, &smv) - if sm == nil { + if sm == nil || sm.subj == _EMPTY_ { continue } // Check if it matches our filter. 
- if isMatch(sm.subj) && sm.seq < sseq { + if sm.seq < sseq && isMatch(sm.subj) { adjust++ } } @@ -3180,8 +3222,27 @@ func (fs *fileStore) SkipMsg() uint64 { fs.mu.Lock() defer fs.mu.Unlock() + // Grab our current last message block. + mb := fs.lmb + if mb == nil || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize { + if mb != nil && fs.fcfg.Compression != NoCompression { + // We've now reached the end of this message block, if we want + // to compress blocks then now's the time to do it. + go mb.recompressOnDiskIfNeeded() + } + var err error + if mb, err = fs.newMsgBlockForWrite(); err != nil { + return 0 + } + } + // Grab time and last seq. now, seq := time.Now().UTC(), fs.state.LastSeq+1 + + // Write skip msg. + mb.skipMsg(seq, now) + + // Update fs state. fs.state.LastSeq, fs.state.LastTime = seq, now if fs.state.Msgs == 0 { fs.state.FirstSeq, fs.state.FirstTime = seq, now @@ -3189,11 +3250,84 @@ func (fs *fileStore) SkipMsg() uint64 { if seq == fs.state.FirstSeq { fs.state.FirstSeq, fs.state.FirstTime = seq+1, now } - fs.lmb.skipMsg(seq, now) + // Mark as dirty for stream state. + fs.dirty++ return seq } +// Skip multiple msgs. We will determine if we can fit into current lmb or we need to create a new block. +func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error { + fs.mu.Lock() + defer fs.mu.Unlock() + + // Check sequence matches our last sequence. + if seq != fs.state.LastSeq+1 { + if seq > 0 { + return ErrSequenceMismatch + } + seq = fs.state.LastSeq + 1 + } + + // Limit number of dmap entries + const maxDeletes = 64 * 1024 + mb := fs.lmb + + numDeletes := int(num) + if mb != nil { + numDeletes += mb.dmap.Size() + } + if mb == nil || numDeletes > maxDeletes && mb.msgs > 0 || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize { + if mb != nil && fs.fcfg.Compression != NoCompression { + // We've now reached the end of this message block, if we want + // to compress blocks then now's the time to do it. 
+ go mb.recompressOnDiskIfNeeded() + } + var err error + if mb, err = fs.newMsgBlockForWrite(); err != nil { + return err + } + } + + // Insert into dmap all entries and place last as marker. + now := time.Now().UTC() + nowts := now.UnixNano() + lseq := seq + num - 1 + + mb.mu.Lock() + var needsRecord bool + // If we are empty update meta directly. + if mb.msgs == 0 { + atomic.StoreUint64(&mb.last.seq, lseq) + mb.last.ts = nowts + atomic.StoreUint64(&mb.first.seq, lseq+1) + mb.first.ts = nowts + } else { + needsRecord = true + for ; seq <= lseq; seq++ { + mb.dmap.Insert(seq) + } + } + mb.mu.Unlock() + + // Write out our placeholder. + if needsRecord { + mb.writeMsgRecord(emptyRecordLen, lseq|ebit, _EMPTY_, nil, nil, nowts, true) + } + + // Now update FS accounting. + // Update fs state. + fs.state.LastSeq, fs.state.LastTime = lseq, now + if fs.state.Msgs == 0 { + fs.state.FirstSeq, fs.state.FirstTime = lseq+1, now + } + + // Mark as dirty for stream state. + fs.dirty++ + + return nil +} + // Lock should be held. func (fs *fileStore) rebuildFirst() { if len(fs.blks) == 0 { @@ -3238,9 +3372,14 @@ func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) { continue } mb.mu.Lock() - if err := mb.ensurePerSubjectInfoLoaded(); err != nil { - mb.mu.Unlock() - return 0, err + var shouldExpire bool + if mb.fssNotLoaded() { + // Make sure we have fss loaded. + if err := mb.loadMsgsWithLock(); err != nil { + mb.mu.Unlock() + return 0, err + } + shouldExpire = true } if ss := mb.fss[subj]; ss != nil { // Adjust first if it was not where we thought it should be. @@ -3255,6 +3394,11 @@ func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) { mb.mu.Unlock() return ss.First, nil } + // If we did not find it and we loaded this msgBlock try to expire as long as not the last. + if shouldExpire { + // Expire this cache before moving on. 
+ mb.tryForceExpireCacheLocked() + } mb.mu.Unlock() } return 0, nil @@ -3361,9 +3505,6 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { } // Grab the ss entry for this subject in case sparse. mb.mu.Lock() - if mb.cacheNotLoaded() { - mb.loadMsgsWithLock() - } mb.ensurePerSubjectInfoLoaded() ss := mb.fss[subj] if ss != nil && ss.firstNeedsUpdate { @@ -4154,7 +4295,7 @@ func (fs *fileStore) selectNextFirst() { // Lock should be held. func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) { if td == 0 { - td = mb.cexp + td = mb.cexp + 100*time.Millisecond } if mb.ctmr == nil { mb.ctmr = time.AfterFunc(td, mb.expireCache) @@ -4249,7 +4390,7 @@ func (mb *msgBlock) tryExpireWriteCache() []byte { // Lock should be held. func (mb *msgBlock) expireCacheLocked() { - if mb.cache == nil && mb.fss == nil { + if mb.cache == nil { if mb.ctmr != nil { mb.ctmr.Stop() mb.ctmr = nil @@ -4289,9 +4430,8 @@ func (mb *msgBlock) expireCacheLocked() { mb.cache.wp = 0 } - // Check if we can clear out our fss and idx unless under force expire. - // We used to hold onto the idx longer but removes need buf now so no point. - mb.fss = nil + // Check if we can clear out our idx unless under force expire. + // fss we keep longer and expire under sync timer checks. mb.clearCache() } @@ -4680,9 +4820,7 @@ func (mb *msgBlock) recompressOnDiskIfNeeded() error { // Wait for disk I/O slots to become available. This prevents us from // running away with system resources. <-dios - defer func() { - dios <- struct{}{} - }() + defer func() { dios <- struct{}{} }() alg := mb.fs.fcfg.Compression mb.mu.Lock() @@ -4854,6 +4992,7 @@ func (fs *fileStore) syncBlocks() { } blks := append([]*msgBlock(nil), fs.blks...) 
lmb := fs.lmb + syncInterval := fs.fcfg.SyncInterval fs.mu.RUnlock() var markDirty bool @@ -4868,6 +5007,12 @@ func (fs *fileStore) syncBlocks() { if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle { mb.dirtyCloseWithRemove(false) } + // Check our fss subject metadata. + // If we have no activity within sync interval remove. + if mb.fssLoaded() && mb.sinceLastActivity() > syncInterval { + mb.fss = nil + } + // Check if we should compact here as well. // Do not compact last mb. var needsCompact bool @@ -5013,24 +5158,30 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { mbFirstSeq := atomic.LoadUint64(&mb.first.seq) + // Capture beginning size of dmap. + dms := uint64(mb.dmap.Size()) + idxSz := atomic.LoadUint64(&mb.last.seq) - mbFirstSeq + 1 + if mb.cache == nil { // Approximation, may adjust below. fseq = mbFirstSeq - idx = make([]uint32, 0, mb.msgs) + idx = make([]uint32, 0, idxSz) mb.cache = &cache{} } else { fseq = mb.cache.fseq idx = mb.cache.idx if len(idx) == 0 { - idx = make([]uint32, 0, mb.msgs) + idx = make([]uint32, 0, idxSz) } index = uint32(len(mb.cache.buf)) buf = append(mb.cache.buf, buf...) } // Create FSS if we should track. - if !mb.noTrack { + var popFss bool + if mb.fssNotLoaded() { mb.fss = make(map[string]*SimpleState) + popFss = true } lbuf := uint32(len(buf)) @@ -5070,7 +5221,9 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { // If we have a hole fill it. for dseq := mbFirstSeq + uint64(len(idx)); dseq < seq; dseq++ { idx = append(idx, dbit) - mb.dmap.Insert(dseq) + if dms == 0 { + mb.dmap.Insert(dseq) + } } } // Add to our index. @@ -5082,12 +5235,12 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error { } // Make sure our dmap has this entry if it was erased. - if erased { + if erased && dms == 0 { mb.dmap.Insert(seq) } // Handle FSS inline here. 
- if slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) { + if popFss && slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) { bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)] if ss := mb.fss[string(bsubj)]; ss != nil { ss.Msgs++ @@ -5277,17 +5430,41 @@ func (mb *msgBlock) cacheNotLoaded() bool { return !mb.cacheAlreadyLoaded() } +// Report if our fss is not loaded. +// Lock should be held. +func (mb *msgBlock) fssNotLoaded() bool { + return mb.fss == nil && !mb.noTrack +} + +// Report if we have our fss loaded. +// Lock should be held. +func (mb *msgBlock) fssLoaded() bool { + return mb.fss != nil +} + // Used to load in the block contents. // Lock should be held and all conditionals satisfied prior. func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) { - f, err := os.Open(mb.mfn) - if err != nil { - if os.IsNotExist(err) { - err = errNoBlkData + var f *os.File + // Re-use if we have mfd open. + if mb.mfd != nil { + f = mb.mfd + if n, err := f.Seek(0, 0); n != 0 || err != nil { + f = nil + mb.closeFDsLockedNoCheck() } - return nil, err } - defer f.Close() + if f == nil { + var err error + f, err = os.Open(mb.mfn) + if err != nil { + if os.IsNotExist(err) { + err = errNoBlkData + } + return nil, err + } + defer f.Close() + } var sz int if info, err := f.Stat(); err == nil { @@ -5430,16 +5607,34 @@ func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) mb.mu.Lock() defer mb.mu.Unlock() + fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) + if seq < fseq || seq > lseq { + return nil, false, ErrStoreMsgNotFound + } + + // See if we can short circuit if we already know msg deleted. + if mb.dmap.Exists(seq) { + // Update for scanning like cacheLookup would have. 
+ llseq := mb.llseq + if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 { + mb.llseq = seq + } + expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1) + return nil, expireOk, errDeletedMsg + } + if mb.cacheNotLoaded() { if err := mb.loadMsgsWithLock(); err != nil { return nil, false, err } } + llseq := mb.llseq + fsm, err := mb.cacheLookup(seq, sm) if err != nil { return nil, false, err } - expireOk := seq == atomic.LoadUint64(&mb.last.seq) && mb.llseq == seq + expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1) return fsm, expireOk, err } @@ -5481,6 +5676,13 @@ func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { return nil, ErrStoreMsgNotFound } + // The llseq signals us when we can expire a cache at the end of a linear scan. + // We want to only update when we know the last reads (multiple consumers) are sequential. + // We want to account for forwards and backwards linear scans. + if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 { + mb.llseq = seq + } + // If we have a delete map check it. if mb.dmap.Exists(seq) { mb.llts = time.Now().UnixNano() @@ -5503,11 +5705,6 @@ func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { // Update cache activity. mb.llts = time.Now().UnixNano() - // The llseq signals us when we can expire a cache at the end of a linear scan. - // We want to only update when we know the last reads (multiple consumers) are sequential. - if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 { - mb.llseq = seq - } li := int(bi) - mb.cache.off if li >= len(mb.cache.buf) { @@ -5576,7 +5773,7 @@ func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) { seq = fs.state.FirstSeq } // Make sure to snapshot here. 
- mb, lmb, lseq := fs.selectMsgBlock(seq), fs.lmb, fs.state.LastSeq + mb, lseq := fs.selectMsgBlock(seq), fs.state.LastSeq fs.mu.RUnlock() if mb == nil { @@ -5594,7 +5791,7 @@ func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) { // We detected a linear scan and access to the last message. // If we are not the last message block we can try to expire the cache. - if mb != lmb && expireOk { + if expireOk { mb.tryForceExpireCache() } @@ -5767,12 +5964,14 @@ func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *Store for i := bi; i < len(fs.blks); i++ { mb := fs.blks[i] if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil { - if expireOk && mb != fs.lmb { + if expireOk { mb.tryForceExpireCache() } return sm, sm.seq, nil } else if err != ErrStoreMsgNotFound { return nil, 0, err + } else if expireOk { + mb.tryForceExpireCache() } } } @@ -5885,6 +6084,21 @@ func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 { return uint64(emptyRecordLen + slen + 4 + maxPayload) } +// Determine time since any last activity, read/load, write or remove. +func (mb *msgBlock) sinceLastActivity() time.Duration { + if mb.closed { + return 0 + } + last := mb.lwts + if mb.lrts > last { + last = mb.lrts + } + if mb.llts > last { + last = mb.llts + } + return time.Since(time.Unix(0, last).UTC()) +} + // Determine time since last write or remove of a message. // Read lock should be held. func (mb *msgBlock) sinceLastWriteActivity() time.Duration { @@ -6098,24 +6312,29 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint for i := 0; i < len(fs.blks); i++ { mb := fs.blks[i] mb.mu.Lock() - if err := mb.ensurePerSubjectInfoLoaded(); err != nil { - mb.mu.Unlock() - continue - } + + // If we do not have our fss, try to expire the cache if we have no items in this block. 
+ shouldExpire := mb.fssNotLoaded() + t, f, l := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq)) if t == 0 { + // Expire if we were responsible for loading. + if shouldExpire { + // Expire this cache before moving on. + mb.tryForceExpireCacheLocked() + } mb.mu.Unlock() continue } - var shouldExpire bool + if sequence > 1 && sequence <= l { + l = sequence - 1 + } + if mb.cacheNotLoaded() { mb.loadMsgsWithLock() shouldExpire = true } - if sequence > 1 && sequence <= l { - l = sequence - 1 - } for seq := f; seq <= l; seq++ { if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) { @@ -6908,6 +7127,7 @@ func (mb *msgBlock) close(sync bool) { mb.ctmr = nil } + // Clear fss. mb.fss = nil // Close cache @@ -8835,5 +9055,4 @@ func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) { output = append(output, checksum...) return output, reader.Close() - } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go index cc23fd4b22..6b333878ab 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go @@ -71,11 +71,13 @@ type JetStreamAccountLimits struct { } type JetStreamTier struct { - Memory uint64 `json:"memory"` - Store uint64 `json:"storage"` - Streams int `json:"streams"` - Consumers int `json:"consumers"` - Limits JetStreamAccountLimits `json:"limits"` + Memory uint64 `json:"memory"` + Store uint64 `json:"storage"` + ReservedMemory uint64 `json:"reserved_memory"` + ReservedStore uint64 `json:"reserved_storage"` + Streams int `json:"streams"` + Consumers int `json:"consumers"` + Limits JetStreamAccountLimits `json:"limits"` } // JetStreamAccountStats returns current statistics about the account's JetStream usage. 
@@ -1580,6 +1582,40 @@ func diffCheckedLimits(a, b map[string]JetStreamAccountLimits) map[string]JetStr return diff } +// Return reserved bytes for memory and store for this account on this server. +// Lock should be held. +func (jsa *jsAccount) reservedStorage(tier string) (mem, store uint64) { + for _, mset := range jsa.streams { + cfg := &mset.cfg + if tier == _EMPTY_ || tier == tierName(cfg) && cfg.MaxBytes > 0 { + switch cfg.Storage { + case FileStorage: + store += uint64(cfg.MaxBytes) + case MemoryStorage: + mem += uint64(cfg.MaxBytes) + } + } + } + return mem, store +} + +// Return reserved bytes for memory and store for this account in clustered mode. +// js lock should be held. +func reservedStorage(sas map[string]*streamAssignment, tier string) (mem, store uint64) { + for _, sa := range sas { + cfg := sa.Config + if tier == _EMPTY_ || tier == tierName(cfg) && cfg.MaxBytes > 0 { + switch cfg.Storage { + case FileStorage: + store += uint64(cfg.MaxBytes) + case MemoryStorage: + mem += uint64(cfg.MaxBytes) + } + } + } + return mem, store +} + // JetStreamUsage reports on JetStream usage and limits for an account. 
func (a *Account) JetStreamUsage() JetStreamAccountStats { a.mu.RLock() @@ -1591,6 +1627,8 @@ func (a *Account) JetStreamUsage() JetStreamAccountStats { if jsa != nil { js := jsa.js js.mu.RLock() + cc := js.cluster + singleServer := cc == nil jsa.mu.RLock() jsa.usageMu.RLock() stats.Memory, stats.Store = jsa.storageTotals() @@ -1599,6 +1637,11 @@ func (a *Account) JetStreamUsage() JetStreamAccountStats { Total: jsa.apiTotal, Errors: jsa.apiErrors, } + if singleServer { + stats.ReservedMemory, stats.ReservedStore = jsa.reservedStorage(_EMPTY_) + } else { + stats.ReservedMemory, stats.ReservedStore = reservedStorage(cc.streams[aname], _EMPTY_) + } l, defaultTier := jsa.limits[_EMPTY_] if defaultTier { stats.Limits = l @@ -1611,27 +1654,42 @@ func (a *Account) JetStreamUsage() JetStreamAccountStats { // In case this shows an empty stream, that tier will be added when iterating over streams skipped++ } else { - stats.Tiers[t] = JetStreamTier{ + tier := JetStreamTier{ Memory: uint64(total.total.mem), Store: uint64(total.total.store), Limits: jsa.limits[t], } + if singleServer { + tier.ReservedMemory, tier.ReservedStore = jsa.reservedStorage(t) + } else { + tier.ReservedMemory, tier.ReservedStore = reservedStorage(cc.streams[aname], t) + } + stats.Tiers[t] = tier } } if len(accJsLimits) != len(jsa.usage)-skipped { // insert unused limits for t, lim := range accJsLimits { if _, ok := stats.Tiers[t]; !ok { - stats.Tiers[t] = JetStreamTier{Limits: lim} + tier := JetStreamTier{Limits: lim} + if singleServer { + tier.ReservedMemory, tier.ReservedStore = jsa.reservedStorage(t) + } else { + tier.ReservedMemory, tier.ReservedStore = reservedStorage(cc.streams[aname], t) + } + stats.Tiers[t] = tier } } } } jsa.usageMu.RUnlock() - if cc := jsa.js.cluster; cc != nil { + + // Clustered + if cc := js.cluster; cc != nil { sas := cc.streams[aname] if defaultTier { stats.Streams = len(sas) + stats.ReservedMemory, stats.ReservedStore = reservedStorage(sas, _EMPTY_) } for _, sa := range 
sas { stats.Consumers += len(sa.consumers) @@ -2086,7 +2144,7 @@ func (jsa *jsAccount) storageTotals() (uint64, uint64) { return mem, store } -func (jsa *jsAccount) limitsExceeded(storeType StorageType, tierName string) (bool, *ApiError) { +func (jsa *jsAccount) limitsExceeded(storeType StorageType, tierName string, replicas int) (bool, *ApiError) { jsa.usageMu.RLock() defer jsa.usageMu.RUnlock() @@ -2099,20 +2157,25 @@ func (jsa *jsAccount) limitsExceeded(storeType StorageType, tierName string) (bo // Imply totals of 0 return false, nil } + r := int64(replicas) + if r < 1 || tierName == _EMPTY_ { + r = 1 + } + // Since tiers are flat we need to scale limit up by replicas when checking. if storeType == MemoryStorage { totalMem := inUse.total.mem - if selectedLimits.MemoryMaxStreamBytes > 0 && totalMem > selectedLimits.MemoryMaxStreamBytes { + if selectedLimits.MemoryMaxStreamBytes > 0 && totalMem > selectedLimits.MemoryMaxStreamBytes*r { return true, nil } - if selectedLimits.MaxMemory >= 0 && totalMem > selectedLimits.MaxMemory { + if selectedLimits.MaxMemory >= 0 && totalMem > selectedLimits.MaxMemory*r { return true, nil } } else { totalStore := inUse.total.store - if selectedLimits.StoreMaxStreamBytes > 0 && totalStore > selectedLimits.StoreMaxStreamBytes { + if selectedLimits.StoreMaxStreamBytes > 0 && totalStore > selectedLimits.StoreMaxStreamBytes*r { return true, nil } - if selectedLimits.MaxStore >= 0 && totalStore > selectedLimits.MaxStore { + if selectedLimits.MaxStore >= 0 && totalStore > selectedLimits.MaxStore*r { return true, nil } } @@ -2141,28 +2204,22 @@ func (js *jetStream) checkLimits(selected *JetStreamAccountLimits, config *Strea } // stream limit is checked separately on stream create only! // Check storage, memory or disk. 
- return js.checkBytesLimits(selected, config.MaxBytes, config.Storage, config.Replicas, checkServer, currentRes, maxBytesOffset) + return js.checkBytesLimits(selected, config.MaxBytes, config.Storage, checkServer, currentRes, maxBytesOffset) } // Check if additional bytes will exceed our account limits and optionally the server itself. -// This should account for replicas. // Read Lock should be held. -func (js *jetStream) checkBytesLimits(selectedLimits *JetStreamAccountLimits, addBytes int64, storage StorageType, replicas int, checkServer bool, currentRes, maxBytesOffset int64) error { - if replicas < 1 { - replicas = 1 - } +func (js *jetStream) checkBytesLimits(selectedLimits *JetStreamAccountLimits, addBytes int64, storage StorageType, checkServer bool, currentRes, maxBytesOffset int64) error { if addBytes < 0 { addBytes = 1 } - totalBytes := (addBytes * int64(replicas)) + maxBytesOffset + totalBytes := addBytes + maxBytesOffset switch storage { case MemoryStorage: // Account limits defined. - if selectedLimits.MaxMemory >= 0 { - if currentRes+totalBytes > selectedLimits.MaxMemory { - return NewJSMemoryResourcesExceededError() - } + if selectedLimits.MaxMemory >= 0 && currentRes+totalBytes > selectedLimits.MaxMemory { + return NewJSMemoryResourcesExceededError() } // Check if this server can handle request. if checkServer && js.memReserved+addBytes > js.config.MaxMemory { @@ -2170,10 +2227,8 @@ func (js *jetStream) checkBytesLimits(selectedLimits *JetStreamAccountLimits, ad } case FileStorage: // Account limits defined. - if selectedLimits.MaxStore >= 0 { - if currentRes+totalBytes > selectedLimits.MaxStore { - return NewJSStorageResourcesExceededError() - } + if selectedLimits.MaxStore >= 0 && currentRes+totalBytes > selectedLimits.MaxStore { + return NewJSStorageResourcesExceededError() } // Check if this server can handle request. 
if checkServer && js.storeReserved+addBytes > js.config.MaxStore { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go index c4163c853c..76c1c2a7ba 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go @@ -21,6 +21,7 @@ import ( "math/rand" "os" "path/filepath" + "runtime" "sort" "strconv" "strings" @@ -217,6 +218,9 @@ const ( JSApiServerStreamCancelMove = "$JS.API.ACCOUNT.STREAM.CANCEL_MOVE.*.*" JSApiServerStreamCancelMoveT = "$JS.API.ACCOUNT.STREAM.CANCEL_MOVE.%s.%s" + // The prefix for system level account API. + jsAPIAccountPre = "$JS.API.ACCOUNT." + // jsAckT is the template for the ack message stream coming back from a consumer // when they ACK/NAK, etc a message. jsAckT = "$JS.ACK.%s.%s" @@ -346,6 +350,8 @@ type ApiResponse struct { Error *ApiError `json:"error,omitempty"` } +const JSApiSystemResponseType = "io.nats.jetstream.api.v1.system_response" + // When passing back to the clients generalize store failures. var ( errStreamStoreFailed = errors.New("error creating store for stream") @@ -738,26 +744,59 @@ type jsAPIRoutedReq struct { } func (js *jetStream) apiDispatch(sub *subscription, c *client, acc *Account, subject, reply string, rmsg []byte) { + // Ignore system level directives meta stepdown and peer remove requests here. + if subject == JSApiLeaderStepDown || + subject == JSApiRemoveServer || + strings.HasPrefix(subject, jsAPIAccountPre) { + return + } // No lock needed, those are immutable. s, rr := js.srv, js.apiSubs.Match(subject) - hdr, _ := c.msgParts(rmsg) + hdr, msg := c.msgParts(rmsg) if len(getHeader(ClientInfoHdr, hdr)) == 0 { // Check if this is the system account. We will let these through for the account info only. 
- if s.SystemAccount() != acc || subject != JSApiAccountInfo { + sacc := s.SystemAccount() + if sacc != acc { + return + } + if subject != JSApiAccountInfo { + // Only respond from the initial server entry to the NATS system. + if c.kind == CLIENT || c.kind == LEAF { + var resp = ApiResponse{ + Type: JSApiSystemResponseType, + Error: NewJSNotEnabledForAccountError(), + } + s.sendAPIErrResponse(nil, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + } return } } - // Shortcircuit. + // Short circuit for no interest. if len(rr.psubs)+len(rr.qsubs) == 0 { + if (c.kind == CLIENT || c.kind == LEAF) && acc != s.SystemAccount() { + ci, acc, _, _, _ := s.getRequestInfo(c, rmsg) + var resp = ApiResponse{ + Type: JSApiSystemResponseType, + Error: NewJSBadRequestError(), + } + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + } return } // We should only have psubs and only 1 per result. - // FIXME(dlc) - Should we respond here with NoResponders or error? if len(rr.psubs) != 1 { s.Warnf("Malformed JetStream API Request: [%s] %q", subject, rmsg) + if c.kind == CLIENT || c.kind == LEAF { + ci, acc, _, _, _ := s.getRequestInfo(c, rmsg) + var resp = ApiResponse{ + Type: JSApiSystemResponseType, + Error: NewJSBadRequestError(), + } + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + } return } jsub := rr.psubs[0] @@ -774,20 +813,30 @@ func (js *jetStream) apiDispatch(sub *subscription, c *client, acc *Account, sub // If we are here we have received this request over a non-client connection. // We need to make sure not to block. We will send the request to a long-lived - // go routine. + // pool of go routines. + + // Increment inflight. Do this before queueing. + atomic.AddInt64(&js.apiInflight, 1) // Copy the state. Note the JSAPI only uses the hdr index to piece apart the // header from the msg body. No other references are needed. 
- s.jsAPIRoutedReqs.push(&jsAPIRoutedReq{jsub, sub, acc, subject, reply, copyBytes(rmsg), c.pa}) + // Check pending and warn if getting backed up. + const warnThresh = 32 + pending := s.jsAPIRoutedReqs.push(&jsAPIRoutedReq{jsub, sub, acc, subject, reply, copyBytes(rmsg), c.pa}) + if pending > warnThresh { + s.RateLimitWarnf("JetStream request queue has high pending count: %d", pending) + } } func (s *Server) processJSAPIRoutedRequests() { defer s.grWG.Done() - s.mu.Lock() + s.mu.RLock() queue := s.jsAPIRoutedReqs client := &client{srv: s, kind: JETSTREAM} - s.mu.Unlock() + s.mu.RUnlock() + + js := s.getJetStream() for { select { @@ -800,6 +849,7 @@ func (s *Server) processJSAPIRoutedRequests() { if dur := time.Since(start); dur >= readLoopReportThreshold { s.Warnf("Internal subscription on %q took too long: %v", r.subject, dur) } + atomic.AddInt64(&js.apiInflight, -1) } queue.recycle(&reqs) case <-s.quitCh: @@ -816,8 +866,16 @@ func (s *Server) setJetStreamExportSubs() error { // Start the go routine that will process API requests received by the // subscription below when they are coming from routes, etc.. + const maxProcs = 16 + mp := runtime.GOMAXPROCS(0) + // Cap at 16 max for now on larger core setups. + if mp > maxProcs { + mp = maxProcs + } s.jsAPIRoutedReqs = newIPQueue[*jsAPIRoutedReq](s, "Routed JS API Requests") - s.startGoRoutine(s.processJSAPIRoutedRequests) + for i := 0; i < mp; i++ { + s.startGoRoutine(s.processJSAPIRoutedRequests) + } // This is the catch all now for all JetStream API calls. if _, err := s.sysSubscribe(jsAllAPI, js.apiDispatch); err != nil { @@ -3659,7 +3717,7 @@ func (s *Server) streamSnapshot(ci *ClientInfo, acc *Account, mset *stream, sr * // We will place sequence number and size of chunk sent in the reply. 
ackSubj := fmt.Sprintf(jsSnapshotAckT, mset.name(), nuid.Next()) - ackSub, _ := mset.subscribeInternalUnlocked(ackSubj+".>", func(_ *subscription, _ *client, _ *Account, subject, _ string, _ []byte) { + ackSub, _ := mset.subscribeInternal(ackSubj+".>", func(_ *subscription, _ *client, _ *Account, subject, _ string, _ []byte) { cs, _ := strconv.Atoi(tokenAt(subject, 6)) // This is very crude and simple, but ok for now. // This only matters when sending multiple chunks. @@ -3670,7 +3728,7 @@ func (s *Server) streamSnapshot(ci *ClientInfo, acc *Account, mset *stream, sr * } } }) - defer mset.unsubscribeUnlocked(ackSub) + defer mset.unsubscribe(ackSub) // TODO(dlc) - Add in NATS-Chunked-Sequence header diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go index 5354e974b3..770b9957e7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go @@ -103,6 +103,8 @@ const ( removePendingRequest // For sending compressed streams, either through RAFT or catchup. compressedStreamMsgOp + // For sending deleted gaps on catchups for replicas. + deleteRangeOp ) // raftGroups are controlled by the metagroup controller. @@ -338,15 +340,16 @@ func (s *Server) JetStreamSnapshotStream(account, stream string) error { return err } - mset.mu.RLock() - if !mset.node.Leader() { - mset.mu.RUnlock() - return NewJSNotEnabledForAccountError() + // Hold lock when installing snapshot. 
+ mset.mu.Lock() + if mset.node == nil { + mset.mu.Unlock() + return nil } - n := mset.node - mset.mu.RUnlock() + err = mset.node.InstallSnapshot(mset.stateSnapshotLocked()) + mset.mu.Unlock() - return n.InstallSnapshot(mset.stateSnapshot()) + return err } func (s *Server) JetStreamClusterPeers() []string { @@ -489,24 +492,24 @@ func (js *jetStream) restartStream(acc *Account, csa *streamAssignment) { // isStreamHealthy will determine if the stream is up to date or very close. // For R1 it will make sure the stream is present on this server. func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool { - js.mu.Lock() + js.mu.RLock() s, cc := js.srv, js.cluster if cc == nil { // Non-clustered mode - js.mu.Unlock() + js.mu.RUnlock() return true } // Pull the group out. rg := sa.Group if rg == nil { - js.mu.Unlock() + js.mu.RUnlock() return false } streamName := sa.Config.Name node := rg.node - js.mu.Unlock() + js.mu.RUnlock() // First lookup stream and make sure its there. mset, err := acc.lookupStream(streamName) @@ -515,6 +518,11 @@ func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool { return false } + // If we are catching up return false. + if mset.isCatchingUp() { + return false + } + if node == nil || node.Healthy() { // Check if we are processing a snapshot and are catching up. if !mset.isCatchingUp() { @@ -1238,7 +1246,8 @@ func (js *jetStream) monitorCluster() { lt := time.NewTicker(leaderCheckInterval) defer lt.Stop() - const healthCheckInterval = 2 * time.Minute + // Check the general health once an hour. 
+ const healthCheckInterval = 1 * time.Hour ht := time.NewTicker(healthCheckInterval) defer ht.Stop() @@ -1454,18 +1463,22 @@ func (js *jetStream) clusterStreamConfig(accName, streamName string) (StreamConf } func (js *jetStream) metaSnapshot() []byte { - var streams []writeableStreamAssignment - js.mu.RLock() cc := js.cluster + nsa := 0 + for _, asa := range cc.streams { + nsa += len(asa) + } + streams := make([]writeableStreamAssignment, 0, nsa) for _, asa := range cc.streams { for _, sa := range asa { wsa := writeableStreamAssignment{ - Client: sa.Client, - Created: sa.Created, - Config: sa.Config, - Group: sa.Group, - Sync: sa.Sync, + Client: sa.Client, + Created: sa.Created, + Config: sa.Config, + Group: sa.Group, + Sync: sa.Sync, + Consumers: make([]*consumerAssignment, 0, len(sa.consumers)), } for _, ca := range sa.consumers { wsa.Consumers = append(wsa.Consumers, ca) @@ -1934,6 +1947,9 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bo delete(ru.removeStreams, key) } else { js.processUpdateStreamAssignment(sa) + // Since an update can be lowering replica count, we want upper layer to treat + // similar to a removal and snapshot to collapse old entries. + didRemoveStream = true } default: panic(fmt.Sprintf("JetStream Cluster Unknown meta entry op type: %v", entryOp(buf[0]))) @@ -2192,9 +2208,13 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps var lastState SimpleState var lastSnapTime time.Time + // Don't allow the upper layer to install snapshots until we have + // fully recovered from disk. + isRecovering := true + // Should only to be called from leader. 
doSnapshot := func() { - if mset == nil || isRestore || time.Since(lastSnapTime) < minSnapDelta { + if mset == nil || isRecovering || isRestore || time.Since(lastSnapTime) < minSnapDelta { return } @@ -2221,7 +2241,6 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps // We will establish a restoreDoneCh no matter what. Will never be triggered unless // we replace with the restore chan. restoreDoneCh := make(<-chan error) - isRecovering := true // For migration tracking. var mmt *time.Ticker @@ -2671,7 +2690,7 @@ func (mset *stream) isMigrating() bool { func (mset *stream) resetClusteredState(err error) bool { mset.mu.RLock() s, js, jsa, sa, acc, node := mset.srv, mset.js, mset.jsa, mset.sa, mset.acc, mset.node - stype, isLeader, tierName := mset.cfg.Storage, mset.isLeader(), mset.tier + stype, isLeader, tierName, replicas := mset.cfg.Storage, mset.isLeader(), mset.tier, mset.cfg.Replicas mset.mu.RUnlock() // Stepdown regardless if we are the leader here. @@ -2687,12 +2706,12 @@ func (mset *stream) resetClusteredState(err error) bool { // Server if js.limitsExceeded(stype) { - s.Debugf("Will not reset stream, server resources exceeded") + s.Warnf("Will not reset stream, server resources exceeded") return false } // Account - if exceeded, _ := jsa.limitsExceeded(stype, tierName); exceeded { + if exceeded, _ := jsa.limitsExceeded(stype, tierName, replicas); exceeded { s.Warnf("stream '%s > %s' errored, account resources exceeded", acc, mset.name()) return false } @@ -3603,20 +3622,23 @@ func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignme } } mset.setStreamAssignment(sa) - if err = mset.updateWithAdvisory(sa.Config, false); err != nil { - s.Warnf("JetStream cluster error updating stream %q for account %q: %v", sa.Config.Name, acc.Name, err) - if osa != nil { - // Process the raft group and make sure it's running if needed. 
- js.createRaftGroup(acc.GetName(), osa.Group, storage, pprofLabels{ - "type": "stream", - "account": mset.accName(), - "stream": mset.name(), - }) - mset.setStreamAssignment(osa) - } - if rg.node != nil { - rg.node.Delete() - rg.node = nil + // Check if our config has really been updated. + if !reflect.DeepEqual(mset.config(), sa.Config) { + if err = mset.updateWithAdvisory(sa.Config, false); err != nil { + s.Warnf("JetStream cluster error updating stream %q for account %q: %v", sa.Config.Name, acc.Name, err) + if osa != nil { + // Process the raft group and make sure it's running if needed. + js.createRaftGroup(acc.GetName(), osa.Group, storage, pprofLabels{ + "type": "stream", + "account": mset.accName(), + "stream": mset.name(), + }) + mset.setStreamAssignment(osa) + } + if rg.node != nil { + rg.node.Delete() + rg.node = nil + } } } } else if err == NewJSStreamNotFoundError() { @@ -4544,9 +4566,13 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { var lastSnap []byte var lastSnapTime time.Time + // Don't allow the upper layer to install snapshots until we have + // fully recovered from disk. + recovering := true + doSnapshot := func(force bool) { // Bail if trying too fast and not in a forced situation. - if !force && time.Since(lastSnapTime) < minSnapDelta { + if recovering || (!force && time.Since(lastSnapTime) < minSnapDelta) { return } @@ -4597,7 +4623,6 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { // Track if we are leader. 
var isLeader bool - recovering := true for { select { @@ -5690,25 +5715,18 @@ func groupName(prefix string, peers []string, storage StorageType) string { // returns stream count for this tier as well as applicable reservation size (not including reservations for cfg) // jetStream read lock should be held func tieredStreamAndReservationCount(asa map[string]*streamAssignment, tier string, cfg *StreamConfig) (int, int64) { - numStreams := len(asa) - reservation := int64(0) - if tier == _EMPTY_ { - for _, sa := range asa { - if sa.Config.MaxBytes > 0 && sa.Config.Name != cfg.Name { - if sa.Config.Storage == cfg.Storage { - reservation += (int64(sa.Config.Replicas) * sa.Config.MaxBytes) - } - } - } - } else { - numStreams = 0 - for _, sa := range asa { - if isSameTier(sa.Config, cfg) { - numStreams++ - if sa.Config.MaxBytes > 0 { - if sa.Config.Storage == cfg.Storage && sa.Config.Name != cfg.Name { - reservation += (int64(sa.Config.Replicas) * sa.Config.MaxBytes) - } + var numStreams int + var reservation int64 + for _, sa := range asa { + if tier == _EMPTY_ || isSameTier(sa.Config, cfg) { + numStreams++ + if sa.Config.MaxBytes > 0 && sa.Config.Storage == cfg.Storage && sa.Config.Name != cfg.Name { + // If tier is empty, all storage is flat and we should adjust for replicas. + // Otherwise if tiered, storage replication already taken into consideration. + if tier == _EMPTY_ && cfg.Replicas > 1 { + reservation += sa.Config.MaxBytes * int64(cfg.Replicas) + } else { + reservation += sa.Config.MaxBytes } } } @@ -5812,40 +5830,18 @@ func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, js.mu.Lock() defer js.mu.Unlock() - // Capture if we have existing assignment first. - osa := js.streamAssignment(acc.Name, cfg.Name) - var areEqual bool - if osa != nil { - areEqual = reflect.DeepEqual(osa.Config, cfg) - } + var self *streamAssignment + var rg *raftGroup - // If this stream already exists, turn this into a stream info call. 
- if osa != nil { - // If they are the same then we will forward on as a stream info request. - // This now matches single server behavior. - if areEqual { - // This works when we have a stream leader. If we have no leader let the dupe - // go through as normal. We will handle properly on the other end. - // We must check interest at the $SYS account layer, not user account since import - // will always show interest. - sisubj := fmt.Sprintf(clusterStreamInfoT, acc.Name, cfg.Name) - if s.SystemAccount().Interest(sisubj) > 0 { - isubj := fmt.Sprintf(JSApiStreamInfoT, cfg.Name) - // We want to make sure we send along the client info. - cij, _ := json.Marshal(ci) - hdr := map[string]string{ - ClientInfoHdr: string(cij), - JSResponseType: jsCreateResponse, - } - // Send this as system account, but include client info header. - s.sendInternalAccountMsgWithReply(nil, isubj, reply, hdr, nil, true) - return - } - } else { + // Capture if we have existing assignment first. + if osa := js.streamAssignment(acc.Name, cfg.Name); osa != nil { + if !reflect.DeepEqual(osa.Config, cfg) { resp.Error = NewJSStreamNameExistError() s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) return } + // This is an equal assignment. + self, rg = osa, osa.Group } if cfg.Sealed { @@ -5854,11 +5850,6 @@ func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, return } - var self *streamAssignment - if osa != nil && areEqual { - self = osa - } - // Check for subject collisions here. if cc.subjectsOverlap(acc.Name, cfg.Subjects, self) { resp.Error = NewJSStreamSubjectOverlapError() @@ -5875,10 +5866,7 @@ func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, } // Raft group selection and placement. - var rg *raftGroup - if osa != nil && areEqual { - rg = osa.Group - } else { + if rg == nil { // Check inflight before proposing in case we have an existing inflight proposal. 
if cc.inflight == nil { cc.inflight = make(map[string]map[string]*raftGroup) @@ -5892,7 +5880,7 @@ func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, rg = existing } } - // Create a new one here. + // Create a new one here if needed. if rg == nil { nrg, err := js.createGroupForStream(ci, cfg) if err != nil { @@ -6860,6 +6848,22 @@ func decodeStreamAssignment(buf []byte) (*streamAssignment, error) { return &sa, err } +func encodeDeleteRange(dr *DeleteRange) []byte { + var bb bytes.Buffer + bb.WriteByte(byte(deleteRangeOp)) + json.NewEncoder(&bb).Encode(dr) + return bb.Bytes() +} + +func decodeDeleteRange(buf []byte) (*DeleteRange, error) { + var dr DeleteRange + err := json.Unmarshal(buf, &dr) + if err != nil { + return nil, err + } + return &dr, err +} + // createGroupForConsumer will create a new group from same peer set as the stream. func (cc *jetStreamCluster) createGroupForConsumer(cfg *ConsumerConfig, sa *streamAssignment) *raftGroup { if len(sa.Group.Peers) == 0 || cfg.Replicas > len(sa.Group.Peers) { @@ -7433,7 +7437,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ mset.mu.RLock() canRespond := !mset.cfg.NoAck && len(reply) > 0 name, stype, store := mset.cfg.Name, mset.cfg.Storage, mset.store - s, js, jsa, st, rf, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node + s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, int64(mset.cfg.Replicas), mset.tier, mset.outq, mset.node maxMsgSize, lseq, clfs := int(mset.cfg.MaxMsgSize), mset.lseq, mset.clfs isLeader, isSealed := mset.isLeader(), mset.cfg.Sealed mset.mu.RUnlock() @@ -7491,16 +7495,20 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ t = &jsaStorage{} jsa.usage[tierName] = t } - if st == MemoryStorage { - total := t.total.store + int64(memStoreMsgSize(subject, hdr, msg)*uint64(rf)) - if 
jsaLimits.MaxMemory > 0 && total > jsaLimits.MaxMemory { - exceeded = true - } - } else { - total := t.total.store + int64(fileStoreMsgSize(subject, hdr, msg)*uint64(rf)) - if jsaLimits.MaxStore > 0 && total > jsaLimits.MaxStore { - exceeded = true - } + // Make sure replicas is correct. + if r < 1 { + r = 1 + } + // This is for limits. If we have no tier, consider all to be flat, vs tiers like R3 where we want to scale limit by replication. + lr := r + if tierName == _EMPTY_ { + lr = 1 + } + // Tiers are flat, meaning the limit for R3 will be 100GB, not 300GB, so compare to total but adjust limits. + if st == MemoryStorage && jsaLimits.MaxMemory > 0 { + exceeded = t.total.mem+(int64(memStoreMsgSize(subject, hdr, msg))*r) > (jsaLimits.MaxMemory * lr) + } else if jsaLimits.MaxStore > 0 { + exceeded = t.total.store+(int64(fileStoreMsgSize(subject, hdr, msg))*r) > (jsaLimits.MaxStore * lr) } jsa.usageMu.Unlock() @@ -7626,9 +7634,10 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // For requesting messages post raft snapshot to catch up streams post server restart. // Any deleted msgs etc will be handled inline on catchup. type streamSyncRequest struct { - Peer string `json:"peer,omitempty"` - FirstSeq uint64 `json:"first_seq"` - LastSeq uint64 `json:"last_seq"` + Peer string `json:"peer,omitempty"` + FirstSeq uint64 `json:"first_seq"` + LastSeq uint64 `json:"last_seq"` + DeleteRangesOk bool `json:"delete_ranges"` } // Given a stream state that represents a snapshot, calculate the sync request based on our current state. 
@@ -7637,7 +7646,7 @@ func (mset *stream) calculateSyncRequest(state *StreamState, snap *StreamReplica if state.LastSeq >= snap.LastSeq { return nil } - return &streamSyncRequest{FirstSeq: state.LastSeq + 1, LastSeq: snap.LastSeq, Peer: mset.node.ID()} + return &streamSyncRequest{FirstSeq: state.LastSeq + 1, LastSeq: snap.LastSeq, Peer: mset.node.ID(), DeleteRangesOk: true} } // processSnapshotDeletes will update our current store based on the snapshot @@ -7647,14 +7656,22 @@ func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) { var state StreamState mset.store.FastState(&state) // Always adjust if FirstSeq has moved beyond our state. + var didReset bool if snap.FirstSeq > state.FirstSeq { mset.store.Compact(snap.FirstSeq) mset.store.FastState(&state) mset.lseq = state.LastSeq mset.clearAllPreAcksBelowFloor(state.FirstSeq) + didReset = true } + s := mset.srv mset.mu.Unlock() + if didReset { + s.Warnf("Catchup for stream '%s > %s' resetting first sequence: %d on catchup request", + mset.account(), mset.name(), snap.FirstSeq) + } + if len(snap.Deleted) > 0 { mset.store.SyncDeleted(snap.Deleted) } @@ -7684,6 +7701,22 @@ func (mset *stream) updateCatchupPeer(peer string) { mset.mu.Unlock() } +func (mset *stream) decrementCatchupPeer(peer string, num uint64) { + if peer == _EMPTY_ { + return + } + mset.mu.Lock() + if lag := mset.catchups[peer]; lag > 0 { + if lag >= num { + lag -= num + } else { + lag = 0 + } + mset.catchups[peer] = lag + } + mset.mu.Unlock() +} + func (mset *stream) clearCatchupPeer(peer string) { mset.mu.Lock() if mset.catchups != nil { @@ -7715,21 +7748,15 @@ func (mset *stream) hasCatchupPeers() bool { } func (mset *stream) setCatchingUp() { - mset.mu.Lock() - mset.catchup = true - mset.mu.Unlock() + mset.catchup.Store(true) } func (mset *stream) clearCatchingUp() { - mset.mu.Lock() - mset.catchup = false - mset.mu.Unlock() + mset.catchup.Store(false) } func (mset *stream) isCatchingUp() bool { - mset.mu.RLock() - defer 
mset.mu.RUnlock() - return mset.catchup + return mset.catchup.Load() } // Determine if a non-leader is current. @@ -7738,7 +7765,7 @@ func (mset *stream) isCurrent() bool { if mset.node == nil { return true } - return mset.node.Current() && !mset.catchup + return mset.node.Current() && !mset.catchup.Load() } // Maximum requests for the whole server that can be in flight at the same time. @@ -7762,7 +7789,6 @@ func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) { mset.store.FastState(&state) mset.setCLFS(snap.Failed) sreq := mset.calculateSyncRequest(&state, snap) - s, js, subject, n := mset.srv, mset.js, mset.sa.Sync, mset.node qname := fmt.Sprintf("[ACC:%s] stream '%s' snapshot", mset.acc.Name, mset.cfg.Name) mset.mu.Unlock() @@ -7796,7 +7822,7 @@ func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) { var sub *subscription var err error - const activityInterval = 10 * time.Second + const activityInterval = 30 * time.Second notActive := time.NewTimer(activityInterval) defer notActive.Stop() @@ -7832,36 +7858,12 @@ func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) { defer releaseSyncOutSem() // Check our final state when we exit cleanly. - // If this snapshot was for messages no longer held by the leader we want to make sure - // we are synched for the next message sequence properly. - lastRequested := sreq.LastSeq + // This will make sure we have interest consumers updated. checkFinalState := func() { // Bail if no stream. if mset == nil { return } - - mset.mu.Lock() - var state StreamState - mset.store.FastState(&state) - var didReset bool - firstExpected := lastRequested + 1 - if state.FirstSeq != firstExpected { - // Reset our notion of first. - mset.store.Compact(firstExpected) - mset.store.FastState(&state) - // Make sure last is also correct in case this also moved. 
- mset.lseq = state.LastSeq - mset.clearAllPreAcksBelowFloor(state.FirstSeq) - didReset = true - } - mset.mu.Unlock() - - if didReset { - s.Warnf("Catchup for stream '%s > %s' resetting first sequence: %d on catchup complete", - mset.account(), mset.name(), firstExpected) - } - mset.mu.RLock() consumers := make([]*consumer, 0, len(mset.consumers)) for _, o := range mset.consumers { @@ -7907,7 +7909,7 @@ RETRY: <-s.syncOutSem releaseSem = true - // We may have been blocked for a bit, so the reset need to ensure that we + // We may have been blocked for a bit, so the reset needs to ensure that we // consume the already fired timer. if !notActive.Stop() { select { @@ -7927,8 +7929,6 @@ RETRY: if sreq == nil { return nil } - // Reset notion of lastRequested - lastRequested = sreq.LastSeq } // Used to transfer message from the wire to another Go routine internally. @@ -7951,22 +7951,20 @@ RETRY: // Send our catchup request here. reply := syncReplySubject() sub, err = s.sysSubscribe(reply, func(_ *subscription, _ *client, _ *Account, _, reply string, msg []byte) { - // Make copies - // TODO(dlc) - Since we are using a buffer from the inbound client/route. + // Make copy since we are using a buffer from the inbound client/route. msgsQ.push(&im{copyBytes(msg), reply}) }) if err != nil { s.Errorf("Could not subscribe to stream catchup: %v", err) goto RETRY } + // Send our sync request. b, _ := json.Marshal(sreq) s.sendInternalMsgLocked(subject, reply, nil, b) - // Remember when we sent this out to avoimd loop spins on errors below. + // Remember when we sent this out to avoid loop spins on errors below. reqSendTime := time.Now() - - // Clear our sync request and capture last. - last := sreq.LastSeq + // Clear our sync request. sreq = nil // Run our own select loop here. @@ -7976,24 +7974,18 @@ RETRY: notActive.Reset(activityInterval) mrecs := msgsQ.pop() - for _, mrec := range mrecs { msg := mrec.msg - // Check for eof signaling. 
if len(msg) == 0 { msgsQ.recycle(&mrecs) checkFinalState() return nil } - if lseq, err := mset.processCatchupMsg(msg); err == nil { + if _, err := mset.processCatchupMsg(msg); err == nil { if mrec.reply != _EMPTY_ { s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, nil) } - if lseq >= last { - msgsQ.recycle(&mrecs) - return nil - } } else if isOutOfSpaceErr(err) { notifyLeaderStopCatchup(mrec, err) return err @@ -8026,6 +8018,7 @@ RETRY: goto RETRY } } + notActive.Reset(activityInterval) msgsQ.recycle(&mrecs) case <-notActive.C: if mrecs := msgsQ.pop(); len(mrecs) > 0 { @@ -8054,11 +8047,34 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { return 0, errCatchupBadMsg } op := entryOp(msg[0]) - if op != streamMsgOp && op != compressedStreamMsgOp { + if op != streamMsgOp && op != compressedStreamMsgOp && op != deleteRangeOp { return 0, errCatchupBadMsg } mbuf := msg[1:] + if op == deleteRangeOp { + dr, err := decodeDeleteRange(mbuf) + if err != nil { + return 0, errCatchupBadMsg + } + // Handle the delete range. + // Make sure the sequences match up properly. 
+ mset.mu.Lock() + if len(mset.preAcks) > 0 { + for seq := dr.First; seq < dr.First+dr.Num; seq++ { + mset.clearAllPreAcks(seq) + } + } + if err = mset.store.SkipMsgs(dr.First, dr.Num); err != nil { + mset.mu.Unlock() + return 0, errCatchupWrongSeqForSkip + } + mset.lseq = dr.First + dr.Num - 1 + lseq := mset.lseq + mset.mu.Unlock() + return lseq, nil + } + if op == compressedStreamMsgOp { var err error mbuf, err = s2.Decode(nil, mbuf) @@ -8076,6 +8092,7 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { st := mset.cfg.Storage ddloaded := mset.ddloaded tierName := mset.tier + replicas := mset.cfg.Replicas if mset.hasAllPreAcks(seq, subj) { mset.clearAllPreAcks(seq) @@ -8086,7 +8103,7 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { if mset.js.limitsExceeded(st) { return 0, NewJSInsufficientResourcesError() - } else if exceeded, apiErr := mset.jsa.limitsExceeded(st, tierName); apiErr != nil { + } else if exceeded, apiErr := mset.jsa.limitsExceeded(st, tierName, replicas); apiErr != nil { return 0, apiErr } else if exceeded { return 0, NewJSInsufficientResourcesError() @@ -8400,8 +8417,8 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { s := mset.srv defer s.grWG.Done() - const maxOutBytes = int64(8 * 1024 * 1024) // 8MB for now, these are all internal, from server to server - const maxOutMsgs = int32(32 * 1024) + const maxOutBytes = int64(64 * 1024 * 1024) // 64MB for now, these are all internal, from server to server + const maxOutMsgs = int32(256 * 1024) // 256k in case we have lots of small messages or skip msgs. outb := int64(0) outm := int32(0) @@ -8420,6 +8437,10 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { nextBatchC <- struct{}{} remoteQuitCh := make(chan struct{}) + const activityInterval = 30 * time.Second + notActive := time.NewTimer(activityInterval) + defer notActive.Stop() + // Setup ackReply for flow control. 
ackReply := syncAckSubject() ackSub, _ := s.sysSubscribe(ackReply, func(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { @@ -8437,16 +8458,14 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { // Kick ourselves and anyone else who might have stalled on global state. select { case nextBatchC <- struct{}{}: + // Reset our activity + notActive.Reset(activityInterval) default: } }) defer s.sysUnsubscribe(ackSub) ackReplyT := strings.ReplaceAll(ackReply, ".*", ".%d") - const activityInterval = 5 * time.Second - notActive := time.NewTimer(activityInterval) - defer notActive.Stop() - // Grab our state. var state StreamState mset.mu.RLock() @@ -8469,10 +8488,9 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { compressOk := mset.compressAllowed() var spb int - sendNextBatchAndContinue := func(qch chan struct{}) bool { - // Update our activity timer. - notActive.Reset(activityInterval) + const minWait = 5 * time.Second + sendNextBatchAndContinue := func(qch chan struct{}) bool { // Check if we know we will not enter the loop because we are done. if seq > last { s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) @@ -8481,15 +8499,23 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { return false } - // If we already sent a batch, we will try to make sure we process around - // half the FC responses - or reach a certain amount of time - before sending - // the next batch. + // If we already sent a batch, we will try to make sure we can at least send a minimum + // batch before sending the next batch. 
if spb > 0 { - mw := time.NewTimer(100 * time.Millisecond) + // Wait til we can send at least 4k + const minBatchWait = int32(4 * 1024) + mw := time.NewTimer(minWait) for done := false; !done; { select { case <-nextBatchC: - done = int(atomic.LoadInt32(&outm)) <= spb/2 + done = maxOutMsgs-atomic.LoadInt32(&outm) > minBatchWait + if !done { + // Wait for a small bit. + time.Sleep(50 * time.Millisecond) + } else { + // GC friendly. + mw.Stop() + } case <-mw.C: done = true case <-s.quitCh: @@ -8503,8 +8529,38 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { spb = 0 } - var smv StoreMsg + // Send an encoded msg. + sendEM := func(em []byte) { + // Place size in reply subject for flow control. + l := int64(len(em)) + reply := fmt.Sprintf(ackReplyT, l) + s.gcbAdd(&outb, l) + atomic.AddInt32(&outm, 1) + s.sendInternalMsgLocked(sendSubject, reply, nil, em) + spb++ + } + // If we support gap markers. + var dr DeleteRange + drOk := sreq.DeleteRangesOk + + // Will send our delete range. + // Should already be checked for being valid. + sendDR := func() { + if dr.Num == 1 { + // Send like a normal skip msg. + sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, dr.First, 0)) + } else { + // We have a run, send a gap record. We send these without reply or tracking. + s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, encodeDeleteRange(&dr)) + // Clear out the pending for catchup. + mset.decrementCatchupPeer(sreq.Peer, dr.Num) + } + // Reset always. + dr.First, dr.Num = 0, 0 + } + + var smv StoreMsg for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs && s.gcbBelowMax(); seq++ { sm, err := mset.store.LoadMsg(seq, &smv) // if this is not a deleted msg, bail out. 
@@ -8530,22 +8586,33 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { s.Warnf("Error loading message for catchup '%s > %s': %v", mset.account(), mset.name(), err) return false } - var em []byte + if sm != nil { - em = encodeStreamMsgAllowCompress(sm.subj, _EMPTY_, sm.hdr, sm.msg, sm.seq, sm.ts, compressOk) + // If we allow gap markers check if we have one pending. + if drOk && dr.First > 0 { + sendDR() + } + // Send the normal message now. + sendEM(encodeStreamMsgAllowCompress(sm.subj, _EMPTY_, sm.hdr, sm.msg, sm.seq, sm.ts, compressOk)) } else { - // Skip record for deleted msg. - em = encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq, 0) + if drOk { + if dr.First == 0 { + dr.First, dr.Num = seq, 1 + } else { + dr.Num++ + } + } else { + // Skip record for deleted msg. + sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq, 0)) + } } - // Place size in reply subject for flow control. - l := int64(len(em)) - reply := fmt.Sprintf(ackReplyT, l) - s.gcbAdd(&outb, l) - atomic.AddInt32(&outm, 1) - s.sendInternalMsgLocked(sendSubject, reply, nil, em) - spb++ + // Check if we are done. if seq == last { + // Need to see if we have a pending delete range. + if drOk && dr.First > 0 { + sendDR() + } s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) // EOF s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) @@ -8557,10 +8624,14 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { default: } } + if drOk && dr.First > 0 { + sendDR() + } + return true } - // Grab stream quit channel. + // Check if this stream got closed. 
mset.mu.RLock() qch := mset.qch mset.mu.RUnlock() @@ -8584,6 +8655,7 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { return case <-notActive.C: s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name()) + mset.clearCatchupPeer(sreq.Peer) return case <-nextBatchC: if !sendNextBatchAndContinue(qch) { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go index e9503b9d1a..02bf4bd873 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go @@ -1,4 +1,4 @@ -// Copyright 2019-2023 The NATS Authors +// Copyright 2019-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -239,7 +239,7 @@ func validateLeafNode(o *Options) error { } } else { if len(o.LeafNode.Users) != 0 { - return fmt.Errorf("operator mode does not allow specifying user in leafnode config") + return fmt.Errorf("operator mode does not allow specifying users in leafnode config") } for _, r := range o.LeafNode.Remotes { if !nkeys.IsValidPublicAccountKey(r.LocalAccount) { @@ -299,12 +299,12 @@ func validateLeafNode(o *Options) error { // with gateways. So if an option validation needs to be done regardless, // it MUST be done before this point! - if o.Gateway.Name == "" && o.Gateway.Port == 0 { + if o.Gateway.Name == _EMPTY_ && o.Gateway.Port == 0 { return nil } // If we are here we have both leaf nodes and gateways defined, make sure there // is a system account defined. 
- if o.SystemAccount == "" { + if o.SystemAccount == _EMPTY_ { return fmt.Errorf("leaf nodes and gateways (both being defined) require a system account to also be configured") } if err := validatePinnedCerts(o.LeafNode.TLSPinnedCerts); err != nil { @@ -334,6 +334,9 @@ func validateLeafNodeAuthOptions(o *Options) error { if o.LeafNode.Username != _EMPTY_ { return fmt.Errorf("can not have a single user/pass and a users array") } + if o.LeafNode.Nkey != _EMPTY_ { + return fmt.Errorf("can not have a single nkey and a users array") + } users := map[string]struct{}{} for _, u := range o.LeafNode.Users { if _, exists := users[u.Username]; exists { @@ -830,6 +833,19 @@ func (c *client) sendLeafConnect(clusterName string, headers bool) error { sig := base64.RawURLEncoding.EncodeToString(sigraw) cinfo.JWT = bytesToString(tmp) cinfo.Sig = sig + } else if nkey := c.leaf.remote.Nkey; nkey != _EMPTY_ { + kp, err := nkeys.FromSeed([]byte(nkey)) + if err != nil { + c.Errorf("Remote nkey has malformed seed") + return err + } + // Wipe our key on exit. + defer kp.Wipe() + sigraw, _ := kp.Sign(c.nonce) + sig := base64.RawURLEncoding.EncodeToString(sigraw) + pkey, _ := kp.PublicKey() + cinfo.Nkey = pkey + cinfo.Sig = sig } else if userInfo := c.leaf.remote.curURL.User; userInfo != nil { cinfo.User = userInfo.Username() cinfo.Pass, _ = userInfo.Password() @@ -839,7 +855,7 @@ func (c *client) sendLeafConnect(clusterName string, headers bool) error { } b, err := json.Marshal(cinfo) if err != nil { - c.Errorf("Error marshaling CONNECT to route: %v\n", err) + c.Errorf("Error marshaling CONNECT to remote leafnode: %v\n", err) return err } // Although this call is made before the writeLoop is created, @@ -1688,6 +1704,7 @@ func (s *Server) removeLeafNodeConnection(c *client) { // Connect information for solicited leafnodes. 
type leafConnectInfo struct { Version string `json:"version,omitempty"` + Nkey string `json:"nkey,omitempty"` JWT string `json:"jwt,omitempty"` Sig string `json:"sig,omitempty"` User string `json:"user,omitempty"` @@ -2169,6 +2186,28 @@ func (c *client) forceAddToSmap(subj string) { c.sendLeafNodeSubUpdate(subj, 1) } +// Used to force remove a subject from the subject map. +func (c *client) forceRemoveFromSmap(subj string) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.leaf.smap == nil { + return + } + n := c.leaf.smap[subj] + if n == 0 { + return + } + n-- + if n == 0 { + // Remove is now zero + delete(c.leaf.smap, subj) + c.sendLeafNodeSubUpdate(subj, 0) + } else { + c.leaf.smap[subj] = n + } +} + // Send the subscription interest change to the other side. // Lock should be held. func (c *client) sendLeafNodeSubUpdate(key string, n int32) { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go index 9853dd316f..8d324d01f7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go @@ -256,12 +256,42 @@ func (ms *memStore) SkipMsg() uint64 { if ms.state.Msgs == 0 { ms.state.FirstSeq = seq ms.state.FirstTime = now + } else { + ms.dmap.Insert(seq) } - ms.updateFirstSeq(seq) ms.mu.Unlock() return seq } +// Skip multiple msgs. +func (ms *memStore) SkipMsgs(seq uint64, num uint64) error { + // Grab time. + now := time.Now().UTC() + + ms.mu.Lock() + defer ms.mu.Unlock() + + // Check sequence matches our last sequence. 
+ if seq != ms.state.LastSeq+1 { + if seq > 0 { + return ErrSequenceMismatch + } + seq = ms.state.LastSeq + 1 + } + lseq := seq + num - 1 + + ms.state.LastSeq = lseq + ms.state.LastTime = now + if ms.state.Msgs == 0 { + ms.state.FirstSeq, ms.state.FirstTime = lseq+1, now + } else { + for ; seq <= lseq; seq++ { + ms.dmap.Insert(seq) + } + } + return nil +} + // RegisterStorageUpdates registers a callback for updates to storage changes. // It will present number of messages and bytes as a signed integer and an // optional sequence number of the message if a single. @@ -1058,9 +1088,6 @@ func (ms *memStore) updateFirstSeq(seq uint64) { ms.dmap.Delete(seq) } } - if ms.dmap.IsEmpty() { - ms.dmap.SetInitialMin(ms.state.FirstSeq) - } } // Remove a seq from the fss and select new first. @@ -1186,13 +1213,16 @@ func (ms *memStore) State() StreamState { // Calculate interior delete details. if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 { - state.Deleted = make([]uint64, 0, state.NumDeleted) - // TODO(dlc) - Too Simplistic, once state is updated to allow runs etc redo. 
- for seq := state.FirstSeq + 1; seq < ms.state.LastSeq; seq++ { - if _, ok := ms.msgs[seq]; !ok { + state.Deleted = make([]uint64, 0, numDeleted) + fseq, lseq := state.FirstSeq, state.LastSeq + ms.dmap.Range(func(seq uint64) bool { + if seq < fseq || seq > lseq { + ms.dmap.Delete(seq) + } else { state.Deleted = append(state.Deleted, seq) } - } + return true + }) } if len(state.Deleted) > 0 { state.NumDeleted = len(state.Deleted) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go index 90f3f1de63..20e38a2922 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go @@ -3148,17 +3148,17 @@ func (t HealthZErrorType) MarshalJSON() ([]byte, error) { func (t *HealthZErrorType) UnmarshalJSON(data []byte) error { switch string(data) { - case jsonString("CONNECTION"): + case `"CONNECTION"`: *t = HealthzErrorConn - case jsonString("BAD_REQUEST"): + case `"BAD_REQUEST"`: *t = HealthzErrorBadRequest - case jsonString("JETSTREAM"): + case `"JETSTREAM"`: *t = HealthzErrorJetStream - case jsonString("ACCOUNT"): + case `"ACCOUNT"`: *t = HealthzErrorAccount - case jsonString("STREAM"): + case `"STREAM"`: *t = HealthzErrorStream - case jsonString("CONSUMER"): + case `"CONSUMER"`: *t = HealthzErrorConsumer default: return fmt.Errorf("unknown healthz error type %q", data) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go b/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go index 783af64bf5..e82ddb293e 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go @@ -186,17 +186,18 @@ const ( mqttInitialPubHeader = 16 // An overkill, should need 7 bytes max mqttProcessSubTooLong = 100 * time.Millisecond - mqttRetainedCacheTTL = 2 * time.Minute + mqttDefaultRetainedCacheTTL = 2 * time.Minute mqttRetainedTransferTimeout = 10 * 
time.Second ) var ( - mqttPingResponse = []byte{mqttPacketPingResp, 0x0} - mqttProtoName = []byte("MQTT") - mqttOldProtoName = []byte("MQIsdp") - mqttSessJailDur = mqttSessFlappingJailDur - mqttFlapCleanItvl = mqttSessFlappingCleanupInterval - mqttJSAPITimeout = 4 * time.Second + mqttPingResponse = []byte{mqttPacketPingResp, 0x0} + mqttProtoName = []byte("MQTT") + mqttOldProtoName = []byte("MQIsdp") + mqttSessJailDur = mqttSessFlappingJailDur + mqttFlapCleanItvl = mqttSessFlappingCleanupInterval + mqttJSAPITimeout = 4 * time.Second + mqttRetainedCacheTTL = mqttDefaultRetainedCacheTTL ) var ( @@ -217,7 +218,7 @@ var ( errMQTTEmptyUsername = errors.New("empty user name not allowed") errMQTTTopicIsEmpty = errors.New("topic cannot be empty") errMQTTPacketIdentifierIsZero = errors.New("packet identifier cannot be 0") - errMQTTUnsupportedCharacters = errors.New("characters ' ' and '.' not supported for MQTT topics") + errMQTTUnsupportedCharacters = errors.New("character ' ' not supported for MQTT topics") errMQTTInvalidSession = errors.New("invalid MQTT session") ) @@ -242,20 +243,25 @@ type mqttAccountSessionManager struct { flapTimer *time.Timer // Timer to perform some cleanup of the flappers map sl *Sublist // sublist allowing to find retained messages for given subscription retmsgs map[string]*mqttRetainedMsgRef // retained messages - rmsCache sync.Map // map[string(subject)]mqttRetainedMsg + rmsCache sync.Map // map[subject]*mqttRetainedMsg jsa mqttJSA rrmLastSeq uint64 // Restore retained messages expected last sequence rrmDoneCh chan struct{} // To notify the caller that all retained messages have been loaded domainTk string // Domain (with trailing "."), or possibly empty. This is added to session subject. 
} +type mqttJSAResponse struct { + reply string // will be used to map to the original request in jsa.NewRequestExMulti + value any +} + type mqttJSA struct { mu sync.Mutex id string c *client sendq *ipQueue[*mqttJSPubMsg] rplyr string - replies sync.Map + replies sync.Map // [string]chan *mqttJSAResponse nuid *nuid.NUID quitCh chan struct{} domain string // Domain or possibly empty. This is added to session subject. @@ -1232,7 +1238,7 @@ func (s *Server) mqttCreateAccountSessionManager(acc *Account, quitCh chan struc // Start the go routine that will clean up cached retained messages that expired. s.startGoRoutine(func() { defer s.grWG.Done() - as.cleaupRetainedMessageCache(s, closeCh) + as.cleanupRetainedMessageCache(s, closeCh) }) lookupStream := func(stream, txt string) (*StreamInfo, error) { @@ -1523,51 +1529,106 @@ func (jsa *mqttJSA) prefixDomain(subject string) string { return subject } -func (jsa *mqttJSA) newRequestEx(kind, subject, cidHash string, hdr int, msg []byte, timeout time.Duration) (interface{}, error) { - var sb strings.Builder - // Either we use nuid.Next() which uses a global lock, or our own nuid object, but - // then it needs to be "write" protected. This approach will reduce across account - // contention since we won't use the global nuid's lock. - jsa.mu.Lock() - uid := jsa.nuid.Next() - sb.WriteString(jsa.rplyr) - jsa.mu.Unlock() +func (jsa *mqttJSA) newRequestEx(kind, subject, cidHash string, hdr int, msg []byte, timeout time.Duration) (any, error) { + responses, err := jsa.newRequestExMulti(kind, subject, cidHash, []int{hdr}, [][]byte{msg}, timeout) + if err != nil { + return nil, err + } + if len(responses) != 1 { + return nil, fmt.Errorf("unreachable: invalid number of responses (%d)", len(responses)) + } + return responses[0].value, nil +} - sb.WriteString(kind) - sb.WriteByte(btsep) - if cidHash != _EMPTY_ { - sb.WriteString(cidHash) +// newRequestExMulti sends multiple messages on the same subject and waits for +// all responses. 
It returns the same number of responses in the same order as +// msgs parameter. In case of a timeout it returns an error as well as all +// responses received as a sparsely populated array, matching msgs, with nils +// for the values that have not yet been received. +// +// Note that each response may represent an error and should be inspected as +// such by the caller. +func (jsa *mqttJSA) newRequestExMulti(kind, subject, cidHash string, hdrs []int, msgs [][]byte, timeout time.Duration) ([]*mqttJSAResponse, error) { + if len(hdrs) != len(msgs) { + return nil, fmt.Errorf("unreachable: invalid number of messages (%d) or header offsets (%d)", len(msgs), len(hdrs)) + } + responseCh := make(chan *mqttJSAResponse, len(msgs)) + + // Generate and queue all outgoing requests, have all results reported to + // responseCh, and store a map of reply subjects to the original subjects' + // indices. + r2i := map[string]int{} + for i, msg := range msgs { + hdr := hdrs[i] + var sb strings.Builder + // Either we use nuid.Next() which uses a global lock, or our own nuid object, but + // then it needs to be "write" protected. This approach will reduce across account + // contention since we won't use the global nuid's lock. + jsa.mu.Lock() + uid := jsa.nuid.Next() + sb.WriteString(jsa.rplyr) + jsa.mu.Unlock() + + sb.WriteString(kind) sb.WriteByte(btsep) + if cidHash != _EMPTY_ { + sb.WriteString(cidHash) + sb.WriteByte(btsep) + } + sb.WriteString(uid) + reply := sb.String() + + // Add responseCh to the reply channel map. It will be cleaned out on + // timeout (see below), or in processJSAPIReplies upon receiving the + // response. 
+ jsa.replies.Store(reply, responseCh) + + subject = jsa.prefixDomain(subject) + jsa.sendq.push(&mqttJSPubMsg{ + subj: subject, + reply: reply, + hdr: hdr, + msg: msg, + }) + r2i[reply] = i } - sb.WriteString(uid) - reply := sb.String() - ch := make(chan interface{}, 1) - jsa.replies.Store(reply, ch) - - subject = jsa.prefixDomain(subject) - jsa.sendq.push(&mqttJSPubMsg{ - subj: subject, - reply: reply, - hdr: hdr, - msg: msg, - }) - - var i interface{} - // We don't want to use time.After() which causes memory growth because the timer - // can't be stopped and will need to expire to then be garbage collected. + // Wait for all responses to come back, or for the timeout to expire. We + // don't want to use time.After() which causes memory growth because the + // timer can't be stopped and will need to expire to then be garbage + // collected. + c := 0 + responses := make([]*mqttJSAResponse, len(msgs)) + start := time.Now() t := time.NewTimer(timeout) - select { - case i = <-ch: - // Ensure we stop the timer so it can be quickly garbage collected. 
- t.Stop() - case <-jsa.quitCh: - return nil, ErrServerNotRunning - case <-t.C: - jsa.replies.Delete(reply) - return nil, fmt.Errorf("timeout for request type %q on %q (reply=%q)", kind, subject, reply) + defer t.Stop() + for { + select { + case r := <-responseCh: + i := r2i[r.reply] + responses[i] = r + c++ + if c == len(msgs) { + return responses, nil + } + + case <-jsa.quitCh: + return nil, ErrServerNotRunning + + case <-t.C: + var reply string + now := time.Now() + for reply = range r2i { // preserve the last value for Errorf + jsa.replies.Delete(reply) + } + + if len(msgs) == 1 { + return responses, fmt.Errorf("timeout after %v: request type %q on %q (reply=%q)", now.Sub(start), kind, subject, reply) + } else { + return responses, fmt.Errorf("timeout after %v: request type %q on %q: got %d out of %d", now.Sub(start), kind, subject, c, len(msgs)) + } + } } - return i, nil } func (jsa *mqttJSA) sendAck(ackSubject string) { @@ -1667,6 +1728,30 @@ func (jsa *mqttJSA) loadLastMsgFor(streamName string, subject string) (*StoredMs return lmr.Message, lmr.ToError() } +func (jsa *mqttJSA) loadLastMsgForMulti(streamName string, subjects []string) ([]*JSApiMsgGetResponse, error) { + marshaled := make([][]byte, 0, len(subjects)) + headerBytes := make([]int, 0, len(subjects)) + for _, subject := range subjects { + mreq := &JSApiMsgGetRequest{LastFor: subject} + bb, err := json.Marshal(mreq) + if err != nil { + return nil, err + } + marshaled = append(marshaled, bb) + headerBytes = append(headerBytes, 0) + } + + all, err := jsa.newRequestExMulti(mqttJSAMsgLoad, fmt.Sprintf(JSApiMsgGetT, streamName), _EMPTY_, headerBytes, marshaled, mqttJSAPITimeout) + // all has the same order as subjects, preserve it as we unmarshal + responses := make([]*JSApiMsgGetResponse, len(all)) + for i, v := range all { + if v != nil { + responses[i] = v.value.(*JSApiMsgGetResponse) + } + } + return responses, err +} + func (jsa *mqttJSA) loadNextMsgFor(streamName string, subject string) 
(*StoredMsg, error) { mreq := &JSApiMsgGetRequest{NextFor: subject} req, err := json.Marshal(mreq) @@ -1771,68 +1856,71 @@ func (as *mqttAccountSessionManager) processJSAPIReplies(_ *subscription, pc *cl return } jsa.replies.Delete(subject) - ch := chi.(chan interface{}) + ch := chi.(chan *mqttJSAResponse) + out := func(value any) { + ch <- &mqttJSAResponse{reply: subject, value: value} + } switch token { case mqttJSAStreamCreate: var resp = &JSApiStreamCreateResponse{} if err := json.Unmarshal(msg, resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAStreamUpdate: var resp = &JSApiStreamUpdateResponse{} if err := json.Unmarshal(msg, resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAStreamLookup: var resp = &JSApiStreamInfoResponse{} if err := json.Unmarshal(msg, &resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAStreamDel: var resp = &JSApiStreamDeleteResponse{} if err := json.Unmarshal(msg, &resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAConsumerCreate: var resp = &JSApiConsumerCreateResponse{} if err := json.Unmarshal(msg, resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAConsumerDel: var resp = &JSApiConsumerDeleteResponse{} if err := json.Unmarshal(msg, resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAMsgStore, mqttJSASessPersist: var resp = &JSPubAckResponse{} if err := json.Unmarshal(msg, resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAMsgLoad: var resp = &JSApiMsgGetResponse{} - if err := json.Unmarshal(msg, resp); err != nil { + if err := json.Unmarshal(msg, &resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAStreamNames: var resp = &JSApiStreamNamesResponse{} if err := json.Unmarshal(msg, 
resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) case mqttJSAMsgDelete: var resp = &JSApiMsgDeleteResponse{} if err := json.Unmarshal(msg, resp); err != nil { resp.Error = NewJSInvalidJSONError() } - ch <- resp + out(resp) default: pc.Warnf("Unknown reply code %q", token) } @@ -1856,8 +1944,8 @@ func (as *mqttAccountSessionManager) processRetainedMsg(_ *subscription, c *clie // At this point we either recover from our own server, or process a remote retained message. seq, _, _ := ackReplyInfo(reply) - // Handle this retained message - as.handleRetainedMsg(rm.Subject, &mqttRetainedMsgRef{sseq: seq}, rm) + // Handle this retained message, no need to copy the bytes. + as.handleRetainedMsg(rm.Subject, &mqttRetainedMsgRef{sseq: seq}, rm, false) // If we were recovering (lastSeq > 0), then check if we are done. if as.rrmLastSeq > 0 && seq >= as.rrmLastSeq { @@ -2001,7 +2089,7 @@ func (as *mqttAccountSessionManager) createSubscription(subject string, cb msgHa // only used when the server shutdown. // // No lock held on entry. -func (as *mqttAccountSessionManager) cleaupRetainedMessageCache(s *Server, closeCh chan struct{}) { +func (as *mqttAccountSessionManager) cleanupRetainedMessageCache(s *Server, closeCh chan struct{}) { tt := time.NewTicker(mqttRetainedCacheTTL) defer tt.Stop() for { @@ -2013,7 +2101,7 @@ func (as *mqttAccountSessionManager) cleaupRetainedMessageCache(s *Server, close i, maxScan := 0, 10*1000 now := time.Now() as.rmsCache.Range(func(key, value interface{}) bool { - rm := value.(mqttRetainedMsg) + rm := value.(*mqttRetainedMsg) if now.After(rm.expiresFromCache) { as.rmsCache.Delete(key) } @@ -2123,7 +2211,7 @@ func (as *mqttAccountSessionManager) sendJSAPIrequests(s *Server, c *client, acc // If a message for this topic already existed, the existing record is updated // with the provided information. // Lock not held on entry. 
-func (as *mqttAccountSessionManager) handleRetainedMsg(key string, rf *mqttRetainedMsgRef, rm *mqttRetainedMsg) { +func (as *mqttAccountSessionManager) handleRetainedMsg(key string, rf *mqttRetainedMsgRef, rm *mqttRetainedMsg, copyBytesToCache bool) { as.mu.Lock() defer as.mu.Unlock() if as.retmsgs == nil { @@ -2151,11 +2239,7 @@ func (as *mqttAccountSessionManager) handleRetainedMsg(key string, rf *mqttRetai // Update the in-memory retained message cache but only for messages // that are already in the cache, i.e. have been (recently) used. if rm != nil { - if _, ok := as.rmsCache.Load(key); ok { - toStore := *rm - toStore.expiresFromCache = time.Now().Add(mqttRetainedCacheTTL) - as.rmsCache.Store(key, toStore) - } + as.setCachedRetainedMsg(key, rm, true, copyBytesToCache) } return } @@ -2274,19 +2358,21 @@ func (as *mqttAccountSessionManager) removeSession(sess *mqttSession, lock bool) // Session lock held on entry. Acquires the subs lock and holds it for // the duration. Non-MQTT messages coming into mqttDeliverMsgCbQoS0 will be // waiting. +func (sess *mqttSession) processQOS12Sub( + c *client, // subscribing client. + subject, sid []byte, isReserved bool, qos byte, jsDurName string, h msgHandler, // subscription parameters. +) (*subscription, error) { + return sess.processSub(c, subject, sid, isReserved, qos, jsDurName, h, false, false, nil, false, nil) +} + func (sess *mqttSession) processSub( - // subscribing client. - c *client, - // subscription parameters. - subject, sid []byte, isReserved bool, qos byte, jsDurName string, h msgHandler, - // do we need to scan for shadow subscriptions? (we don't do it for QOS1+) - initShadow bool, - // len(rms) > 0 means to deliver retained messages for the subscription. - rms map[string]*mqttRetainedMsg, - // trace serialized retained messages in the log. - trace bool, - // the retained messages are kept in the account session manager. - as *mqttAccountSessionManager, + c *client, // subscribing client. 
+ subject, sid []byte, isReserved bool, qos byte, jsDurName string, h msgHandler, // subscription parameters. + initShadow bool, // do we need to scan for shadow subscriptions? (not for QOS1+) + serializeRMS bool, // do we need to serialize RMS? + rms map[string]*mqttRetainedMsg, // preloaded rms (can be empty, or missing items if errors) + trace bool, // trace serialized retained messages in the log? + as *mqttAccountSessionManager, // needed only for rms serialization. ) (*subscription, error) { start := time.Now() defer func() { @@ -2350,6 +2436,10 @@ func (sess *mqttSession) processSub( func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, filters []*mqttFilter, fromSubProto, trace bool) ([]*subscription, error) { + c.mu.Lock() + acc := c.acc + c.mu.Unlock() + // Helper to determine if we need to create a separate top-level // subscription for a wildcard. fwc := func(subject string) (bool, string, string) { @@ -2364,29 +2454,7 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, return true, fwcsubject, fwcsid } - // Cache and a helper to load retained messages for a given subject. - rms := make(map[string]*mqttRetainedMsg) - loadRMS := func(subject []byte) error { - sub := &subscription{ - client: c, - subject: subject, - sid: subject, - } - c.mu.Lock() - acc := c.acc - c.mu.Unlock() - if err := c.addShadowSubscriptions(acc, sub, false); err != nil { - return err - } - // Best-effort loading the messages, logs on errors (to c.srv), loads - // once for subject. - as.loadRetainedMessagesForSubject(rms, subject, c.srv) - for _, ss := range sub.shadow { - as.loadRetainedMessagesForSubject(rms, ss.subject, c.srv) - } - return nil - } - + rmSubjects := map[string]struct{}{} // Preload retained messages for all requested subscriptions. Also, since // it's the first iteration over the filter list, do some cleanup. 
for _, f := range filters { @@ -2416,14 +2484,33 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, } } - // Load retained messages. + // Find retained messages. if fromSubProto { - if err := loadRMS([]byte(f.filter)); err != nil { + addRMSubjects := func(subject string) error { + sub := &subscription{ + client: c, + subject: []byte(subject), + sid: []byte(subject), + } + if err := c.addShadowSubscriptions(acc, sub, false); err != nil { + return err + } + + for _, sub := range append([]*subscription{sub}, sub.shadow...) { + as.addRetainedSubjectsForSubject(rmSubjects, bytesToString(sub.subject)) + for _, ss := range sub.shadow { + as.addRetainedSubjectsForSubject(rmSubjects, bytesToString(ss.subject)) + } + } + return nil + } + + if err := addRMSubjects(f.filter); err != nil { f.qos = mqttSubAckFailure continue } if need, subject, _ := fwc(f.filter); need { - if err := loadRMS([]byte(subject)); err != nil { + if err := addRMSubjects(subject); err != nil { f.qos = mqttSubAckFailure continue } @@ -2431,6 +2518,14 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, } } + serializeRMS := len(rmSubjects) > 0 + var rms map[string]*mqttRetainedMsg + if serializeRMS { + // Make the best effort to load retained messages. We will identify + // errors in the next pass. + rms = as.loadRetainedMessages(rmSubjects, c) + } + // Small helper to add the consumer config to the session. addJSConsToSess := func(sid string, cc *ConsumerConfig) { if cc == nil { @@ -2445,7 +2540,6 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, var err error subs := make([]*subscription, 0, len(filters)) for _, f := range filters { - // Skip what's already been identified as a failure. 
if f.qos == mqttSubAckFailure { continue @@ -2472,7 +2566,7 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, bsubject, bsid, isReserved, f.qos, // main subject _EMPTY_, mqttDeliverMsgCbQoS0, // no jsDur for QOS0 processShadowSubs, - rms, trace, as) // rms is empty if not fromSubProto + serializeRMS, rms, trace, as) sess.mu.Unlock() as.mu.Unlock() @@ -2506,7 +2600,7 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, []byte(fwcsubject), []byte(fwcsid), isReserved, f.qos, // FWC (top-level wildcard) subject _EMPTY_, mqttDeliverMsgCbQoS0, // no jsDur for QOS0 processShadowSubs, - rms, trace, as) // rms is empty if not fromSubProto + serializeRMS, rms, trace, as) sess.mu.Unlock() as.mu.Unlock() if err != nil { @@ -2532,6 +2626,7 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, subs = append(subs, sub) addJSConsToSess(sid, jscons) } + if fromSubProto { err = sess.update(filters, true) } @@ -2548,20 +2643,21 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, // Runs from the client's readLoop. // Account session manager lock held on entry. // Session lock held on entry. -func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(rms map[string]*mqttRetainedMsg, sess *mqttSession, c *client, sub *subscription, trace bool) { +func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(rms map[string]*mqttRetainedMsg, sess *mqttSession, c *client, sub *subscription, trace bool) error { if len(as.retmsgs) == 0 || len(rms) == 0 { - return + return nil } result := as.sl.ReverseMatch(string(sub.subject)) if len(result.psubs) == 0 { - return + return nil } + toTrace := []mqttPublish{} for _, psub := range result.psubs { rm := rms[string(psub.subject)] if rm == nil { - // This should not happen since we pre-load messages into the cache - // before calling serialize. 
+ // This should not happen since we pre-load messages into rms before + // calling serialize. continue } var pi uint16 @@ -2591,65 +2687,90 @@ func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(rms map[string] sub.mqtt.prm = append(sub.mqtt.prm, headerBytes, rm.Msg) c.mu.Unlock() if trace { - pp := mqttPublish{ + toTrace = append(toTrace, mqttPublish{ topic: []byte(rm.Topic), flags: flags, pi: pi, sz: len(rm.Msg), - } - c.traceOutOp("PUBLISH", []byte(mqttPubTrace(&pp))) + }) } } + for _, pp := range toTrace { + c.traceOutOp("PUBLISH", []byte(mqttPubTrace(&pp))) + } + return nil } -// Returns in the provided slice all publish retained message records that +// Appends the stored message subjects for all retained message records that // match the given subscription's `subject` (which could have wildcards). // // Account session manager NOT lock held on entry. -func (as *mqttAccountSessionManager) loadRetainedMessagesForSubject(rms map[string]*mqttRetainedMsg, topSubject []byte, log Logger) { +func (as *mqttAccountSessionManager) addRetainedSubjectsForSubject(list map[string]struct{}, topSubject string) bool { as.mu.RLock() if len(as.retmsgs) == 0 { as.mu.RUnlock() - return + return false } - result := as.sl.ReverseMatch(string(topSubject)) + result := as.sl.ReverseMatch(topSubject) as.mu.RUnlock() - if len(result.psubs) == 0 { - return - } + added := false for _, sub := range result.psubs { subject := string(sub.subject) - if rms[subject] != nil { - continue // already loaded - } - - // See if we have the retained message in the cache. - if rmv, _ := as.rmsCache.Load(subject); rmv != nil { - rm := rmv.(mqttRetainedMsg) - rms[subject] = &rm + if _, ok := list[subject]; ok { continue } + list[subject] = struct{}{} + added = true + } - // Load the retained message from the stream, and cache it for reuse in - // the near future. 
- loadSubject := mqttRetainedMsgsStreamSubject + subject - jsm, err := as.jsa.loadLastMsgFor(mqttRetainedMsgsStreamName, loadSubject) - if err != nil || jsm == nil { - log.Warnf("failed to load retained message for subject %q: %v", loadSubject, err) + return added +} + +type warner interface { + Warnf(format string, v ...any) +} + +// Loads a list of retained messages given a list of stored message subjects. +func (as *mqttAccountSessionManager) loadRetainedMessages(subjects map[string]struct{}, w warner) map[string]*mqttRetainedMsg { + rms := make(map[string]*mqttRetainedMsg, len(subjects)) + ss := []string{} + for s := range subjects { + if rm := as.getCachedRetainedMsg(s); rm != nil { + rms[s] = rm + } else { + ss = append(ss, mqttRetainedMsgsStreamSubject+s) + } + } + + if len(ss) == 0 { + return rms + } + + results, err := as.jsa.loadLastMsgForMulti(mqttRetainedMsgsStreamName, ss) + // If an error occurred, warn, but then proceed with what we got. + if err != nil { + w.Warnf("error loading retained messages: %v", err) + } + for i, result := range results { + if result == nil { + continue // skip requests that timed out + } + if result.ToError() != nil { + w.Warnf("failed to load retained message for subject %q: %v", ss[i], err) continue } var rm mqttRetainedMsg - if err := json.Unmarshal(jsm.Data, &rm); err != nil { - log.Warnf("failed to decode retained message for subject %q: %v", loadSubject, err) + if err := json.Unmarshal(result.Message.Data, &rm); err != nil { + w.Warnf("failed to decode retained message for subject %q: %v", ss[i], err) continue } - - // Add the loaded retained message to the cache. - rm.expiresFromCache = time.Now().Add(mqttRetainedCacheTTL) - as.rmsCache.Store(subject, rm) - rms[subject] = &rm + // Add the loaded retained message to the cache, and to the results map. 
+ key := ss[i][len(mqttRetainedMsgsStreamSubject):] + as.setCachedRetainedMsg(key, &rm, false, false) + rms[key] = &rm } + return rms } // Creates the session stream (limit msgs of 1) for this client ID if it does @@ -2833,6 +2954,32 @@ func (as *mqttAccountSessionManager) transferRetainedToPerKeySubjectStream(log * return nil } +func (as *mqttAccountSessionManager) getCachedRetainedMsg(subject string) *mqttRetainedMsg { + v, ok := as.rmsCache.Load(subject) + if !ok { + return nil + } + rm := v.(*mqttRetainedMsg) + if rm.expiresFromCache.Before(time.Now()) { + as.rmsCache.Delete(subject) + return nil + } + return rm +} + +func (as *mqttAccountSessionManager) setCachedRetainedMsg(subject string, rm *mqttRetainedMsg, onlyReplace bool, copyBytesToCache bool) { + rm.expiresFromCache = time.Now().Add(mqttRetainedCacheTTL) + if onlyReplace { + if _, ok := as.rmsCache.Load(subject); !ok { + return + } + } + if copyBytesToCache { + rm.Msg = copyBytes(rm.Msg) + } + as.rmsCache.Store(subject, rm) +} + ////////////////////////////////////////////////////////////////////////////// // // MQTT session related functions @@ -4026,7 +4173,7 @@ func (c *client) mqttHandlePubRetain() { sseq: smr.Sequence, } // Add/update the map - asm.handleRetainedMsg(key, rf, rm) + asm.handleRetainedMsg(key, rf, rm, true) // will copy the payload bytes if needs to update rmsCache } else { c.mu.Lock() acc := c.acc @@ -4893,9 +5040,8 @@ func (sess *mqttSession) processJSConsumer(c *client, subject, sid string, // for the JS durable's deliver subject. 
sess.mu.Lock() sess.tmaxack = tmaxack - sub, err := sess.processSub(c, []byte(inbox), []byte(inbox), - isMQTTReservedSubscription(subject), qos, cc.Durable, mqttDeliverMsgCbQoS12, - false, nil, false, nil) // no shadow subs, no retained message delivery + sub, err := sess.processQOS12Sub(c, []byte(inbox), []byte(inbox), + isMQTTReservedSubscription(subject), qos, cc.Durable, mqttDeliverMsgCbQoS12) sess.mu.Unlock() if err != nil { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/opts.go b/vendor/github.com/nats-io/nats-server/v2/server/opts.go index ff46afb0db..8925ef6df1 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/opts.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/opts.go @@ -147,6 +147,7 @@ type LeafNodeOpts struct { Port int `json:"port,omitempty"` Username string `json:"-"` Password string `json:"-"` + Nkey string `json:"-"` Account string `json:"-"` Users []*User `json:"-"` AuthTimeout float64 `json:"auth_timeout,omitempty"` @@ -192,6 +193,7 @@ type RemoteLeafOpts struct { NoRandomize bool `json:"-"` URLs []*url.URL `json:"urls,omitempty"` Credentials string `json:"-"` + Nkey string `json:"-"` SignatureCB SignatureHandler `json:"-"` TLS bool `json:"-"` TLSConfig *tls.Config `json:"-"` @@ -638,6 +640,7 @@ type authorization struct { user string pass string token string + nkey string acc string // Multiple Nkeys/Users nkeys []*NkeyUser @@ -669,6 +672,13 @@ type TLSConfigOpts struct { CertMatchBy certstore.MatchByType CertMatch string OCSPPeerConfig *certidp.OCSPPeerConfig + Certificates []*TLSCertPairOpt +} + +// TLSCertPairOpt are the paths to a certificate and private key. +type TLSCertPairOpt struct { + CertFile string + KeyFile string } // OCSPConfig represents the options of OCSP stapling options. 
@@ -2259,6 +2269,7 @@ func parseLeafNodes(v interface{}, opts *Options, errors *[]error, warnings *[]e opts.LeafNode.AuthTimeout = auth.timeout opts.LeafNode.Account = auth.acc opts.LeafNode.Users = auth.users + opts.LeafNode.Nkey = auth.nkey // Validate user info config for leafnode authorization if err := validateLeafNodeAuthOptions(opts); err != nil { *errors = append(*errors, &configErr{tk, err.Error()}) @@ -2344,6 +2355,12 @@ func parseLeafAuthorization(v interface{}, errors *[]error, warnings *[]error) ( auth.user = mv.(string) case "pass", "password": auth.pass = mv.(string) + case "nkey": + nk := mv.(string) + if !nkeys.IsValidPublicUserKey(nk) { + *errors = append(*errors, &configErr{tk, "Not a valid public nkey for leafnode authorization"}) + } + auth.nkey = nk case "timeout": at := float64(1) switch mv := mv.(type) { @@ -2489,7 +2506,24 @@ func parseRemoteLeafNodes(v interface{}, errors *[]error, warnings *[]error) ([] *errors = append(*errors, &configErr{tk, err.Error()}) continue } + // Can't have both creds and nkey + if remote.Nkey != _EMPTY_ { + *errors = append(*errors, &configErr{tk, "Remote leafnode can not have both creds and nkey defined"}) + continue + } remote.Credentials = p + case "nkey", "seed": + nk := v.(string) + if pb, _, err := nkeys.DecodeSeed([]byte(nk)); err != nil || pb != nkeys.PrefixByteUser { + err := &configErr{tk, fmt.Sprintf("Remote leafnode nkey is not a valid seed: %q", v)} + *errors = append(*errors, err) + continue + } + if remote.Credentials != _EMPTY_ { + *errors = append(*errors, &configErr{tk, "Remote leafnode can not have both creds and nkey defined"}) + continue + } + remote.Nkey = nk case "tls": tc, err := parseTLS(tk, true) if err != nil { @@ -4180,7 +4214,7 @@ func parseTLS(v interface{}, isClientCtx bool) (t *TLSConfigOpts, retErr error) ) defer convertPanicToError(&lt, &retErr) - _, v = unwrapValue(v, &lt) + tk, v := unwrapValue(v, &lt) tlsm = v.(map[string]interface{}) for mk, mv := range tlsm { tk, mv := 
unwrapValue(mv, &lt) @@ -4381,10 +4415,46 @@ func parseTLS(v interface{}, isClientCtx bool) (t *TLSConfigOpts, retErr error) default: return nil, &configErr{tk, fmt.Sprintf("error parsing ocsp peer config: unsupported type %T", v)} } + case "certs", "certificates": + certs, ok := mv.([]interface{}) + if !ok { + return nil, &configErr{tk, fmt.Sprintf("error parsing certificates config: unsupported type %T", v)} + } + tc.Certificates = make([]*TLSCertPairOpt, len(certs)) + for i, v := range certs { + tk, vv := unwrapValue(v, &lt) + pair, ok := vv.(map[string]interface{}) + if !ok { + return nil, &configErr{tk, fmt.Sprintf("error parsing certificates config: unsupported type %T", vv)} + } + certPair := &TLSCertPairOpt{} + for k, v := range pair { + tk, vv = unwrapValue(v, &lt) + file, ok := vv.(string) + if !ok { + return nil, &configErr{tk, fmt.Sprintf("error parsing certificates config: unsupported type %T", vv)} + } + switch k { + case "cert_file": + certPair.CertFile = file + case "key_file": + certPair.KeyFile = file + default: + return nil, &configErr{tk, fmt.Sprintf("error parsing tls certs config, unknown field %q", k)} + } + } + if certPair.CertFile == _EMPTY_ || certPair.KeyFile == _EMPTY_ { + return nil, &configErr{tk, "error parsing certificates config: both 'cert_file' and 'cert_key' options are required"} + } + tc.Certificates[i] = certPair + } default: - return nil, &configErr{tk, fmt.Sprintf("error parsing tls config, unknown field [%q]", mk)} + return nil, &configErr{tk, fmt.Sprintf("error parsing tls config, unknown field %q", mk)} } } + if len(tc.Certificates) > 0 && tc.CertFile != _EMPTY_ { + return nil, &configErr{tk, "error parsing tls config, cannot combine 'cert_file' option with 'certs' option"} + } // If cipher suites were not specified then use the defaults if tc.Ciphers == nil { @@ -4696,6 +4766,20 @@ func GenTLSConfig(tc *TLSConfigOpts) (*tls.Config, error) { if err != nil { return nil, err } + case tc.Certificates != nil: + // Multiple 
certificate support. + config.Certificates = make([]tls.Certificate, len(tc.Certificates)) + for i, certPair := range tc.Certificates { + cert, err := tls.LoadX509KeyPair(certPair.CertFile, certPair.KeyFile) + if err != nil { + return nil, fmt.Errorf("error parsing X509 certificate/key pair %d/%d: %v", i+1, len(tc.Certificates), err) + } + cert.Leaf, err = x509.ParseCertificate(cert.Certificate[0]) + if err != nil { + return nil, fmt.Errorf("error parsing certificate %d/%d: %v", i+1, len(tc.Certificates), err) + } + config.Certificates[i] = cert + } } // Require client certificates as needed diff --git a/vendor/github.com/nats-io/nats-server/v2/server/raft.go b/vendor/github.com/nats-io/nats-server/v2/server/raft.go index 83ee6f04f7..6990812c47 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/raft.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/raft.go @@ -25,11 +25,14 @@ import ( "net" "os" "path/filepath" + "runtime" "strings" "sync" "sync/atomic" "time" + "github.com/nats-io/nats-server/v2/internal/fastrand" + "github.com/minio/highwayhash" ) @@ -201,8 +204,6 @@ type raft struct { stepdown *ipQueue[string] // Stepdown requests leadc chan bool // Leader changes quit chan struct{} // Raft group shutdown - - prand *rand.Rand // Random generator, used to generate inboxes for instance } // cacthupState structure that holds our subscription, and catchup term and index @@ -348,9 +349,9 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe sq := s.sys.sq sacc := s.sys.account hash := s.sys.shash - pub := s.info.ID s.mu.RUnlock() + // Do this here to process error quicker. 
ps, err := readPeerState(cfg.Store) if err != nil { return nil, err @@ -360,12 +361,6 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe } qpfx := fmt.Sprintf("[ACC:%s] RAFT '%s' ", accName, cfg.Name) - rsrc := time.Now().UnixNano() - if len(pub) >= 32 { - if h, _ := highwayhash.New64([]byte(pub[:32])); h != nil { - rsrc += int64(h.Sum64()) - } - } n := &raft{ created: time.Now(), id: hash[:idLen], @@ -397,7 +392,6 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe leadc: make(chan bool, 1), observer: cfg.Observer, extSt: ps.domainExt, - prand: rand.New(rand.NewSource(rsrc)), } n.c.registerWithAccount(sacc) @@ -430,13 +424,19 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe n.setupLastSnapshot() } + truncateAndErr := func(index uint64) { + if err := n.wal.Truncate(index); err != nil { + n.setWriteErr(err) + } + } + // Retrieve the stream state from the WAL. If there are pending append // entries that were committed but not applied before we last shut down, // we will try to replay them and process them here. var state StreamState n.wal.FastState(&state) if state.Msgs > 0 { - // TODO(dlc) - Recover our state here. + n.debug("Replaying state of %d entries", state.Msgs) if first, err := n.loadFirstEntry(); err == nil { n.pterm, n.pindex = first.pterm, first.pindex if first.commit > 0 && first.commit > n.commit { @@ -444,31 +444,36 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe } } + // This process will queue up entries on our applied queue but prior to the upper + // state machine running. So we will monitor how much we have queued and if we + // reach a limit will pause the apply queue and resume inside of run() go routine. + const maxQsz = 32 * 1024 * 1024 // 32MB max + // It looks like there are entries we have committed but not applied // yet. Replay them. 
- for index := state.FirstSeq; index <= state.LastSeq; index++ { + for index, qsz := state.FirstSeq, 0; index <= state.LastSeq; index++ { ae, err := n.loadEntry(index) if err != nil { n.warn("Could not load %d from WAL [%+v]: %v", index, state, err) - if err := n.wal.Truncate(index); err != nil { - n.setWriteErrLocked(err) - } + truncateAndErr(index) break } if ae.pindex != index-1 { n.warn("Corrupt WAL, will truncate") - if err := n.wal.Truncate(index); err != nil { - n.setWriteErrLocked(err) - } + truncateAndErr(index) break } n.processAppendEntry(ae, nil) + // Check how much we have queued up so far to determine if we should pause. + for _, e := range ae.entries { + qsz += len(e.Data) + if qsz > maxQsz && !n.paused { + n.PauseApply() + } + } } } - // Send nil entry to signal the upper layers we are done doing replay/restore. - n.apply.push(nil) - // Make sure to track ourselves. n.peers[n.id] = &lps{time.Now().UnixNano(), 0, true} @@ -510,8 +515,9 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe labels["group"] = n.group s.registerRaftNode(n.group, n) - // Start the goroutines for the Raft state machine and the file writer. + // Start the run goroutine for the Raft state machine. s.startGoRoutine(n.run, labels) + // Start the filewriter. s.startGoRoutine(n.fileWriter) return n, nil @@ -887,8 +893,20 @@ func (n *raft) ResumeApply() { n.debug("Resuming %d replays", n.hcommit+1-n.commit) for index := n.commit + 1; index <= n.hcommit; index++ { if err := n.applyCommit(index); err != nil { + n.warn("Got error on apply commit during replay: %v", err) break } + // We want to unlock here to allow the upper layers to call Applied() without blocking. + n.Unlock() + // Give hint to let other Go routines run. + // Might not be necessary but seems to make it more fine grained interleaving. + runtime.Gosched() + // Simply re-acquire + n.Lock() + // Need to check if we got closed or if we were paused again. 
+ if n.State() == Closed || n.paused { + return + } } } n.hcommit = 0 @@ -1034,7 +1052,7 @@ func (n *raft) InstallSnapshot(data []byte) error { sn := fmt.Sprintf(snapFileT, snap.lastTerm, snap.lastIndex) sfile := filepath.Join(snapDir, sn) - if err := os.WriteFile(sfile, n.encodeSnapshot(snap), 0640); err != nil { + if err := os.WriteFile(sfile, n.encodeSnapshot(snap), defaultFilePerms); err != nil { n.Unlock() // We could set write err here, but if this is a temporary situation, too many open files etc. // we want to retry and snapshots are not fatal. @@ -1659,7 +1677,7 @@ const ( // Lock should be held (due to use of random generator) func (n *raft) newCatchupInbox() string { var b [replySuffixLen]byte - rn := n.prand.Int63() + rn := fastrand.Uint64() for i, l := 0, rn; i < len(b); i++ { b[i] = digits[l%base] l /= base @@ -1669,7 +1687,7 @@ func (n *raft) newCatchupInbox() string { func (n *raft) newInbox() string { var b [replySuffixLen]byte - rn := n.prand.Int63() + rn := fastrand.Uint64() for i, l := 0, rn; i < len(b); i++ { b[i] = digits[l%base] l /= base @@ -1765,14 +1783,14 @@ func (n *raft) run() { // at least a route, leaf or gateway connection to be established before // starting the run loop. for gw := s.gateway; ; { - s.mu.Lock() - ready := s.numRemotes()+len(s.leafs) > 0 - if !ready && gw.enabled { + s.mu.RLock() + ready, gwEnabled := s.numRemotes()+len(s.leafs) > 0, gw.enabled + s.mu.RUnlock() + if !ready && gwEnabled { gw.RLock() ready = len(gw.out)+len(gw.in) > 0 gw.RUnlock() } - s.mu.Unlock() if !ready { select { case <-s.quitCh: @@ -1785,6 +1803,13 @@ func (n *raft) run() { } } + // We may have paused adding entries to apply queue, resume here. + // No-op if not paused. + n.ResumeApply() + + // Send nil entry to signal the upper layers we are done doing replay/restore. 
+ n.apply.push(nil) + for s.isRunning() { switch n.State() { case Follower: @@ -3323,7 +3348,7 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { // Here we can become a leader but need to wait for resume of the apply queue. n.lxfer = true } - } else { + } else if n.vote != noVote { // Since we are here we are not the chosen one but we should clear any vote preference. n.vote = noVote n.writeTermVote() @@ -3616,7 +3641,7 @@ func (vr *voteRequest) encode() []byte { return buf[:voteRequestLen] } -func (n *raft) decodeVoteRequest(msg []byte, reply string) *voteRequest { +func decodeVoteRequest(msg []byte, reply string) *voteRequest { if len(msg) != voteRequestLen { return nil } @@ -3653,7 +3678,7 @@ func writePeerState(sd string, ps *peerState) error { if _, err := os.Stat(psf); err != nil && !os.IsNotExist(err) { return err } - if err := os.WriteFile(psf, encodePeerState(ps), 0640); err != nil { + if err := os.WriteFile(psf, encodePeerState(ps), defaultFilePerms); err != nil { return err } return nil @@ -3753,7 +3778,7 @@ func (n *raft) fileWriter() { copy(buf[0:], n.wtv) n.RUnlock() <-dios - err := os.WriteFile(tvf, buf[:], 0640) + err := os.WriteFile(tvf, buf[:], defaultFilePerms) dios <- struct{}{} if err != nil && !n.isClosed() { n.setWriteErr(err) @@ -3765,7 +3790,7 @@ func (n *raft) fileWriter() { buf := copyBytes(n.wps) n.RUnlock() <-dios - err := os.WriteFile(psf, buf, 0640) + err := os.WriteFile(psf, buf, defaultFilePerms) dios <- struct{}{} if err != nil && !n.isClosed() { n.setWriteErr(err) @@ -3818,7 +3843,7 @@ func (vr *voteResponse) encode() []byte { return buf[:voteResponseLen] } -func (n *raft) decodeVoteResponse(msg []byte) *voteResponse { +func decodeVoteResponse(msg []byte) *voteResponse { if len(msg) != voteResponseLen { return nil } @@ -3829,7 +3854,7 @@ func (n *raft) decodeVoteResponse(msg []byte) *voteResponse { } func (n *raft) handleVoteResponse(sub *subscription, c *client, _ *Account, _, reply string, msg []byte) { 
- vr := n.decodeVoteResponse(msg) + vr := decodeVoteResponse(msg) n.debug("Received a voteResponse %+v", vr) if vr == nil { n.error("Received malformed vote response for %q", n.group) @@ -3903,7 +3928,7 @@ func (n *raft) processVoteRequest(vr *voteRequest) error { } func (n *raft) handleVoteRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { - vr := n.decodeVoteRequest(msg, reply) + vr := decodeVoteRequest(msg, reply) if vr == nil { n.error("Received malformed vote request for %q", n.group) return diff --git a/vendor/github.com/nats-io/nats-server/v2/server/reload.go b/vendor/github.com/nats-io/nats-server/v2/server/reload.go index 4e1c2f71b5..065b9ec85e 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/reload.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/reload.go @@ -834,7 +834,7 @@ type profBlockRateReload struct { func (o *profBlockRateReload) Apply(s *Server) { s.setBlockProfileRate(o.newValue) - s.Noticef("Reloaded: block_prof_rate = %v", o.newValue) + s.Noticef("Reloaded: prof_block_rate = %v", o.newValue) } type leafNodeOption struct { @@ -1703,7 +1703,6 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { reloadClientTrcLvl = false reloadJetstream = false jsEnabled = false - reloadTLS = false isStatszChange = false co *clusterOption ) @@ -1718,9 +1717,6 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { if opt.IsAuthChange() { reloadAuth = true } - if opt.IsTLSChange() { - reloadTLS = true - } if opt.IsClusterPoolSizeOrAccountsChange() { co = opt.(*clusterOption) } @@ -1778,13 +1774,9 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { s.updateRemoteLeafNodesTLSConfig(newOpts) } - // This will fire if TLS enabled at root (NATS listener) -or- if ocsp or ocsp_cache - // appear in the config. - if reloadTLS { - // Restart OCSP monitoring. 
- if err := s.reloadOCSP(); err != nil { - s.Warnf("Can't restart OCSP features: %v", err) - } + // Always restart OCSP monitoring on reload. + if err := s.reloadOCSP(); err != nil { + s.Warnf("Can't restart OCSP features: %v", err) } s.Noticef("Reloaded server configuration") diff --git a/vendor/github.com/nats-io/nats-server/v2/server/server.go b/vendor/github.com/nats-io/nats-server/v2/server/server.go index b113890305..75d6f39395 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/server.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/server.go @@ -1,4 +1,4 @@ -// Copyright 2012-2022 The NATS Authors +// Copyright 2012-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -1117,9 +1117,14 @@ func (s *Server) configureAccounts(reloading bool) (map[string]struct{}, error) create = false } } + // Track old mappings if global account. + var oldGMappings []*mapping if create { if acc.Name == globalAccountName { a = s.gacc + a.mu.Lock() + oldGMappings = append(oldGMappings, a.mappings...) + a.mu.Unlock() } else { a = NewAccount(acc.Name) } @@ -1130,9 +1135,34 @@ func (s *Server) configureAccounts(reloading bool) (map[string]struct{}, error) // Will be a no-op in case of the global account since it is already registered. s.registerAccountNoLock(a) } + // The `acc` account is stored in options, not in the server, and these can be cleared. acc.sl, acc.clients, acc.mappings = nil, nil, nil + // Check here if we have been reloaded and we have a global account with mappings that may have changed. + // If we have leafnodes they need to be updated. + if reloading && a == s.gacc { + a.mu.Lock() + var mappings []*mapping + if len(a.mappings) > 0 && a.nleafs > 0 { + mappings = append(mappings, a.mappings...) 
+ } + a.mu.Unlock() + if len(mappings) > 0 || len(oldGMappings) > 0 { + a.lmu.RLock() + for _, lc := range a.lleafs { + for _, em := range mappings { + lc.forceAddToSmap(em.src) + } + // Remove any old ones if needed. + for _, em := range oldGMappings { + lc.forceRemoveFromSmap(em.src) + } + } + a.lmu.RUnlock() + } + } + // If we see an account defined using $SYS we will make sure that is set as system account. if acc.Name == DEFAULT_SYSTEM_ACCOUNT && opts.SystemAccount == _EMPTY_ { opts.SystemAccount = DEFAULT_SYSTEM_ACCOUNT diff --git a/vendor/github.com/nats-io/nats-server/v2/server/store.go b/vendor/github.com/nats-io/nats-server/v2/server/store.go index 7b433c7f53..5c486b3e58 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/store.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/store.go @@ -1,4 +1,4 @@ -// Copyright 2019-2023 The NATS Authors +// Copyright 2019-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -15,7 +15,6 @@ package server import ( "encoding/binary" - "encoding/json" "errors" "fmt" "io" @@ -86,6 +85,7 @@ type StreamStore interface { StoreMsg(subject string, hdr, msg []byte) (uint64, int64, error) StoreRawMsg(subject string, hdr, msg []byte, seq uint64, ts int64) error SkipMsg() uint64 + SkipMsgs(seq uint64, num uint64) error LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) LoadNextMsg(filter string, wc bool, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) LoadLastMsg(subject string, sm *StoreMsg) (*StoreMsg, error) @@ -442,14 +442,16 @@ type TemplateStore interface { Delete(*streamTemplate) error } -func jsonString(s string) string { - return "\"" + s + "\"" -} - const ( - limitsPolicyString = "limits" - interestPolicyString = "interest" - workQueuePolicyString = "workqueue" + limitsPolicyJSONString = `"limits"` + interestPolicyJSONString = `"interest"` + workQueuePolicyJSONString = `"workqueue"` +) + +var ( + limitsPolicyJSONBytes = []byte(limitsPolicyJSONString) + interestPolicyJSONBytes = []byte(interestPolicyJSONString) + workQueuePolicyJSONBytes = []byte(workQueuePolicyJSONString) ) func (rp RetentionPolicy) String() string { @@ -468,11 +470,11 @@ func (rp RetentionPolicy) String() string { func (rp RetentionPolicy) MarshalJSON() ([]byte, error) { switch rp { case LimitsPolicy: - return json.Marshal(limitsPolicyString) + return limitsPolicyJSONBytes, nil case InterestPolicy: - return json.Marshal(interestPolicyString) + return interestPolicyJSONBytes, nil case WorkQueuePolicy: - return json.Marshal(workQueuePolicyString) + return workQueuePolicyJSONBytes, nil default: return nil, fmt.Errorf("can not marshal %v", rp) } @@ -480,11 +482,11 @@ func (rp RetentionPolicy) MarshalJSON() ([]byte, error) { func (rp *RetentionPolicy) UnmarshalJSON(data []byte) error { switch string(data) { - case jsonString(limitsPolicyString): + case limitsPolicyJSONString: *rp = LimitsPolicy - case 
jsonString(interestPolicyString): + case interestPolicyJSONString: *rp = InterestPolicy - case jsonString(workQueuePolicyString): + case workQueuePolicyJSONString: *rp = WorkQueuePolicy default: return fmt.Errorf("can not unmarshal %q", data) @@ -506,9 +508,9 @@ func (dp DiscardPolicy) String() string { func (dp DiscardPolicy) MarshalJSON() ([]byte, error) { switch dp { case DiscardOld: - return json.Marshal("old") + return []byte(`"old"`), nil case DiscardNew: - return json.Marshal("new") + return []byte(`"new"`), nil default: return nil, fmt.Errorf("can not marshal %v", dp) } @@ -516,9 +518,9 @@ func (dp DiscardPolicy) MarshalJSON() ([]byte, error) { func (dp *DiscardPolicy) UnmarshalJSON(data []byte) error { switch strings.ToLower(string(data)) { - case jsonString("old"): + case `"old"`: *dp = DiscardOld - case jsonString("new"): + case `"new"`: *dp = DiscardNew default: return fmt.Errorf("can not unmarshal %q", data) @@ -527,9 +529,15 @@ func (dp *DiscardPolicy) UnmarshalJSON(data []byte) error { } const ( - memoryStorageString = "memory" - fileStorageString = "file" - anyStorageString = "any" + memoryStorageJSONString = `"memory"` + fileStorageJSONString = `"file"` + anyStorageJSONString = `"any"` +) + +var ( + memoryStorageJSONBytes = []byte(memoryStorageJSONString) + fileStorageJSONBytes = []byte(fileStorageJSONString) + anyStorageJSONBytes = []byte(anyStorageJSONString) ) func (st StorageType) String() string { @@ -548,11 +556,11 @@ func (st StorageType) String() string { func (st StorageType) MarshalJSON() ([]byte, error) { switch st { case MemoryStorage: - return json.Marshal(memoryStorageString) + return memoryStorageJSONBytes, nil case FileStorage: - return json.Marshal(fileStorageString) + return fileStorageJSONBytes, nil case AnyStorage: - return json.Marshal(anyStorageString) + return anyStorageJSONBytes, nil default: return nil, fmt.Errorf("can not marshal %v", st) } @@ -560,11 +568,11 @@ func (st StorageType) MarshalJSON() ([]byte, error) { func 
(st *StorageType) UnmarshalJSON(data []byte) error { switch string(data) { - case jsonString(memoryStorageString): + case memoryStorageJSONString: *st = MemoryStorage - case jsonString(fileStorageString): + case fileStorageJSONString: *st = FileStorage - case jsonString(anyStorageString): + case anyStorageJSONString: *st = AnyStorage default: return fmt.Errorf("can not unmarshal %q", data) @@ -573,19 +581,25 @@ func (st *StorageType) UnmarshalJSON(data []byte) error { } const ( - ackNonePolicyString = "none" - ackAllPolicyString = "all" - ackExplicitPolicyString = "explicit" + ackNonePolicyJSONString = `"none"` + ackAllPolicyJSONString = `"all"` + ackExplicitPolicyJSONString = `"explicit"` +) + +var ( + ackNonePolicyJSONBytes = []byte(ackNonePolicyJSONString) + ackAllPolicyJSONBytes = []byte(ackAllPolicyJSONString) + ackExplicitPolicyJSONBytes = []byte(ackExplicitPolicyJSONString) ) func (ap AckPolicy) MarshalJSON() ([]byte, error) { switch ap { case AckNone: - return json.Marshal(ackNonePolicyString) + return ackNonePolicyJSONBytes, nil case AckAll: - return json.Marshal(ackAllPolicyString) + return ackAllPolicyJSONBytes, nil case AckExplicit: - return json.Marshal(ackExplicitPolicyString) + return ackExplicitPolicyJSONBytes, nil default: return nil, fmt.Errorf("can not marshal %v", ap) } @@ -593,11 +607,11 @@ func (ap AckPolicy) MarshalJSON() ([]byte, error) { func (ap *AckPolicy) UnmarshalJSON(data []byte) error { switch string(data) { - case jsonString(ackNonePolicyString): + case ackNonePolicyJSONString: *ap = AckNone - case jsonString(ackAllPolicyString): + case ackAllPolicyJSONString: *ap = AckAll - case jsonString(ackExplicitPolicyString): + case ackExplicitPolicyJSONString: *ap = AckExplicit default: return fmt.Errorf("can not unmarshal %q", data) @@ -606,16 +620,21 @@ func (ap *AckPolicy) UnmarshalJSON(data []byte) error { } const ( - replayInstantPolicyString = "instant" - replayOriginalPolicyString = "original" + replayInstantPolicyJSONString = 
`"instant"` + replayOriginalPolicyJSONString = `"original"` +) + +var ( + replayInstantPolicyJSONBytes = []byte(replayInstantPolicyJSONString) + replayOriginalPolicyJSONBytes = []byte(replayOriginalPolicyJSONString) ) func (rp ReplayPolicy) MarshalJSON() ([]byte, error) { switch rp { case ReplayInstant: - return json.Marshal(replayInstantPolicyString) + return replayInstantPolicyJSONBytes, nil case ReplayOriginal: - return json.Marshal(replayOriginalPolicyString) + return replayOriginalPolicyJSONBytes, nil default: return nil, fmt.Errorf("can not marshal %v", rp) } @@ -623,9 +642,9 @@ func (rp ReplayPolicy) MarshalJSON() ([]byte, error) { func (rp *ReplayPolicy) UnmarshalJSON(data []byte) error { switch string(data) { - case jsonString(replayInstantPolicyString): + case replayInstantPolicyJSONString: *rp = ReplayInstant - case jsonString(replayOriginalPolicyString): + case replayOriginalPolicyJSONString: *rp = ReplayOriginal default: return fmt.Errorf("can not unmarshal %q", data) @@ -634,28 +653,38 @@ func (rp *ReplayPolicy) UnmarshalJSON(data []byte) error { } const ( - deliverAllPolicyString = "all" - deliverLastPolicyString = "last" - deliverNewPolicyString = "new" - deliverByStartSequenceString = "by_start_sequence" - deliverByStartTimeString = "by_start_time" - deliverLastPerPolicyString = "last_per_subject" - deliverUndefinedString = "undefined" + deliverAllPolicyJSONString = `"all"` + deliverLastPolicyJSONString = `"last"` + deliverNewPolicyJSONString = `"new"` + deliverByStartSequenceJSONString = `"by_start_sequence"` + deliverByStartTimeJSONString = `"by_start_time"` + deliverLastPerPolicyJSONString = `"last_per_subject"` + deliverUndefinedJSONString = `"undefined"` +) + +var ( + deliverAllPolicyJSONBytes = []byte(deliverAllPolicyJSONString) + deliverLastPolicyJSONBytes = []byte(deliverLastPolicyJSONString) + deliverNewPolicyJSONBytes = []byte(deliverNewPolicyJSONString) + deliverByStartSequenceJSONBytes = []byte(deliverByStartSequenceJSONString) + 
deliverByStartTimeJSONBytes = []byte(deliverByStartTimeJSONString) + deliverLastPerPolicyJSONBytes = []byte(deliverLastPerPolicyJSONString) + deliverUndefinedJSONBytes = []byte(deliverUndefinedJSONString) ) func (p *DeliverPolicy) UnmarshalJSON(data []byte) error { switch string(data) { - case jsonString(deliverAllPolicyString), jsonString(deliverUndefinedString): + case deliverAllPolicyJSONString, deliverUndefinedJSONString: *p = DeliverAll - case jsonString(deliverLastPolicyString): + case deliverLastPolicyJSONString: *p = DeliverLast - case jsonString(deliverLastPerPolicyString): + case deliverLastPerPolicyJSONString: *p = DeliverLastPerSubject - case jsonString(deliverNewPolicyString): + case deliverNewPolicyJSONString: *p = DeliverNew - case jsonString(deliverByStartSequenceString): + case deliverByStartSequenceJSONString: *p = DeliverByStartSequence - case jsonString(deliverByStartTimeString): + case deliverByStartTimeJSONString: *p = DeliverByStartTime default: return fmt.Errorf("can not unmarshal %q", data) @@ -667,19 +696,19 @@ func (p *DeliverPolicy) UnmarshalJSON(data []byte) error { func (p DeliverPolicy) MarshalJSON() ([]byte, error) { switch p { case DeliverAll: - return json.Marshal(deliverAllPolicyString) + return deliverAllPolicyJSONBytes, nil case DeliverLast: - return json.Marshal(deliverLastPolicyString) + return deliverLastPolicyJSONBytes, nil case DeliverLastPerSubject: - return json.Marshal(deliverLastPerPolicyString) + return deliverLastPerPolicyJSONBytes, nil case DeliverNew: - return json.Marshal(deliverNewPolicyString) + return deliverNewPolicyJSONBytes, nil case DeliverByStartSequence: - return json.Marshal(deliverByStartSequenceString) + return deliverByStartSequenceJSONBytes, nil case DeliverByStartTime: - return json.Marshal(deliverByStartTimeString) + return deliverByStartTimeJSONBytes, nil default: - return json.Marshal(deliverUndefinedString) + return deliverUndefinedJSONBytes, nil } } diff --git 
a/vendor/github.com/nats-io/nats-server/v2/server/stream.go b/vendor/github.com/nats-io/nats-server/v2/server/stream.go index 4bdd4868e5..f032f285b2 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stream.go @@ -1,4 +1,4 @@ -// Copyright 2019-2023 The NATS Authors +// Copyright 2019-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -29,6 +29,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/klauspost/compress/s2" @@ -219,7 +220,7 @@ type stream struct { srv *Server client *client sysc *client - sid int + sid atomic.Uint64 pubAck []byte outq *jsOutQ msgs *ipQueue[*inMsg] @@ -242,7 +243,7 @@ type stream struct { mqch chan struct{} active bool ddloaded bool - closed bool + closed atomic.Bool // Mirror mirror *sourceInfo @@ -276,7 +277,7 @@ type stream struct { // Clustered mode. sa *streamAssignment node RaftNode - catchup bool + catchup atomic.Bool syncSub *subscription infoSub *subscription clMu sync.Mutex @@ -309,6 +310,7 @@ type sourceInfo struct { start time.Time lag uint64 err *ApiError + fails int last time.Time lreq time.Time qch chan struct{} @@ -368,9 +370,7 @@ type ddentry struct { } // Replicas Range -const ( - StreamMaxReplicas = 5 -) +const StreamMaxReplicas = 5 // AddStream adds a stream for the given account. func (a *Account) addStream(config *StreamConfig) (*stream, error) { @@ -1254,6 +1254,7 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account) (StreamConfi } return exists, cfg } + hasStream := func(streamName string) (bool, int32, []string) { exists, cfg := getStream(streamName) return exists, cfg.MaxMsgSize, cfg.Subjects @@ -1632,13 +1633,7 @@ func (jsa *jsAccount) configUpdateCheck(old, new *StreamConfig, s *Server) (*Str // Save the user configured MaxBytes. 
newMaxBytes := cfg.MaxBytes - maxBytesOffset := int64(0) - if old.MaxBytes > 0 { - if excessRep := cfg.Replicas - old.Replicas; excessRep > 0 { - maxBytesOffset = old.MaxBytes * int64(excessRep) - } - } // We temporarily set cfg.MaxBytes to maxBytesDiff because checkAllLimits // adds cfg.MaxBytes to the current reserved limit and checks if we've gone @@ -1670,7 +1665,11 @@ func (jsa *jsAccount) configUpdateCheck(old, new *StreamConfig, s *Server) (*Str _, reserved = tieredStreamAndReservationCount(js.cluster.streams[acc.Name], tier, &cfg) } // reservation does not account for this stream, hence add the old value - reserved += int64(old.Replicas) * old.MaxBytes + if tier == _EMPTY_ && old.Replicas > 1 { + reserved += old.MaxBytes * int64(old.Replicas) + } else { + reserved += old.MaxBytes + } if err := js.checkAllLimits(&selected, &cfg, reserved, maxBytesOffset); err != nil { return nil, err } @@ -1774,6 +1773,8 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool) // Check for Sources. if len(cfg.Sources) > 0 || len(ocfg.Sources) > 0 { currentIName := make(map[string]struct{}) + needsStartingSeqNum := make(map[string]struct{}) + for _, s := range ocfg.Sources { currentIName[s.iname] = struct{}{} } @@ -1807,18 +1808,25 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool) } mset.sources[s.iname] = si - mset.setStartingSequenceForSource(s.iname, s.External) - mset.setSourceConsumer(s.iname, si.sseq+1, time.Time{}) + needsStartingSeqNum[s.iname] = struct{}{} } else { // source already exists delete(currentIName, s.iname) } } - // What is left in cuurentIName needs to be deleted. + // What is left in currentIName needs to be deleted. 
for iName := range currentIName { mset.cancelSourceConsumer(iName) delete(mset.sources, iName) } + neededCopy := make(map[string]struct{}, len(needsStartingSeqNum)) + for iName := range needsStartingSeqNum { + neededCopy[iName] = struct{}{} + } + mset.setStartingSequenceForSources(needsStartingSeqNum) + for iName := range neededCopy { + mset.setSourceConsumer(iName, mset.sources[iName].sseq+1, time.Time{}) + } } } @@ -1957,9 +1965,9 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool) // Purge will remove all messages from the stream and underlying store based on the request. func (mset *stream) purge(preq *JSApiStreamPurgeRequest) (purged uint64, err error) { mset.mu.RLock() - if mset.client == nil || mset.store == nil { + if mset.closed.Load() { mset.mu.RUnlock() - return 0, errors.New("invalid stream") + return 0, errStreamClosed } if mset.cfg.Sealed { mset.mu.RUnlock() @@ -2001,11 +2009,10 @@ func (mset *stream) purge(preq *JSApiStreamPurgeRequest) (purged uint64, err err // no subject was specified, we can purge all consumers sequences doPurge := preq == nil || preq.Subject == _EMPTY_ || - // or consumer filter subject is equal to purged subject - preq.Subject == o.cfg.FilterSubject || - // or consumer subject is subset of purged subject, + // consumer filter subject is equal to purged subject + // or consumer filter subject is subset of purged subject, // but not the other way around. - subjectIsSubsetMatch(o.cfg.FilterSubject, preq.Subject) + o.isEqualOrSubsetMatch(preq.Subject) o.mu.RUnlock() if doPurge { o.purge(fseq, lseq) @@ -2025,24 +2032,17 @@ func (mset *stream) removeMsg(seq uint64) (bool, error) { // DeleteMsg will remove a message from a stream. 
func (mset *stream) deleteMsg(seq uint64) (bool, error) { - mset.mu.RLock() - if mset.client == nil { - mset.mu.RUnlock() - return false, fmt.Errorf("invalid stream") + if mset.closed.Load() { + return false, errStreamClosed } - mset.mu.RUnlock() - return mset.store.RemoveMsg(seq) } // EraseMsg will securely remove a message and rewrite the data with random data. func (mset *stream) eraseMsg(seq uint64) (bool, error) { - mset.mu.RLock() - if mset.client == nil { - mset.mu.RUnlock() - return false, fmt.Errorf("invalid stream") + if mset.closed.Load() { + return false, errStreamClosed } - mset.mu.RUnlock() return mset.store.EraseMsg(seq) } @@ -2370,34 +2370,56 @@ func (mset *stream) retryMirrorConsumer() error { // Lock should be held. func (mset *stream) skipMsgs(start, end uint64) { node, store := mset.node, mset.store + // If we are not clustered we can short circuit now with store.SkipMsgs + if node == nil { + store.SkipMsgs(start, end-start+1) + mset.lseq = end + return + } + + // FIXME (dlc) - We should allow proposals of DeleteEange, but would need to make sure all peers support. + // With syncRequest was easy to add bool into request. var entries []*Entry for seq := start; seq <= end; seq++ { - if node != nil { - entries = append(entries, &Entry{EntryNormal, encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq-1, 0)}) - // So a single message does not get too big. - if len(entries) > 10_000 { - node.ProposeDirect(entries) - // We need to re-craete `entries` because there is a reference - // to it in the node's pae map. - entries = entries[:0] - } - } else { - mset.lseq = store.SkipMsg() + entries = append(entries, &Entry{EntryNormal, encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq-1, 0)}) + // So a single message does not get too big. + if len(entries) > 10_000 { + node.ProposeDirect(entries) + // We need to re-create `entries` because there is a reference + // to it in the node's pae map. + entries = entries[:0] } } // Send all at once. 
- if node != nil && len(entries) > 0 { + if len(entries) > 0 { node.ProposeDirect(entries) } } +const ( + // Base retry backoff duration. + retryBackOff = 5 * time.Second + // Maximum amount we will wait. + retryMaximum = 2 * time.Minute +) + +// Calculate our backoff based on number of failures. +func calculateRetryBackoff(fails int) time.Duration { + backoff := time.Duration(retryBackOff) * time.Duration(fails*2) + if backoff > retryMaximum { + backoff = retryMaximum + } + return backoff +} + // This will schedule a call to setupMirrorConsumer, taking into account the last -// time it was retried and determine the soonest setSourceConsumer can be called -// without tripping the sourceConsumerRetryThreshold. +// time it was retried and determine the soonest setupMirrorConsumer can be called +// without tripping the sourceConsumerRetryThreshold. We will also take into account +// number of failures and will back off our retries. // The mset.mirror pointer has been verified to be not nil by the caller. // // Lock held on entry -func (mset *stream) scheduleSetupMirrorConsumerRetryAsap() { +func (mset *stream) scheduleSetupMirrorConsumerRetry() { // We are trying to figure out how soon we can retry. setupMirrorConsumer will reject // a retry if last was done less than "sourceConsumerRetryThreshold" ago. next := sourceConsumerRetryThreshold - time.Since(mset.mirror.lreq) @@ -2405,9 +2427,12 @@ func (mset *stream) scheduleSetupMirrorConsumerRetryAsap() { // It means that we have passed the threshold and so we are ready to go. next = 0 } - // To make *sure* that the next request will not fail, add a bit of buffer - // and some randomness. - next += time.Duration(rand.Intn(int(10*time.Millisecond))) + 10*time.Millisecond + // Take into account failures here. + next += calculateRetryBackoff(mset.mirror.fails) + + // Add some jitter. 
+ next += time.Duration(rand.Intn(int(100*time.Millisecond))) + 100*time.Millisecond + time.AfterFunc(next, func() { mset.mu.Lock() mset.setupMirrorConsumer() @@ -2418,6 +2443,9 @@ func (mset *stream) scheduleSetupMirrorConsumerRetryAsap() { // Setup our mirror consumer. // Lock should be held. func (mset *stream) setupMirrorConsumer() error { + if mset.closed.Load() { + return errStreamClosed + } if mset.outq == nil { return errors.New("outq required") } @@ -2449,7 +2477,7 @@ func (mset *stream) setupMirrorConsumer() error { // We want to throttle here in terms of how fast we request new consumers, // or if the previous is still in progress. if last := time.Since(mirror.lreq); last < sourceConsumerRetryThreshold || mirror.sip { - mset.scheduleSetupMirrorConsumerRetryAsap() + mset.scheduleSetupMirrorConsumerRetry() return nil } mirror.lreq = time.Now() @@ -2506,27 +2534,28 @@ func (mset *stream) setupMirrorConsumer() error { mirror.sf = mset.cfg.Mirror.FilterSubject } - sfs := make([]string, len(mset.cfg.Mirror.SubjectTransforms)) - trs := make([]*subjectTransform, len(mset.cfg.Mirror.SubjectTransforms)) + if lst := len(mset.cfg.Mirror.SubjectTransforms); lst > 0 { + sfs := make([]string, lst) + trs := make([]*subjectTransform, lst) - for i, tr := range mset.cfg.Mirror.SubjectTransforms { - // will not fail as already checked before that the transform will work - subjectTransform, err := NewSubjectTransform(tr.Source, tr.Destination) - if err != nil { - mset.srv.Errorf("Unable to get transform for mirror consumer: %v", err) + for i, tr := range mset.cfg.Mirror.SubjectTransforms { + // will not fail as already checked before that the transform will work + subjectTransform, err := NewSubjectTransform(tr.Source, tr.Destination) + if err != nil { + mset.srv.Errorf("Unable to get transform for mirror consumer: %v", err) + } + sfs[i] = tr.Source + trs[i] = subjectTransform } - - sfs[i] = tr.Source - trs[i] = subjectTransform + mirror.sfs = sfs + mirror.trs = trs + 
req.Config.FilterSubjects = sfs } - mirror.sfs = sfs - mirror.trs = trs - req.Config.FilterSubjects = sfs respCh := make(chan *JSApiConsumerCreateResponse, 1) reply := infoReplySubject() crSub, err := mset.subscribeInternal(reply, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - mset.unsubscribeUnlocked(sub) + mset.unsubscribe(sub) _, msg := c.msgParts(rmsg) var ccr JSApiConsumerCreateResponse @@ -2535,11 +2564,14 @@ func (mset *stream) setupMirrorConsumer() error { mset.setMirrorErr(ApiErrors[JSInvalidJSONErr]) return } - respCh <- &ccr + select { + case respCh <- &ccr: + default: + } }) if err != nil { mirror.err = NewJSMirrorConsumerSetupFailedError(err, Unless(err)) - mset.scheduleSetupMirrorConsumerRetryAsap() + mset.scheduleSetupMirrorConsumerRetry() return nil } @@ -2557,26 +2589,9 @@ func (mset *stream) setupMirrorConsumer() error { subject = strings.ReplaceAll(subject, "..", ".") } - // We need to create the subscription that will receive the messages prior - // to sending the consumer create request, because in some complex topologies - // with gateways and optimistic mode, it is possible that the consumer starts - // delivering messages as soon as the consumer request is received. - qname := fmt.Sprintf("[ACC:%s] stream mirror '%s' of '%s' msgs", mset.acc.Name, mset.cfg.Name, mset.cfg.Mirror.Name) - // Create a new queue each time - mirror.msgs = newIPQueue[*inMsg](mset.srv, qname) - msgs := mirror.msgs - sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. 
- mset.queueInbound(msgs, subject, reply, hdr, msg) - }) - if err != nil { - mirror.err = NewJSMirrorConsumerSetupFailedError(err, Unless(err)) - mset.unsubscribeUnlocked(crSub) - mset.scheduleSetupMirrorConsumerRetryAsap() - return nil - } + // Reset + mirror.msgs = nil mirror.err = nil - mirror.sub = sub mirror.sip = true // Send the consumer create request @@ -2592,7 +2607,13 @@ func (mset *stream) setupMirrorConsumer() error { mset.mirror.sip = false // If we need to retry, schedule now if retry { - mset.scheduleSetupMirrorConsumerRetryAsap() + mset.mirror.fails++ + // Cancel here since we can not do anything with this consumer at this point. + mset.cancelSourceInfo(mset.mirror) + mset.scheduleSetupMirrorConsumerRetry() + } else { + // Clear on success. + mset.mirror.fails = 0 } } mset.mu.Unlock() @@ -2618,7 +2639,26 @@ func (mset *stream) setupMirrorConsumer() error { mirror.err = ccr.Error // Let's retry as soon as possible, but we are gated by sourceConsumerRetryThreshold retry = true + mset.mu.Unlock() + return } else { + // Setup actual subscription to process messages from our source. + qname := fmt.Sprintf("[ACC:%s] stream mirror '%s' of '%s' msgs", mset.acc.Name, mset.cfg.Name, mset.cfg.Mirror.Name) + // Create a new queue each time + mirror.msgs = newIPQueue[*inMsg](mset.srv, qname) + msgs := mirror.msgs + sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { + hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. + mset.queueInbound(msgs, subject, reply, hdr, msg) + }) + if err != nil { + mirror.err = NewJSMirrorConsumerSetupFailedError(err, Unless(err)) + retry = true + mset.mu.Unlock() + return + } + // Save our sub. + mirror.sub = sub // When an upstream stream expires messages or in general has messages that we want // that are no longer available we need to adjust here. 
@@ -2659,7 +2699,7 @@ func (mset *stream) setupMirrorConsumer() error { mset.mu.Unlock() ready.Wait() case <-time.After(5 * time.Second): - mset.unsubscribeUnlocked(crSub) + mset.unsubscribe(crSub) // We already waited 5 seconds, let's retry now. retry = true } @@ -2687,7 +2727,10 @@ func (mset *stream) retrySourceConsumer(iName string) { } var ss = mset.streamSource(iName) if ss != nil { - mset.setStartingSequenceForSource(iName, ss.External) + iNameMap := map[string]struct{}{ + iName: {}, + } + mset.setStartingSequenceForSources(iNameMap) mset.retrySourceConsumerAtSeq(iName, si.sseq+1) } } @@ -2734,8 +2777,10 @@ func (mset *stream) cancelSourceInfo(si *sourceInfo) { close(si.qch) si.qch = nil } - si.msgs.drain() - si.msgs.unregister() + if si.msgs != nil { + si.msgs.drain() + si.msgs.unregister() + } } const sourceConsumerRetryThreshold = 2 * time.Second @@ -2745,7 +2790,7 @@ const sourceConsumerRetryThreshold = 2 * time.Second // without tripping the sourceConsumerRetryThreshold. // // Lock held on entry -func (mset *stream) scheduleSetSourceConsumerRetryAsap(si *sourceInfo, seq uint64, startTime time.Time) { +func (mset *stream) scheduleSetSourceConsumerRetry(si *sourceInfo, seq uint64, startTime time.Time) { // We are trying to figure out how soon we can retry. setSourceConsumer will reject // a retry if last was done less than "sourceConsumerRetryThreshold" ago. next := sourceConsumerRetryThreshold - time.Since(si.lreq) @@ -2753,16 +2798,19 @@ func (mset *stream) scheduleSetSourceConsumerRetryAsap(si *sourceInfo, seq uint6 // It means that we have passed the threshold and so we are ready to go. next = 0 } + // Take into account failures here. + next += calculateRetryBackoff(si.fails) + // To make *sure* that the next request will not fail, add a bit of buffer // and some randomness. 
next += time.Duration(rand.Intn(int(10*time.Millisecond))) + 10*time.Millisecond - mset.scheduleSetSourceConsumerRetry(si.iname, seq, next, startTime) + mset.scheduleSetSourceConsumer(si.iname, seq, next, startTime) } // Simply schedules setSourceConsumer at the given delay. // // Lock held on entry -func (mset *stream) scheduleSetSourceConsumerRetry(iname string, seq uint64, delay time.Duration, startTime time.Time) { +func (mset *stream) scheduleSetSourceConsumer(iname string, seq uint64, delay time.Duration, startTime time.Time) { if mset.sourceRetries == nil { mset.sourceRetries = map[string]*time.Timer{} } @@ -2784,6 +2832,11 @@ func (mset *stream) scheduleSetSourceConsumerRetry(iname string, seq uint64, del // Lock should be held. func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.Time) { + // Ignore if closed. + if mset.closed.Load() { + return + } + si := mset.sources[iname] if si == nil { return @@ -2799,7 +2852,7 @@ func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.T // We want to throttle here in terms of how fast we request new consumers, // or if the previous is still in progress. 
if last := time.Since(si.lreq); last < sourceConsumerRetryThreshold || si.sip { - mset.scheduleSetSourceConsumerRetryAsap(si, seq, startTime) + mset.scheduleSetSourceConsumerRetry(si, seq, startTime) return } si.lreq = time.Now() @@ -2867,18 +2920,21 @@ func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.T respCh := make(chan *JSApiConsumerCreateResponse, 1) reply := infoReplySubject() crSub, err := mset.subscribeInternal(reply, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - mset.unsubscribeUnlocked(sub) + mset.unsubscribe(sub) _, msg := c.msgParts(rmsg) var ccr JSApiConsumerCreateResponse if err := json.Unmarshal(msg, &ccr); err != nil { c.Warnf("JetStream bad source consumer create response: %q", msg) return } - respCh <- &ccr + select { + case respCh <- &ccr: + default: + } }) if err != nil { si.err = NewJSSourceConsumerSetupFailedError(err, Unless(err)) - mset.scheduleSetSourceConsumerRetryAsap(si, seq, startTime) + mset.scheduleSetSourceConsumerRetry(si, seq, startTime) return } @@ -2904,26 +2960,9 @@ func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.T // Marshal request. b, _ := json.Marshal(req) - // We need to create the subscription that will receive the messages prior - // to sending the consumer create request, because in some complex topologies - // with gateways and optimistic mode, it is possible that the consumer starts - // delivering messages as soon as the consumer request is received. - qname := fmt.Sprintf("[ACC:%s] stream source '%s' from '%s' msgs", mset.acc.Name, mset.cfg.Name, si.name) - // Create a new queue each time - si.msgs = newIPQueue[*inMsg](mset.srv, qname) - msgs := si.msgs - sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. 
- mset.queueInbound(msgs, subject, reply, hdr, msg) - }) - if err != nil { - si.err = NewJSSourceConsumerSetupFailedError(err, Unless(err)) - mset.unsubscribeUnlocked(crSub) - mset.scheduleSetSourceConsumerRetryAsap(si, seq, startTime) - return - } + // Reset + si.msgs = nil si.err = nil - si.sub = sub si.sip = true // Send the consumer create request @@ -2939,7 +2978,13 @@ func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.T si.sip = false // If we need to retry, schedule now if retry { - mset.scheduleSetSourceConsumerRetryAsap(si, seq, startTime) + si.fails++ + // Cancel here since we can not do anything with this consumer at this point. + mset.cancelSourceInfo(si) + mset.scheduleSetSourceConsumerRetry(si, seq, startTime) + } else { + // Clear on success. + si.fails = 0 } } mset.mu.Unlock() @@ -2954,7 +2999,7 @@ func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.T ready := sync.WaitGroup{} mset.mu.Lock() // Check that it has not been removed or canceled (si.sub would be nil) - if si := mset.sources[iname]; si != nil && si.sub != nil { + if si := mset.sources[iname]; si != nil { si.err = nil if ccr.Error != nil || ccr.ConsumerInfo == nil { // Note: this warning can happen a few times when starting up the server when sourcing streams are @@ -2964,7 +3009,27 @@ func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.T si.err = ccr.Error // Let's retry as soon as possible, but we are gated by sourceConsumerRetryThreshold retry = true + mset.mu.Unlock() + return } else { + // Setup actual subscription to process messages from our source. 
+ qname := fmt.Sprintf("[ACC:%s] stream source '%s' from '%s' msgs", mset.acc.Name, mset.cfg.Name, si.name) + // Create a new queue each time + si.msgs = newIPQueue[*inMsg](mset.srv, qname) + msgs := si.msgs + sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { + hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. + mset.queueInbound(msgs, subject, reply, hdr, msg) + }) + if err != nil { + si.err = NewJSSourceConsumerSetupFailedError(err, Unless(err)) + retry = true + mset.mu.Unlock() + return + } + // Save our sub. + si.sub = sub + if si.sseq != ccr.ConsumerInfo.Delivered.Stream { si.sseq = ccr.ConsumerInfo.Delivered.Stream + 1 } @@ -2991,7 +3056,7 @@ func (mset *stream) setSourceConsumer(iname string, seq uint64, startTime time.T mset.mu.Unlock() ready.Wait() case <-time.After(5 * time.Second): - mset.unsubscribeUnlocked(crSub) + mset.unsubscribe(crSub) // We already waited 5 seconds, let's retry now. retry = true } @@ -3281,18 +3346,20 @@ func streamAndSeq(shdr string) (string, string, uint64) { } // Lock should be held. -func (mset *stream) setStartingSequenceForSource(iName string, external *ExternalStream) { - si := mset.sources[iName] - if si == nil { - return - } - +func (mset *stream) setStartingSequenceForSources(iNames map[string]struct{}) { var state StreamState mset.store.FastState(&state) // Do not reset sseq here so we can remember when purge/expiration happens. 
if state.Msgs == 0 { - si.dseq = 0 + for iName := range iNames { + si := mset.sources[iName] + if si == nil { + continue + } else { + si.dseq = 0 + } + } return } @@ -3307,10 +3374,26 @@ func (mset *stream) setStartingSequenceForSource(iName string, external *Externa continue } streamName, indexName, sseq := streamAndSeq(string(ss)) - if indexName == si.iname || (indexName == _EMPTY_ && (streamName == si.name || (external != nil && streamName == si.name+":"+getHash(external.ApiPrefix)))) { + + if _, ok := iNames[indexName]; ok { + si := mset.sources[indexName] si.sseq = sseq si.dseq = 0 - return + delete(iNames, indexName) + } else if indexName == _EMPTY_ && streamName != _EMPTY_ { + for iName := range iNames { + // TODO streamSource is a linear walk, to optimize later + if si := mset.sources[iName]; si != nil && streamName == si.name || + (mset.streamSource(iName).External != nil && streamName == si.name+":"+getHash(mset.streamSource(iName).External.ApiPrefix)) { + si.sseq = sseq + si.dseq = 0 + delete(iNames, iName) + break + } + } + } + if len(iNames) == 0 { + break } } } @@ -3489,7 +3572,7 @@ func (mset *stream) subscribeToStream() error { mset.mirror.sfs = sfs mset.mirror.trs = trs // delay the actual mirror consumer creation for after a delay - mset.scheduleSetupMirrorConsumerRetryAsap() + mset.scheduleSetupMirrorConsumerRetry() } else if len(mset.cfg.Sources) > 0 { // Setup the initial source infos for the sources mset.resetSourceInfo() @@ -3620,55 +3703,43 @@ func (mset *stream) unsubscribeToStream(stopping bool) error { return nil } -// Lock should be held. +// Lock does NOT need to be held, we set the client on setup and never change it at this point. 
func (mset *stream) subscribeInternal(subject string, cb msgHandler) (*subscription, error) { - c := mset.client - if c == nil { - return nil, fmt.Errorf("invalid stream") + if mset.closed.Load() { + return nil, errStreamClosed } if cb == nil { - return nil, fmt.Errorf("undefined message handler") + return nil, errInvalidMsgHandler } - - mset.sid++ - + c := mset.client + sid := int(mset.sid.Add(1)) // Now create the subscription - return c.processSub([]byte(subject), nil, []byte(strconv.Itoa(mset.sid)), cb, false) + return c.processSub([]byte(subject), nil, []byte(strconv.Itoa(sid)), cb, false) } -// Helper for unlocked stream. -func (mset *stream) subscribeInternalUnlocked(subject string, cb msgHandler) (*subscription, error) { - mset.mu.Lock() - defer mset.mu.Unlock() - return mset.subscribeInternal(subject, cb) -} - -// Lock should be held. +// Lock does NOT need to be held, we set the client on setup and never change it at this point. func (mset *stream) queueSubscribeInternal(subject, group string, cb msgHandler) (*subscription, error) { - c := mset.client - if c == nil { - return nil, fmt.Errorf("invalid stream") + if mset.closed.Load() { + return nil, errStreamClosed } if cb == nil { - return nil, fmt.Errorf("undefined message handler") + return nil, errInvalidMsgHandler } - - mset.sid++ - + c := mset.client + sid := int(mset.sid.Add(1)) // Now create the subscription - return c.processSub([]byte(subject), []byte(group), []byte(strconv.Itoa(mset.sid)), cb, false) + return c.processSub([]byte(subject), []byte(group), []byte(strconv.Itoa(sid)), cb, false) } // This will unsubscribe us from the exact subject given. // We do not currently track the subs so do not have the sid. // This should be called only on an update. -// Lock should be held. +// Lock does NOT need to be held, we set the client on setup and never change it at this point. 
func (mset *stream) unsubscribeInternal(subject string) error { - c := mset.client - if c == nil { - return fmt.Errorf("invalid stream") + if mset.closed.Load() { + return errStreamClosed } - + c := mset.client var sid []byte c.mu.Lock() for _, sub := range c.subs { @@ -3687,18 +3758,12 @@ func (mset *stream) unsubscribeInternal(subject string) error { // Lock should be held. func (mset *stream) unsubscribe(sub *subscription) { - if sub == nil || mset.client == nil { + if sub == nil || mset.closed.Load() { return } mset.client.processUnsub(sub.sid) } -func (mset *stream) unsubscribeUnlocked(sub *subscription) { - mset.mu.Lock() - mset.unsubscribe(sub) - mset.mu.Unlock() -} - func (mset *stream) setupStore(fsCfg *FileStoreConfig) error { mset.mu.Lock() mset.created = time.Now().UTC() @@ -4078,20 +4143,21 @@ func (mset *stream) processInboundJetStreamMsg(_ *subscription, c *client, _ *Ac } var ( - errLastSeqMismatch = errors.New("last sequence mismatch") - errMsgIdDuplicate = errors.New("msgid is duplicate") - errStreamClosed = errors.New("stream closed") + errLastSeqMismatch = errors.New("last sequence mismatch") + errMsgIdDuplicate = errors.New("msgid is duplicate") + errStreamClosed = errors.New("stream closed") + errInvalidMsgHandler = errors.New("undefined message handler") ) // processJetStreamMsg is where we try to actually process the stream msg. 
func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, lseq uint64, ts int64) error { - mset.mu.Lock() - c, s, store := mset.client, mset.srv, mset.store - if mset.closed || c == nil { - mset.mu.Unlock() + if mset.closed.Load() { return errStreamClosed } + mset.mu.Lock() + s, store := mset.srv, mset.store + // Apply the input subject transform if any if mset.itr != nil { ts, err := mset.itr.Match(subject) @@ -4453,7 +4519,7 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, return err } - if exceeded, apiErr := jsa.limitsExceeded(stype, tierName); exceeded { + if exceeded, apiErr := jsa.limitsExceeded(stype, tierName, mset.cfg.Replicas); exceeded { s.RateLimitWarnf("JetStream resource limits exceeded for account: %q", accName) if canRespond { resp.PubAck = &PubAck{Stream: name} @@ -4911,8 +4977,9 @@ func (mset *stream) stop(deleteFlag, advisory bool) error { jsa.mu.Unlock() // Mark as closed, kick monitor and collect consumers first. + mset.closed.Store(true) + mset.mu.Lock() - mset.closed = true // Signal to the monitor loop. // Can't use qch here. if mset.mqch != nil { @@ -5016,7 +5083,6 @@ func (mset *stream) stop(deleteFlag, advisory bool) error { // Snapshot store. store := mset.store c := mset.client - mset.client = nil // Clustered cleanup. mset.mu.Unlock() @@ -5291,6 +5357,8 @@ func (mset *stream) Store() StreamStore { // Lock should be held. func (mset *stream) partitionUnique(name string, partitions []string) bool { for _, partition := range partitions { + psa := [32]string{} + pts := tokenizeSubjectIntoSlice(psa[:0], partition) for n, o := range mset.consumers { // Skip the consumer being checked. 
if n == name { @@ -5300,8 +5368,8 @@ func (mset *stream) partitionUnique(name string, partitions []string) bool { return false } for _, filter := range o.subjf { - if subjectIsSubsetMatch(partition, filter.subject) || - subjectIsSubsetMatch(filter.subject, partition) { + if isSubsetMatchTokenized(pts, filter.tokenizedSubject) || + isSubsetMatchTokenized(filter.tokenizedSubject, pts) { return false } } @@ -5459,7 +5527,7 @@ func (mset *stream) ackMsg(o *consumer, seq uint64) { // Don't make this RLock(). We need to have only 1 running at a time to gauge interest across all consumers. mset.mu.Lock() - if mset.closed || mset.store == nil || mset.cfg.Retention == LimitsPolicy { + if mset.closed.Load() || mset.cfg.Retention == LimitsPolicy { mset.mu.Unlock() return } @@ -5506,14 +5574,10 @@ func (mset *stream) ackMsg(o *consumer, seq uint64) { // Snapshot creates a snapshot for the stream and possibly consumers. func (mset *stream) snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) { - mset.mu.RLock() - if mset.client == nil || mset.store == nil { - mset.mu.RUnlock() - return nil, errors.New("invalid stream") + if mset.closed.Load() { + return nil, errStreamClosed } store := mset.store - mset.mu.RUnlock() - return store.Snapshot(deadline, checkMsgs, includeConsumers) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/subject_transform.go b/vendor/github.com/nats-io/nats-server/v2/server/subject_transform.go index 63876f43b8..e7d7257617 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/subject_transform.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/subject_transform.go @@ -561,3 +561,29 @@ func (tr *subjectTransform) reverse() *subjectTransform { rtr, _ := NewSubjectTransformStrict(nsrc, ndest) return rtr } + +// Will share relevant info regarding the subject. +// Returns valid, tokens, num pwcs, has fwc. 
+func subjectInfo(subject string) (bool, []string, int, bool) { + if subject == "" { + return false, nil, 0, false + } + npwcs := 0 + sfwc := false + tokens := strings.Split(subject, tsep) + for _, t := range tokens { + if len(t) == 0 || sfwc { + return false, nil, 0, false + } + if len(t) > 1 { + continue + } + switch t[0] { + case fwc: + sfwc = true + case pwc: + npwcs++ + } + } + return true, tokens, npwcs, sfwc +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/sublist.go b/vendor/github.com/nats-io/nats-server/v2/server/sublist.go index a78ff86c3c..fdd7cc5ffa 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/sublist.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/sublist.go @@ -19,6 +19,7 @@ import ( "strings" "sync" "sync/atomic" + "unicode/utf8" ) // Sublist is a routing mechanism to handle subject distribution and @@ -1075,9 +1076,22 @@ func IsValidPublishSubject(subject string) bool { // IsValidSubject returns true if a subject is valid, false otherwise func IsValidSubject(subject string) bool { + return isValidSubject(subject, false) +} + +func isValidSubject(subject string, checkRunes bool) bool { if subject == _EMPTY_ { return false } + if checkRunes { + // Since casting to a string will always produce valid UTF-8, we need to look for replacement runes. + // This signals something is off or corrupt. + for _, r := range subject { + if r == utf8.RuneError { + return false + } + } + } sfwc := false tokens := strings.Split(subject, tsep) for _, t := range tokens { @@ -1101,32 +1115,6 @@ func IsValidSubject(subject string) bool { return true } -// Will share relevant info regarding the subject. -// Returns valid, tokens, num pwcs, has fwc. 
-func subjectInfo(subject string) (bool, []string, int, bool) { - if subject == "" { - return false, nil, 0, false - } - npwcs := 0 - sfwc := false - tokens := strings.Split(subject, tsep) - for _, t := range tokens { - if len(t) == 0 || sfwc { - return false, nil, 0, false - } - if len(t) > 1 { - continue - } - switch t[0] { - case fwc: - sfwc = true - case pwc: - npwcs++ - } - } - return true, tokens, npwcs, sfwc -} - // IsValidLiteralSubject returns true if a subject is valid and literal (no wildcards), false otherwise func IsValidLiteralSubject(subject string) bool { return isValidLiteralSubject(strings.Split(subject, tsep)) diff --git a/vendor/github.com/nats-io/nkeys/README.md b/vendor/github.com/nats-io/nkeys/README.md index 37febc9a6f..17e3a8e3af 100644 --- a/vendor/github.com/nats-io/nkeys/README.md +++ b/vendor/github.com/nats-io/nkeys/README.md @@ -2,9 +2,9 @@ [![License Apache 2](https://img.shields.io/badge/License-Apache2-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Go Report Card](https://goreportcard.com/badge/github.com/nats-io/nkeys)](https://goreportcard.com/report/github.com/nats-io/nkeys) -[![Build Status](https://app.travis-ci.com/nats-io/nkeys.svg?branch=master)](https://app.travis-ci.com/nats-io/nkeys) +[![Build Status](https://github.com/nats-io/nkeys/actions/workflows/release.yaml/badge.svg)](https://github.com/nats-io/nkeys/actions/workflows/release.yaml/badge.svg) [![GoDoc](https://godoc.org/github.com/nats-io/nkeys?status.svg)](https://godoc.org/github.com/nats-io/nkeys) -[![Coverage Status](https://coveralls.io/repos/github/nats-io/nkeys/badge.svg?branch=master&service=github)](https://coveralls.io/github/nats-io/nkeys?branch=master) +[![Coverage Status](https://coveralls.io/repos/github/nats-io/nkeys/badge.svg?branch=main&service=github)](https://coveralls.io/github/nats-io/nkeys?branch=main) A public-key signature system based on [Ed25519](https://ed25519.cr.yp.to/) for the NATS ecosystem. 
@@ -66,4 +66,3 @@ user2, _ := nkeys.FromRawSeed(PrefixByteUser, rawSeed) Unless otherwise noted, the NATS source files are distributed under the Apache Version 2.0 license found in the LICENSE file. - diff --git a/vendor/github.com/nats-io/nkeys/nkeys.go b/vendor/github.com/nats-io/nkeys/nkeys.go index 0db0f0c1f0..6f1ba20509 100644 --- a/vendor/github.com/nats-io/nkeys/nkeys.go +++ b/vendor/github.com/nats-io/nkeys/nkeys.go @@ -19,7 +19,7 @@ package nkeys import "io" // Version is our current version -const Version = "0.4.6" +const Version = "0.4.7" // KeyPair provides the central interface to nkeys. type KeyPair interface { diff --git a/vendor/modules.txt b/vendor/modules.txt index a9ab5af5d5..785509a5f6 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1389,9 +1389,10 @@ github.com/mschoch/smat # github.com/nats-io/jwt/v2 v2.5.3 ## explicit; go 1.18 github.com/nats-io/jwt/v2 -# github.com/nats-io/nats-server/v2 v2.10.7 +# github.com/nats-io/nats-server/v2 v2.10.9 ## explicit; go 1.20 github.com/nats-io/nats-server/v2/conf +github.com/nats-io/nats-server/v2/internal/fastrand github.com/nats-io/nats-server/v2/internal/ldap github.com/nats-io/nats-server/v2/logger github.com/nats-io/nats-server/v2/server @@ -1406,8 +1407,8 @@ github.com/nats-io/nats.go github.com/nats-io/nats.go/encoders/builtin github.com/nats-io/nats.go/internal/parser github.com/nats-io/nats.go/util -# github.com/nats-io/nkeys v0.4.6 -## explicit; go 1.19 +# github.com/nats-io/nkeys v0.4.7 +## explicit; go 1.20 github.com/nats-io/nkeys # github.com/nats-io/nuid v1.0.1 ## explicit