diff --git a/go.mod b/go.mod index d1cc859ecf..0e974de4f6 100644 --- a/go.mod +++ b/go.mod @@ -57,7 +57,7 @@ require ( github.com/mitchellh/mapstructure v1.5.0 github.com/mna/pigeon v1.3.0 github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 - github.com/nats-io/nats-server/v2 v2.10.22 + github.com/nats-io/nats-server/v2 v2.10.24 github.com/nats-io/nats.go v1.37.0 github.com/oklog/run v1.1.0 github.com/olekukonko/tablewriter v0.0.5 @@ -268,8 +268,8 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mschoch/smat v0.2.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/nats-io/jwt/v2 v2.5.8 // indirect - github.com/nats-io/nkeys v0.4.7 // indirect + github.com/nats-io/jwt/v2 v2.7.3 // indirect + github.com/nats-io/nkeys v0.4.9 // indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/nxadm/tail v1.4.8 // indirect github.com/opencontainers/runtime-spec v1.1.0 // indirect diff --git a/go.sum b/go.sum index afb74a817d..6ebd75f146 100644 --- a/go.sum +++ b/go.sum @@ -831,14 +831,14 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8m github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/namedotcom/go v0.0.0-20180403034216-08470befbe04/go.mod h1:5sN+Lt1CaY4wsPvgQH/jsuJi4XO2ssZbdsIizr4CVC8= -github.com/nats-io/jwt/v2 v2.5.8 h1:uvdSzwWiEGWGXf+0Q+70qv6AQdvcvxrv9hPM0RiPamE= -github.com/nats-io/jwt/v2 v2.5.8/go.mod h1:ZdWS1nZa6WMZfFwwgpEaqBV8EPGVgOTDHN/wTbz0Y5A= -github.com/nats-io/nats-server/v2 v2.10.22 h1:Yt63BGu2c3DdMoBZNcR6pjGQwk/asrKU7VX846ibxDA= -github.com/nats-io/nats-server/v2 v2.10.22/go.mod h1:X/m1ye9NYansUXYFrbcDwUi/blHkrgHh2rgCJaakonk= +github.com/nats-io/jwt/v2 v2.7.3 h1:6bNPK+FXgBeAqdj4cYQ0F8ViHRbi7woQLq4W29nUAzE= +github.com/nats-io/jwt/v2 v2.7.3/go.mod h1:GvkcbHhKquj3pkioy5put1wvPxs78UlZ7D/pY+BgZk4= +github.com/nats-io/nats-server/v2 v2.10.24 h1:KcqqQAD0ZZcG4yLxtvSFJY7CYKVYlnlWoAiVZ6i/IY4= +github.com/nats-io/nats-server/v2 v2.10.24/go.mod h1:olvKt8E5ZlnjyqBGbAXtxvSQKsPodISK5Eo/euIta4s= github.com/nats-io/nats.go v1.37.0 h1:07rauXbVnnJvv1gfIyghFEo6lUcYRY0WXc3x7x0vUxE= github.com/nats-io/nats.go v1.37.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= -github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI= -github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= +github.com/nats-io/nkeys v0.4.9 h1:qe9Faq2Gxwi6RZnZMXfmGMZkg3afLLOtrU+gDZJ35b0= +github.com/nats-io/nkeys v0.4.9/go.mod h1:jcMqs+FLG+W5YO36OX6wFIFcmpdAns+w1Wm6D3I/evE= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32/go.mod h1:9wM+0iRr9ahx58uYLpLIr5fm8diHn0JbqRycJi6w0Ms= diff --git a/vendor/github.com/nats-io/jwt/v2/account_claims.go b/vendor/github.com/nats-io/jwt/v2/account_claims.go index fa8fc5851e..9da374aed0 100644 --- a/vendor/github.com/nats-io/jwt/v2/account_claims.go +++ b/vendor/github.com/nats-io/jwt/v2/account_claims.go @@ -1,5 +1,5 @@ /* - * Copyright 2018-2023 The NATS Authors + * Copyright 2018-2024 The NATS Authors * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at @@ -133,7 +133,7 @@ func (o *OperatorLimits) Validate(vr *ValidationResults) { } } -// Mapping for publishes +// WeightedMapping for publishes type WeightedMapping struct { Subject Subject `json:"subject"` Weight uint8 `json:"weight,omitempty"` @@ -177,13 +177,13 @@ func (a *Account) AddMapping(sub Subject, to ...WeightedMapping) { a.Mappings[sub] = to } -// Enable external authorization for account users. +// ExternalAuthorization enables external authorization for account users. // AuthUsers are those users specified to bypass the authorization callout and should be used for the authorization service itself. // AllowedAccounts specifies which accounts, if any, that the authorization service can bind an authorized user to. // The authorization response, a user JWT, will still need to be signed by the correct account. // If optional XKey is specified, that is the public xkey (x25519) and the server will encrypt the request such that only the // holder of the private key can decrypt. The auth service can also optionally encrypt the response back to the server using it's -// publick xkey which will be in the authorization request. +// public xkey which will be in the authorization request. type ExternalAuthorization struct { AuthUsers StringList `json:"auth_users,omitempty"` AllowedAccounts StringList `json:"allowed_accounts,omitempty"` @@ -194,12 +194,12 @@ func (ac *ExternalAuthorization) IsEnabled() bool { return len(ac.AuthUsers) > 0 } -// Helper function to determine if external authorization is enabled. +// HasExternalAuthorization helper function to determine if external authorization is enabled. func (a *Account) HasExternalAuthorization() bool { return a.Authorization.IsEnabled() } -// Helper function to setup external authorization. +// EnableExternalAuthorization helper function to setup external authorization. func (a *Account) EnableExternalAuthorization(users ...string) { a.Authorization.AuthUsers.Add(users...) 
} @@ -230,6 +230,20 @@ func (ac *ExternalAuthorization) Validate(vr *ValidationResults) { } } +const ( + ClusterTrafficSystem = "system" + ClusterTrafficOwner = "owner" +) + +type ClusterTraffic string + +func (ct ClusterTraffic) Valid() error { + if ct == "" || ct == ClusterTrafficSystem || ct == ClusterTrafficOwner { + return nil + } + return fmt.Errorf("unknown cluster traffic option: %q", ct) +} + // Account holds account specific claims data type Account struct { Imports Imports `json:"imports,omitempty"` @@ -241,6 +255,7 @@ type Account struct { Mappings Mapping `json:"mappings,omitempty"` Authorization ExternalAuthorization `json:"authorization,omitempty"` Trace *MsgTrace `json:"trace,omitempty"` + ClusterTraffic ClusterTraffic `json:"cluster_traffic,omitempty"` Info GenericFields } @@ -308,6 +323,10 @@ func (a *Account) Validate(acct *AccountClaims, vr *ValidationResults) { } a.SigningKeys.Validate(vr) a.Info.Validate(vr) + + if err := a.ClusterTraffic.Valid(); err != nil { + vr.AddError(err.Error()) + } } // AccountClaims defines the body of an account JWT @@ -338,13 +357,17 @@ func NewAccountClaims(subject string) *AccountClaims { // Encode converts account claims into a JWT string func (a *AccountClaims) Encode(pair nkeys.KeyPair) (string, error) { + return a.EncodeWithSigner(pair, nil) +} + +func (a *AccountClaims) EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) { if !nkeys.IsValidPublicAccountKey(a.Subject) { return "", errors.New("expected subject to be account public key") } sort.Sort(a.Exports) sort.Sort(a.Imports) a.Type = AccountClaim - return a.ClaimsData.encode(pair, a) + return a.ClaimsData.encode(pair, a, fn) } // DecodeAccountClaims decodes account claims from a JWT string diff --git a/vendor/github.com/nats-io/jwt/v2/activation_claims.go b/vendor/github.com/nats-io/jwt/v2/activation_claims.go index 827658efc7..63fe788383 100644 --- a/vendor/github.com/nats-io/jwt/v2/activation_claims.go +++ b/vendor/github.com/nats-io/jwt/v2/activation_claims.go @@ -1,5 +1,5 @@ /* - * Copyright 2018 The NATS Authors + * Copyright 2018-2024 The NATS Authors * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -72,11 +72,15 @@ func NewActivationClaims(subject string) *ActivationClaims { // Encode turns an activation claim into a JWT strimg func (a *ActivationClaims) Encode(pair nkeys.KeyPair) (string, error) { + return a.EncodeWithSigner(pair, nil) +} + +func (a *ActivationClaims) EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) { if !nkeys.IsValidPublicAccountKey(a.ClaimsData.Subject) { return "", errors.New("expected subject to be an account") } a.Type = ActivationClaim - return a.ClaimsData.encode(pair, a) + return a.ClaimsData.encode(pair, a, fn) } // DecodeActivationClaims tries to create an activation claim from a JWT string diff --git a/vendor/github.com/nats-io/jwt/v2/authorization_claims.go b/vendor/github.com/nats-io/jwt/v2/authorization_claims.go index fccdcf222a..3448f11de6 100644 --- a/vendor/github.com/nats-io/jwt/v2/authorization_claims.go +++ b/vendor/github.com/nats-io/jwt/v2/authorization_claims.go @@ -1,5 +1,5 @@ /* - * Copyright 2022 The NATS Authors + * Copyright 2022-2024 The NATS Authors * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at @@ -113,8 +113,12 @@ func (ac *AuthorizationRequestClaims) Validate(vr *ValidationResults) { // Encode tries to turn the auth request claims into a JWT string. func (ac *AuthorizationRequestClaims) Encode(pair nkeys.KeyPair) (string, error) { + return ac.EncodeWithSigner(pair, nil) +} + +func (ac *AuthorizationRequestClaims) EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) { ac.Type = AuthorizationRequestClaim - return ac.ClaimsData.encode(pair, ac) + return ac.ClaimsData.encode(pair, ac, fn) } // DecodeAuthorizationRequestClaims tries to parse an auth request claims from a JWT string @@ -242,6 +246,10 @@ func (ar *AuthorizationResponseClaims) Validate(vr *ValidationResults) { // Encode tries to turn the auth request claims into a JWT string. func (ar *AuthorizationResponseClaims) Encode(pair nkeys.KeyPair) (string, error) { - ar.Type = AuthorizationResponseClaim - return ar.ClaimsData.encode(pair, ar) + return ar.EncodeWithSigner(pair, nil) +} + +func (ar *AuthorizationResponseClaims) EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) { + ar.Type = AuthorizationResponseClaim + return ar.ClaimsData.encode(pair, ar, fn) } diff --git a/vendor/github.com/nats-io/jwt/v2/claims.go b/vendor/github.com/nats-io/jwt/v2/claims.go index daac2d875e..9b816c34c1 100644 --- a/vendor/github.com/nats-io/jwt/v2/claims.go +++ b/vendor/github.com/nats-io/jwt/v2/claims.go @@ -1,5 +1,5 @@ /* - * Copyright 2018-2022 The NATS Authors + * Copyright 2018-2024 The NATS Authors * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -68,10 +68,16 @@ func IsGenericClaimType(s string) bool { } } +// SignFn is used in an external sign environment. The function should be +// able to locate the private key for the specified pub key specified and sign the +// specified data returning the signature as generated. +type SignFn func(pub string, data []byte) ([]byte, error) + // Claims is a JWT claims type Claims interface { Claims() *ClaimsData Encode(kp nkeys.KeyPair) (string, error) + EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) ExpectedPrefixes() []nkeys.PrefixByte Payload() interface{} String() string @@ -121,7 +127,7 @@ func serialize(v interface{}) (string, error) { return encodeToString(j), nil } -func (c *ClaimsData) doEncode(header *Header, kp nkeys.KeyPair, claim Claims) (string, error) { +func (c *ClaimsData) doEncode(header *Header, kp nkeys.KeyPair, claim Claims, fn SignFn) (string, error) { if header == nil { return "", errors.New("header is required") } @@ -200,9 +206,21 @@ func (c *ClaimsData) doEncode(header *Header, kp nkeys.KeyPair, claim Claims) (s if header.Algorithm == AlgorithmNkeyOld { return "", errors.New(AlgorithmNkeyOld + " not supported to write jwtV2") } else if header.Algorithm == AlgorithmNkey { - sig, err := kp.Sign([]byte(toSign)) - if err != nil { - return "", err + var sig []byte + if fn != nil { + pk, err := kp.PublicKey() + if err != nil { + return "", err + } + sig, err = fn(pk, []byte(toSign)) + if err != nil { + return "", err + } + } else { + sig, err = kp.Sign([]byte(toSign)) + if err != nil { + return "", err + } } eSig = encodeToString(sig) } else { @@ -224,8 +242,8 @@ func (c *ClaimsData) hash() (string, error) { // Encode encodes a claim into a JWT token. 
The claim is signed with the // provided nkey's private key -func (c *ClaimsData) encode(kp nkeys.KeyPair, payload Claims) (string, error) { - return c.doEncode(&Header{TokenTypeJwt, AlgorithmNkey}, kp, payload) +func (c *ClaimsData) encode(kp nkeys.KeyPair, payload Claims, fn SignFn) (string, error) { + return c.doEncode(&Header{TokenTypeJwt, AlgorithmNkey}, kp, payload, fn) } // Returns a JSON representation of the claim diff --git a/vendor/github.com/nats-io/jwt/v2/exports.go b/vendor/github.com/nats-io/jwt/v2/exports.go index 3ebc029337..0f26e84a08 100644 --- a/vendor/github.com/nats-io/jwt/v2/exports.go +++ b/vendor/github.com/nats-io/jwt/v2/exports.go @@ -273,7 +273,7 @@ func isContainedIn(kind ExportType, subjects []Subject, vr *ValidationResults) { } // Validate calls validate on all of the exports -func (e *Exports) Validate(vr *ValidationResults) error { +func (e *Exports) Validate(vr *ValidationResults) { var serviceSubjects []Subject var streamSubjects []Subject @@ -292,8 +292,6 @@ func (e *Exports) Validate(vr *ValidationResults) error { isContainedIn(Service, serviceSubjects, vr) isContainedIn(Stream, streamSubjects, vr) - - return nil } // HasExportContainingSubject checks if the export list has an export with the provided subject diff --git a/vendor/github.com/nats-io/jwt/v2/genericlaims.go b/vendor/github.com/nats-io/jwt/v2/genericlaims.go index 6793c9ea10..e680866f8a 100644 --- a/vendor/github.com/nats-io/jwt/v2/genericlaims.go +++ b/vendor/github.com/nats-io/jwt/v2/genericlaims.go @@ -1,5 +1,5 @@ /* - * Copyright 2018-2020 The NATS Authors + * Copyright 2018-2024 The NATS Authors * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -107,7 +107,11 @@ func (gc *GenericClaims) Payload() interface{} { // Encode takes a generic claims and creates a JWT string func (gc *GenericClaims) Encode(pair nkeys.KeyPair) (string, error) { - return gc.ClaimsData.encode(pair, gc) + return gc.ClaimsData.encode(pair, gc, nil) +} + +func (gc *GenericClaims) EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) { + return gc.ClaimsData.encode(pair, gc, fn) } // Validate checks the generic part of the claims data diff --git a/vendor/github.com/nats-io/jwt/v2/operator_claims.go b/vendor/github.com/nats-io/jwt/v2/operator_claims.go index 673225fa82..b5c9c94c32 100644 --- a/vendor/github.com/nats-io/jwt/v2/operator_claims.go +++ b/vendor/github.com/nats-io/jwt/v2/operator_claims.go @@ -1,5 +1,5 @@ /* - * Copyright 2018 The NATS Authors + * Copyright 2018-2024 The NATS Authors * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at @@ -191,6 +191,10 @@ func (oc *OperatorClaims) DidSign(op Claims) bool { // Encode the claims into a JWT string func (oc *OperatorClaims) Encode(pair nkeys.KeyPair) (string, error) { + return oc.EncodeWithSigner(pair, nil) +} + +func (oc *OperatorClaims) EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) { if !nkeys.IsValidPublicOperatorKey(oc.Subject) { return "", errors.New("expected subject to be an operator public key") } @@ -199,7 +203,7 @@ func (oc *OperatorClaims) Encode(pair nkeys.KeyPair) (string, error) { return "", err } oc.Type = OperatorClaim - return oc.ClaimsData.encode(pair, oc) + return oc.ClaimsData.encode(pair, oc, fn) } func (oc *OperatorClaims) ClaimType() ClaimType { diff --git a/vendor/github.com/nats-io/jwt/v2/types.go b/vendor/github.com/nats-io/jwt/v2/types.go index f0db549c5f..d5814db31e 100644 --- a/vendor/github.com/nats-io/jwt/v2/types.go +++ b/vendor/github.com/nats-io/jwt/v2/types.go @@ -309,7 +309,7 @@ func (l *Limits) Validate(vr *ValidationResults) { } } - if l.Times != nil && len(l.Times) > 0 { + if len(l.Times) > 0 { for _, t := range l.Times { t.Validate(vr) } diff --git a/vendor/github.com/nats-io/jwt/v2/user_claims.go b/vendor/github.com/nats-io/jwt/v2/user_claims.go index 53b781dbd4..294cc4b75a 100644 --- a/vendor/github.com/nats-io/jwt/v2/user_claims.go +++ b/vendor/github.com/nats-io/jwt/v2/user_claims.go @@ -1,5 +1,5 @@ /* - * Copyright 2018-2019 The NATS Authors + * Copyright 2018-2024 The NATS Authors * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -92,11 +92,15 @@ func (u *UserClaims) HasEmptyPermissions() bool { // Encode tries to turn the user claims into a JWT string func (u *UserClaims) Encode(pair nkeys.KeyPair) (string, error) { + return u.EncodeWithSigner(pair, nil) +} + +func (u *UserClaims) EncodeWithSigner(pair nkeys.KeyPair, fn SignFn) (string, error) { if !nkeys.IsValidPublicUserKey(u.Subject) { return "", errors.New("expected subject to be user public key") } u.Type = UserClaim - return u.ClaimsData.encode(pair, u) + return u.ClaimsData.encode(pair, u, fn) } // DecodeUserClaims tries to parse a user claims from a JWT string diff --git a/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore.go b/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore.go index 3d7dfde60f..110ea85a7d 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore.go @@ -46,11 +46,13 @@ type MatchByType int const ( matchByIssuer MatchByType = iota + 1 matchBySubject + matchByThumbprint ) var MatchByMap = map[string]MatchByType{ - "issuer": matchByIssuer, - "subject": matchBySubject, + "issuer": matchByIssuer, + "subject": matchBySubject, + "thumbprint": matchByThumbprint, } var Usage = ` diff --git a/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_other.go b/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_other.go index a72df834a1..459b8db64a 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_other.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_other.go @@ -1,4 +1,4 @@ -// Copyright 2022-2023 The NATS Authors +// Copyright 2022-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in 
compliance with the License. // You may obtain a copy of the License at @@ -26,8 +26,7 @@ var _ = MATCHBYEMPTY // otherKey implements crypto.Signer and crypto.Decrypter to satisfy linter on platforms that don't implement certstore type otherKey struct{} -func TLSConfig(certStore StoreType, certMatchBy MatchByType, certMatch string, config *tls.Config) error { - _, _, _, _ = certStore, certMatchBy, certMatch, config +func TLSConfig(_ StoreType, _ MatchByType, _ string, _ []string, _ bool, _ *tls.Config) error { return ErrOSNotCompatCertStore } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_windows.go b/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_windows.go index 19b9567be7..d47adb6eea 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_windows.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/certstore/certstore_windows.go @@ -1,4 +1,4 @@ -// Copyright 2022-2023 The NATS Authors +// Copyright 2022-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -41,26 +41,26 @@ import ( const ( // wincrypt.h constants - winAcquireCached = 0x1 // CRYPT_ACQUIRE_CACHE_FLAG - winAcquireSilent = 0x40 // CRYPT_ACQUIRE_SILENT_FLAG - winAcquireOnlyNCryptKey = 0x40000 // CRYPT_ACQUIRE_ONLY_NCRYPT_KEY_FLAG - winEncodingX509ASN = 1 // X509_ASN_ENCODING - winEncodingPKCS7 = 65536 // PKCS_7_ASN_ENCODING - winCertStoreProvSystem = 10 // CERT_STORE_PROV_SYSTEM - winCertStoreCurrentUser = uint32(winCertStoreCurrentUserID << winCompareShift) // CERT_SYSTEM_STORE_CURRENT_USER - winCertStoreLocalMachine = uint32(winCertStoreLocalMachineID << winCompareShift) // CERT_SYSTEM_STORE_LOCAL_MACHINE - winCertStoreCurrentUserID = 1 // CERT_SYSTEM_STORE_CURRENT_USER_ID - winCertStoreLocalMachineID = 2 // CERT_SYSTEM_STORE_LOCAL_MACHINE_ID - winInfoIssuerFlag = 4 // CERT_INFO_ISSUER_FLAG - winInfoSubjectFlag = 7 // CERT_INFO_SUBJECT_FLAG - winCompareNameStrW = 8 // CERT_COMPARE_NAME_STR_A - winCompareShift = 16 // CERT_COMPARE_SHIFT + winAcquireCached = windows.CRYPT_ACQUIRE_CACHE_FLAG + winAcquireSilent = windows.CRYPT_ACQUIRE_SILENT_FLAG + winAcquireOnlyNCryptKey = windows.CRYPT_ACQUIRE_ONLY_NCRYPT_KEY_FLAG + winEncodingX509ASN = windows.X509_ASN_ENCODING + winEncodingPKCS7 = windows.PKCS_7_ASN_ENCODING + winCertStoreProvSystem = windows.CERT_STORE_PROV_SYSTEM + winCertStoreCurrentUser = windows.CERT_SYSTEM_STORE_CURRENT_USER + winCertStoreLocalMachine = windows.CERT_SYSTEM_STORE_LOCAL_MACHINE + winCertStoreReadOnly = windows.CERT_STORE_READONLY_FLAG + winInfoIssuerFlag = windows.CERT_INFO_ISSUER_FLAG + winInfoSubjectFlag = windows.CERT_INFO_SUBJECT_FLAG + winCompareNameStrW = windows.CERT_COMPARE_NAME_STR_W + winCompareShift = windows.CERT_COMPARE_SHIFT // Reference https://learn.microsoft.com/en-us/windows/win32/api/wincrypt/nf-wincrypt-certfindcertificateinstore - winFindIssuerStr = winCompareNameStrW< 0 { + // Only reply subject permissions if the client is not already allowed to publish to the reply subject. 
+ if client.replies != nil && len(reply) > 0 && !client.pubAllowedFullCheck(string(reply), true, true) { client.replies[string(reply)] = &resp{time.Now(), 0} - if len(client.replies) > replyPermLimit { + client.repliesSincePrune++ + if client.repliesSincePrune > replyPermLimit || time.Since(client.lastReplyPrune) > replyPruneTime { client.pruneReplyPerms() } } @@ -3652,6 +3658,9 @@ func (c *client) pruneReplyPerms() { delete(c.replies, k) } } + + c.repliesSincePrune = 0 + c.lastReplyPrune = now } // pruneDenyCache will prune the deny cache via randomly @@ -3720,7 +3729,7 @@ func (c *client) pubAllowedFullCheck(subject string, fullCheck, hasLock bool) bo allowed = np == 0 } - // If we are currently not allowed but we are tracking reply subjects + // If we are tracking reply subjects // dynamically, check to see if we are allowed here but avoid pcache. // We need to acquire the lock though. if !allowed && fullCheck && c.perms.resp != nil { @@ -4570,6 +4579,21 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, // Declared here because of goto. var queues [][]byte + var leafOrigin string + switch c.kind { + case ROUTER: + if len(c.pa.origin) > 0 { + // Picture a message sent from a leafnode to a server that then routes + // this message: CluserA -leaf-> HUB1 -route-> HUB2 + // Here we are in HUB2, so c.kind is a ROUTER, but the message will + // contain a c.pa.origin set to "ClusterA" to indicate that this message + // originated from that leafnode cluster. + leafOrigin = bytesToString(c.pa.origin) + } + case LEAF: + leafOrigin = c.remoteCluster() + } + // For all routes/leaf/gateway connections, we may still want to send messages to // leaf nodes or routes even if there are no queue filters since we collect // them above and do not process inline like normal clients. @@ -4608,12 +4632,24 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, ql := _ql[:0] for i := 0; i < len(qsubs); i++ { sub = qsubs[i] - if sub.client.kind == LEAF || sub.client.kind == ROUTER { - // If we have assigned an rsub already, replace if the destination is a LEAF - // since we want to favor that compared to a ROUTER. We could make sure that - // we override only if previous was a ROUTE and not a LEAF, but we don't have to. - if rsub == nil || sub.client.kind == LEAF { + if dst := sub.client.kind; dst == LEAF || dst == ROUTER { + // If the destination is a LEAF, we first need to make sure + // that we would not pick one that was the origin of this + // message. + if dst == LEAF && leafOrigin != _EMPTY_ && leafOrigin == sub.client.remoteCluster() { + continue + } + // If we have assigned a ROUTER rsub already, replace if + // the destination is a LEAF since we want to favor that. + if rsub == nil || (rsub.client.kind == ROUTER && dst == LEAF) { rsub = sub + } else if dst == LEAF { + // We already have a LEAF and this is another one. + // Flip a coin to see if we swap it or not. + // See https://github.com/nats-io/nats-server/issues/6040 + if fastrand.Uint32()%2 == 1 { + rsub = sub + } } } else { ql = append(ql, sub) @@ -4629,6 +4665,8 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, } // Find a subscription that is able to deliver this message starting at a random index. + // Note that if the message came from a ROUTER, we will only have CLIENT or LEAF + // queue subs here, otherwise we can have all types. 
for i := 0; i < lqs; i++ { if sindex+i < lqs { sub = qsubs[sindex+i] @@ -4649,20 +4687,38 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, // Here we just care about a client or leaf and skipping a leaf and preferring locals. if dst := sub.client.kind; dst == ROUTER || dst == LEAF { if (src == LEAF || src == CLIENT) && dst == LEAF { + // If we come from a LEAF and are about to pick a LEAF connection, + // make sure this is not the same leaf cluster. + if src == LEAF && leafOrigin != _EMPTY_ && leafOrigin == sub.client.remoteCluster() { + continue + } // Remember that leaf in case we don't find any other candidate. + // We already start randomly in lqs slice, so we don't need + // to do a random swap if we already have an rsub like we do + // when src == ROUTER above. if rsub == nil { rsub = sub } continue } else { - // We would be picking a route, but if we had remembered a "hub" leaf, - // then pick that one instead of the route. - if rsub != nil && rsub.client.kind == LEAF && rsub.client.isHubLeafNode() { - break + // We want to favor qsubs in our own cluster. If the routed + // qsub has an origin, it means that is on behalf of a leaf. + // We need to treat it differently. + if len(sub.origin) > 0 { + // If we already have an rsub, nothing to do. Also, do + // not pick a routed qsub for a LEAF origin cluster + // that is the same than where the message comes from. + if rsub == nil && (leafOrigin == _EMPTY_ || leafOrigin != bytesToString(sub.origin)) { + rsub = sub + } + continue } + // This is a qsub that is local on the remote server (or + // we are connected to an older server and we don't know). + // Pick this one and be done. rsub = sub + break } - break } // Assume delivery subject is normal subject to this point. @@ -4749,18 +4805,11 @@ sendToRoutesOrLeafs: // If so make sure we do not send it back to the same cluster for a different // leafnode. Cluster wide no echo. if dc.kind == LEAF { - // Check two scenarios. One is inbound from a route (c.pa.origin) - if c.kind == ROUTER && len(c.pa.origin) > 0 { - if bytesToString(c.pa.origin) == dc.remoteCluster() { - continue - } - } - // The other is leaf to leaf. - if c.kind == LEAF { - src, dest := c.remoteCluster(), dc.remoteCluster() - if src != _EMPTY_ && src == dest { - continue - } + // Check two scenarios. One is inbound from a route (c.pa.origin), + // and the other is leaf to leaf. In both case, leafOrigin is the one + // to use for the comparison. + if leafOrigin != _EMPTY_ && leafOrigin == dc.remoteCluster() { + continue } // We need to check if this is a request that has a stamped client information header. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/const.go b/vendor/github.com/nats-io/nats-server/v2/server/const.go index 69bad3f308..95f19ca2c7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/const.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/const.go @@ -55,7 +55,7 @@ func init() { const ( // VERSION is the current version for the server. - VERSION = "2.10.22" + VERSION = "2.10.24" // PROTO is the currently supported protocol. // 0 was the original @@ -171,6 +171,9 @@ const ( // MAX_HPUB_ARGS Maximum possible number of arguments from HPUB proto. MAX_HPUB_ARGS = 4 + // MAX_RSUB_ARGS Maximum possible number of arguments from a RS+/LS+ proto. + MAX_RSUB_ARGS = 6 + // DEFAULT_MAX_CLOSED_CLIENTS is the maximum number of closed connections we hold onto. 
DEFAULT_MAX_CLOSED_CLIENTS = 10000 diff --git a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go index 849fb1c536..438041ec89 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go @@ -345,6 +345,7 @@ type consumer struct { outq *jsOutQ pending map[uint64]*Pending ptmr *time.Timer + ptmrEnd time.Time rdq []uint64 rdqi avl.SequenceSet rdc map[uint64]uint64 @@ -504,7 +505,7 @@ func checkConsumerCfg( } // Check if we have a BackOff defined that MaxDeliver is within range etc. - if lbo := len(config.BackOff); lbo > 0 && config.MaxDeliver != -1 && config.MaxDeliver <= lbo { + if lbo := len(config.BackOff); lbo > 0 && config.MaxDeliver != -1 && lbo > config.MaxDeliver { return NewJSConsumerMaxDeliverBackoffError() } @@ -950,7 +951,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri // If we have multiple filter subjects, create a sublist which we will use // in calling store.LoadNextMsgMulti. if len(o.cfg.FilterSubjects) > 0 { - o.filters = NewSublistWithCache() + o.filters = NewSublistNoCache() for _, filter := range o.cfg.FilterSubjects { o.filters.Insert(&subscription{subject: []byte(filter)}) } @@ -1349,7 +1350,7 @@ func (o *consumer) setLeader(isLeader bool) { stopAndClearTimer(&o.dtmr) // Make sure to clear out any re-deliver queues - stopAndClearTimer(&o.ptmr) + o.stopAndClearPtmr() o.rdq = nil o.rdqi.Empty() o.pending = nil @@ -1562,6 +1563,16 @@ func (o *consumer) updateDeliveryInterest(localInterest bool) bool { return false } +const ( + defaultConsumerNotActiveStartInterval = 30 * time.Second + defaultConsumerNotActiveMaxInterval = 5 * time.Minute +) + +var ( + consumerNotActiveStartInterval = defaultConsumerNotActiveStartInterval + consumerNotActiveMaxInterval = defaultConsumerNotActiveMaxInterval +) + func (o *consumer) deleteNotActive() { o.mu.Lock() if o.mset == nil { @@ -1627,12 +1638,8 @@ func (o *consumer) deleteNotActive() { // Check to make sure we went away. // Don't think this needs to be a monitored go routine. go func() { - const ( - startInterval = 30 * time.Second - maxInterval = 5 * time.Minute - ) - jitter := time.Duration(rand.Int63n(int64(startInterval))) - interval := startInterval + jitter + jitter := time.Duration(rand.Int63n(int64(consumerNotActiveStartInterval))) + interval := consumerNotActiveStartInterval + jitter ticker := time.NewTicker(interval) defer ticker.Stop() for range ticker.C { @@ -1647,7 +1654,7 @@ func (o *consumer) deleteNotActive() { if nca != nil && nca == ca { s.Warnf("Consumer assignment for '%s > %s > %s' not cleaned up, retrying", acc, stream, name) meta.ForwardProposal(removeEntry) - if interval < maxInterval { + if interval < consumerNotActiveMaxInterval { interval *= 2 ticker.Reset(interval) } @@ -1739,7 +1746,7 @@ func (o *consumer) forceExpirePending() { p.Timestamp += off } } - o.ptmr.Reset(o.ackWait(0)) + o.resetPtmr(o.ackWait(0)) } o.signalNewMessages() } @@ -1842,7 +1849,7 @@ func (acc *Account) checkNewConsumerConfig(cfg, ncfg *ConsumerConfig) error { } // Check if BackOff is defined, MaxDeliver is within range. 
- if lbo := len(ncfg.BackOff); lbo > 0 && ncfg.MaxDeliver != -1 && ncfg.MaxDeliver <= lbo { + if lbo := len(ncfg.BackOff); lbo > 0 && ncfg.MaxDeliver != -1 && lbo > ncfg.MaxDeliver { return NewJSConsumerMaxDeliverBackoffError() } @@ -1882,7 +1889,7 @@ func (o *consumer) updateConfig(cfg *ConsumerConfig) error { // AckWait if cfg.AckWait != o.cfg.AckWait { if o.ptmr != nil { - o.ptmr.Reset(100 * time.Millisecond) + o.resetPtmr(100 * time.Millisecond) } } // Rate Limit @@ -1940,7 +1947,7 @@ func (o *consumer) updateConfig(cfg *ConsumerConfig) error { if len(o.subjf) == 1 { o.filters = nil } else { - o.filters = NewSublistWithCache() + o.filters = NewSublistNoCache() for _, filter := range o.subjf { o.filters.Insert(&subscription{subject: []byte(filter.subject)}) } @@ -2205,9 +2212,7 @@ func (o *consumer) updateDelivered(dseq, sseq, dc uint64, ts int64) { n += binary.PutUvarint(b[n:], dc) n += binary.PutVarint(b[n:], ts) o.propose(b[:n]) - } - if o.store != nil { - // Update local state always. + } else if o.store != nil { o.store.UpdateDelivered(dseq, sseq, dc, ts) } // Update activity. @@ -2413,7 +2418,7 @@ func (o *consumer) processNak(sseq, dseq, dc uint64, nak []byte) { if o.ptmr != nil { // Want checkPending to run and figure out the next timer ttl. // TODO(dlc) - We could optimize this maybe a bit more and track when we expect the timer to fire. - o.ptmr.Reset(10 * time.Millisecond) + o.resetPtmr(10 * time.Millisecond) } } // Nothing else for use to do now so return. @@ -2547,11 +2552,7 @@ func (o *consumer) applyState(state *ConsumerState) { if o.cfg.AckWait < delay { delay = o.ackWait(0) } - if o.ptmr == nil { - o.ptmr = time.AfterFunc(delay, o.checkPending) - } else { - o.ptmr.Reset(delay) - } + o.resetPtmr(delay) } } @@ -2666,23 +2667,20 @@ func (o *consumer) infoWithSnapAndReply(snap bool, reply string) *ConsumerInfo { TimeStamp: time.Now().UTC(), } - // If we are replicated and we are not the leader or we are filtered, we need to pull certain data from our store. - isLeader := o.isLeader() - if rg != nil && rg.node != nil && o.store != nil && (!isLeader || o.isFiltered()) { + // If we are replicated, we need to pull certain data from our store. + if rg != nil && rg.node != nil && o.store != nil { state, err := o.store.BorrowState() if err != nil { o.mu.Unlock() return nil } - if !isLeader { - info.Delivered.Consumer, info.Delivered.Stream = state.Delivered.Consumer, state.Delivered.Stream - info.AckFloor.Consumer, info.AckFloor.Stream = state.AckFloor.Consumer, state.AckFloor.Stream + // If we are the leader we could have o.sseq that is skipped ahead. + // To maintain consistency in reporting (e.g. jsz) we always take the state for our delivered/ackfloor stream sequence. + info.Delivered.Consumer, info.Delivered.Stream = state.Delivered.Consumer, state.Delivered.Stream + info.AckFloor.Consumer, info.AckFloor.Stream = state.AckFloor.Consumer, state.AckFloor.Stream + if !o.isLeader() { info.NumAckPending = len(state.Pending) info.NumRedelivered = len(state.Redelivered) - } else { - // Since we are filtered and we are the leader we could have o.sseq that is skipped ahead. - // To maintain consistency in reporting (e.g. jsz) we take the state for our delivered stream sequence. - info.Delivered.Stream = state.Delivered.Stream } } @@ -2786,18 +2784,30 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b return false } - // Check if this ack is above the current pointer to our next to deliver. 
- // This could happen on a cooperative takeover with high speed deliveries. - if sseq >= o.sseq { - o.sseq = sseq + 1 - } - mset := o.mset if mset == nil || mset.closed.Load() { o.mu.Unlock() return false } + // Check if this ack is above the current pointer to our next to deliver. + // This could happen on a cooperative takeover with high speed deliveries. + if sseq >= o.sseq { + // Let's make sure this is valid. + // This is only received on the consumer leader, so should never be higher + // than the last stream sequence. + var ss StreamState + mset.store.FastState(&ss) + if sseq > ss.LastSeq { + o.srv.Warnf("JetStream consumer '%s > %s > %s' ACK sequence %d past last stream sequence of %d", + o.acc.Name, o.stream, o.name, sseq, ss.LastSeq) + // FIXME(dlc) - For 2.11 onwards should we return an error here to the caller? + o.mu.Unlock() + return false + } + o.sseq = sseq + 1 + } + // Let the owning stream know if we are interest or workqueue retention based. // If this consumer is clustered (o.node != nil) this will be handled by // processReplicatedAck after the ack has propagated. @@ -3011,6 +3021,14 @@ func (o *consumer) needAck(sseq uint64, subj string) bool { return needAck } +// Used in nextReqFromMsg, since the json.Unmarshal causes the request +// struct to escape to the heap always. This should reduce GC pressure. +var jsGetNextPool = sync.Pool{ + New: func() any { + return &JSApiConsumerGetNextRequest{} + }, +} + // Helper for the next message requests. func nextReqFromMsg(msg []byte) (time.Time, int, int, bool, time.Duration, time.Time, error) { req := bytes.TrimSpace(msg) @@ -3020,7 +3038,11 @@ func nextReqFromMsg(msg []byte) (time.Time, int, int, bool, time.Duration, time. return time.Time{}, 1, 0, false, 0, time.Time{}, nil case req[0] == '{': - var cr JSApiConsumerGetNextRequest + cr := jsGetNextPool.Get().(*JSApiConsumerGetNextRequest) + defer func() { + *cr = JSApiConsumerGetNextRequest{} + jsGetNextPool.Put(cr) + }() if err := json.Unmarshal(req, &cr); err != nil { return time.Time{}, -1, 0, false, 0, time.Time{}, err } @@ -3420,6 +3442,7 @@ func (o *consumer) processNextMsgRequest(reply string, msg []byte) { if err := o.waiting.add(wr); err != nil { sendErr(409, "Exceeded MaxWaiting") + wr.recycle() return } o.signalNewMessages() @@ -3625,7 +3648,7 @@ func (o *consumer) getNextMsg() (*jsPubMsg, uint64, error) { // Check if we are multi-filtered or not. if filters != nil { sm, sseq, err = store.LoadNextMsgMulti(filters, fseq, &pmsg.StoreMsg) - } else if subjf != nil { // Means single filtered subject since o.filters means > 1. + } else if len(subjf) > 0 { // Means single filtered subject since o.filters means > 1. filter, wc := subjf[0].subject, subjf[0].hasWildcard sm, sseq, err = store.LoadNextMsg(filter, wc, fseq, &pmsg.StoreMsg) } else { @@ -3817,7 +3840,7 @@ func (o *consumer) checkAckFloor() { // We will set it explicitly to 1 behind our current lowest in pending, or if // pending is empty, to our current delivered -1. const minOffThreshold = 50 - if o.asflr < ss.FirstSeq-minOffThreshold { + if ss.FirstSeq >= minOffThreshold && o.asflr < ss.FirstSeq-minOffThreshold { var psseq, pdseq uint64 for seq, p := range o.pending { if psseq == 0 || seq < psseq { @@ -4270,37 +4293,15 @@ func (o *consumer) calculateNumPending() (npc, npf uint64) { } isLastPerSubject := o.cfg.DeliverPolicy == DeliverLastPerSubject + filters, subjf := o.filters, o.subjf - // Deliver Last Per Subject calculates num pending differently. - if isLastPerSubject { - // Consumer without filters. 
- if o.subjf == nil { - return o.mset.store.NumPending(o.sseq, _EMPTY_, isLastPerSubject) - } - // Consumer with filters. - for _, filter := range o.subjf { - lnpc, lnpf := o.mset.store.NumPending(o.sseq, filter.subject, isLastPerSubject) - npc += lnpc - if lnpf > npf { - npf = lnpf // Always last - } - } - return npc, npf + if filters != nil { + return o.mset.store.NumPendingMulti(o.sseq, filters, isLastPerSubject) + } else if len(subjf) > 0 { + filter := subjf[0].subject + return o.mset.store.NumPending(o.sseq, filter, isLastPerSubject) } - // Every other Delivery Policy is handled here. - // Consumer without filters. - if o.subjf == nil { - return o.mset.store.NumPending(o.sseq, _EMPTY_, false) - } - // Consumer with filters. - for _, filter := range o.subjf { - lnpc, lnpf := o.mset.store.NumPending(o.sseq, filter.subject, false) - npc += lnpc - if lnpf > npf { - npf = lnpf // Always last - } - } - return npc, npf + return o.mset.store.NumPending(o.sseq, _EMPTY_, isLastPerSubject) } func convertToHeadersOnly(pmsg *jsPubMsg) { @@ -4465,9 +4466,24 @@ func (o *consumer) trackPending(sseq, dseq uint64) { if o.pending == nil { o.pending = make(map[uint64]*Pending) } - if o.ptmr == nil { - o.ptmr = time.AfterFunc(o.ackWait(0), o.checkPending) + + // We could have a backoff that set a timer higher than what we need for this message. + // In that case, reset to lowest backoff required for a message redelivery. + minDelay := o.ackWait(0) + if l := len(o.cfg.BackOff); l > 0 { + bi := int(o.rdc[sseq]) + if bi < 0 { + bi = 0 + } else if bi >= l { + bi = l - 1 + } + minDelay = o.ackWait(o.cfg.BackOff[bi]) } + minDeadline := time.Now().Add(minDelay) + if o.ptmr == nil || o.ptmrEnd.After(minDeadline) { + o.resetPtmr(minDelay) + } + if p, ok := o.pending[sseq]; ok { // Update timestamp but keep original consumer delivery sequence. // So do not update p.Sequence. @@ -4590,24 +4606,21 @@ func (o *consumer) removeFromRedeliverQueue(seq uint64) bool { // Checks the pending messages. func (o *consumer) checkPending() { - o.mu.RLock() + o.mu.Lock() + defer o.mu.Unlock() + mset := o.mset // On stop, mset and timer will be nil. if o.closed || mset == nil || o.ptmr == nil { - stopAndClearTimer(&o.ptmr) - o.mu.RUnlock() + o.stopAndClearPtmr() return } - o.mu.RUnlock() var shouldUpdateState bool var state StreamState mset.store.FastState(&state) fseq := state.FirstSeq - o.mu.Lock() - defer o.mu.Unlock() - now := time.Now().UnixNano() ttl := int64(o.cfg.AckWait) next := int64(o.ackWait(0)) @@ -4623,11 +4636,7 @@ func (o *consumer) checkPending() { check := len(o.pending) > 1024 for seq, p := range o.pending { if check && atomic.LoadInt64(&o.awl) > 0 { - if o.ptmr == nil { - o.ptmr = time.AfterFunc(100*time.Millisecond, o.checkPending) - } else { - o.ptmr.Reset(100 * time.Millisecond) - } + o.resetPtmr(100 * time.Millisecond) return } // Check if these are no longer valid. @@ -4694,15 +4703,10 @@ func (o *consumer) checkPending() { } if len(o.pending) > 0 { - delay := time.Duration(next) - if o.ptmr == nil { - o.ptmr = time.AfterFunc(delay, o.checkPending) - } else { - o.ptmr.Reset(o.ackWait(delay)) - } + o.resetPtmr(time.Duration(next)) } else { // Make sure to stop timer and clear out any re delivery queues - stopAndClearTimer(&o.ptmr) + o.stopAndClearPtmr() o.rdq = nil o.rdqi.Empty() o.pending = nil @@ -4890,7 +4894,7 @@ func (o *consumer) selectStartingSeqNo() { for _, filter := range o.subjf { // Use first sequence since this is more optimized atm. 
ss := o.mset.store.FilteredState(state.FirstSeq, filter.subject) - if ss.First > o.sseq && ss.First < nseq { + if ss.First >= o.sseq && ss.First < nseq { nseq = ss.First } } @@ -5188,7 +5192,7 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error { o.client = nil sysc := o.sysc o.sysc = nil - stopAndClearTimer(&o.ptmr) + o.stopAndClearPtmr() stopAndClearTimer(&o.dtmr) stopAndClearTimer(&o.gwdtmr) delivery := o.cfg.DeliverSubject @@ -5242,12 +5246,6 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error { if dflag { n.Delete() } else { - // Try to install snapshot on clean exit - if o.store != nil && (o.retention != LimitsPolicy || n.NeedSnapshot()) { - if snap, err := o.store.EncodedState(); err == nil { - n.InstallSnapshot(snap) - } - } n.Stop() } } @@ -5329,12 +5327,14 @@ func (o *consumer) cleanupNoInterestMessages(mset *stream, ignoreInterest bool) return } + mset.mu.RUnlock() + mset.mu.Lock() for seq := start; seq <= stop; seq++ { if mset.noInterest(seq, co) { rmseqs = append(rmseqs, seq) } } - mset.mu.RUnlock() + mset.mu.Unlock() // These can be removed. for _, seq := range rmseqs { @@ -5590,8 +5590,9 @@ func (o *consumer) checkStateForInterestStream(ss *StreamState) error { o.mu.Lock() // Update our check floor. - if seq > o.chkflr { - o.chkflr = seq + // Check floor must never be greater than ack floor+1, otherwise subsequent calls to this function would skip work. + if asflr+1 > o.chkflr { + o.chkflr = asflr + 1 } // See if we need to process this update if our parent stream is not a limits policy stream. state, _ = o.store.State() @@ -5610,3 +5611,17 @@ func (o *consumer) checkStateForInterestStream(ss *StreamState) error { } return nil } + +func (o *consumer) resetPtmr(delay time.Duration) { + if o.ptmr == nil { + o.ptmr = time.AfterFunc(delay, o.checkPending) + } else { + o.ptmr.Reset(delay) + } + o.ptmrEnd = time.Now().Add(delay) +} + +func (o *consumer) stopAndClearPtmr() { + stopAndClearTimer(&o.ptmr) + o.ptmrEnd = time.Time{} +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/events.go b/vendor/github.com/nats-io/nats-server/v2/server/events.go index 3f8ef05014..7c891b423d 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/events.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/events.go @@ -315,6 +315,15 @@ type ClientInfo struct { Nonce string `json:"nonce,omitempty"` } +// forAssignmentSnap returns the minimum amount of ClientInfo we need for assignment snapshots. +func (ci *ClientInfo) forAssignmentSnap() *ClientInfo { + return &ClientInfo{ + Account: ci.Account, + Service: ci.Service, + Cluster: ci.Cluster, + } +} + // ServerStats hold various statistics that we will periodically send out. type ServerStats struct { Start time.Time `json:"start"` @@ -1938,7 +1947,9 @@ type ServerAPIResponse struct { compress compressionType } -// Specialized response types for unmarshalling. +// Specialized response types for unmarshalling. 
These structures are not +// used in the server code and only there for users of the Z endpoints to +// unmarshal the data without having to create these structs in their code // ServerAPIConnzResponse is the response type connz type ServerAPIConnzResponse struct { @@ -1947,6 +1958,69 @@ type ServerAPIConnzResponse struct { Error *ApiError `json:"error,omitempty"` } +// ServerAPIRoutezResponse is the response type for routez +type ServerAPIRoutezResponse struct { + Server *ServerInfo `json:"server"` + Data *Routez `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPIGatewayzResponse is the response type for gatewayz +type ServerAPIGatewayzResponse struct { + Server *ServerInfo `json:"server"` + Data *Gatewayz `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPIJszResponse is the response type for jsz +type ServerAPIJszResponse struct { + Server *ServerInfo `json:"server"` + Data *JSInfo `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPIHealthzResponse is the response type for healthz +type ServerAPIHealthzResponse struct { + Server *ServerInfo `json:"server"` + Data *HealthStatus `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPIVarzResponse is the response type for varz +type ServerAPIVarzResponse struct { + Server *ServerInfo `json:"server"` + Data *Varz `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPISubszResponse is the response type for subsz +type ServerAPISubszResponse struct { + Server *ServerInfo `json:"server"` + Data *Subsz `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPILeafzResponse is the response type for leafz +type ServerAPILeafzResponse struct { + Server *ServerInfo `json:"server"` + Data *Leafz `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPIAccountzResponse is the response type for accountz +type ServerAPIAccountzResponse struct { + Server *ServerInfo `json:"server"` + Data *Accountz `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + +// ServerAPIExpvarzResponse is the response type for expvarz +type ServerAPIExpvarzResponse struct { + Server *ServerInfo `json:"server"` + Data *ExpvarzStatus `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + // statszReq is a request for us to respond with current statsz. 
func (s *Server) statszReq(sub *subscription, c *client, _ *Account, subject, reply string, hdr, msg []byte) { if !s.EventsEnabled() { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go index ec66ad28f2..c5920587da 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go @@ -29,6 +29,7 @@ import ( "io" "io/fs" "math" + mrand "math/rand" "net" "os" "path/filepath" @@ -579,6 +580,9 @@ func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { if cfg.Storage != FileStorage { return fmt.Errorf("fileStore requires file storage type in config") } + if cfg.MaxMsgsPer < -1 { + cfg.MaxMsgsPer = -1 + } fs.mu.Lock() new_cfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg} @@ -609,7 +613,7 @@ func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { fs.ageChk = nil } - if fs.cfg.MaxMsgsPer > 0 && fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer { + if fs.cfg.MaxMsgsPer > 0 && (old_cfg.MaxMsgsPer == 0 || fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer) { fs.enforceMsgPerSubjectLimit(true) } fs.mu.Unlock() @@ -1739,6 +1743,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { var matched bool mb := fs.lmb if mb == nil || mb.index != blkIndex { + os.Remove(fn) fs.warn("Stream state block does not exist or index mismatch") return errCorruptState } @@ -1777,6 +1782,14 @@ func (fs *fileStore) recoverFullState() (rerr error) { } } + // We check first and last seq and number of msgs and bytes. If there is a difference, + // return and error so we rebuild from the message block state on disk. + if !trackingStatesEqual(&fs.state, &mstate) { + os.Remove(fn) + fs.warn("Stream state encountered internal inconsistency on recover") + return errCorruptState + } + return nil } @@ -2809,7 +2822,9 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) _tsa, _fsa := [32]string{}, [32]string{} tsa, fsa := _tsa[:0], _fsa[:0] - fsa = tokenizeSubjectIntoSlice(fsa[:0], filter) + if wc { + fsa = tokenizeSubjectIntoSlice(fsa[:0], filter) + } isMatch := func(subj string) bool { if isAll { @@ -2903,7 +2918,6 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) mb := fs.blks[i] // Hold write lock in case we need to load cache. mb.mu.Lock() - var t uint64 if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) { total += mb.msgs mb.mu.Unlock() @@ -2918,6 +2932,7 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) // Mark fss activity. mb.lsts = time.Now().UnixNano() + var t uint64 var havePartial bool mb.fss.Match(stringToBytes(filter), func(bsubj []byte, ss *SimpleState) { if havePartial { @@ -2945,8 +2960,12 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) } // Clear on partial. t = 0 + start := sseq + if fseq := atomic.LoadUint64(&mb.first.seq); fseq > start { + start = fseq + } var smv StoreMsg - for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { + for seq, lseq := start, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && isMatch(sm.subj) { t++ } @@ -3051,6 +3070,296 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) return total, validThrough } +// NumPending will return the number of pending messages matching any subject in the sublist starting at sequence. 
+// Optimized for stream num pending calculations for consumers with lots of filtered subjects. +// Subjects should not overlap, this property is held when doing multi-filtered consumers. +func (fs *fileStore) NumPendingMulti(sseq uint64, sl *Sublist, lastPerSubject bool) (total, validThrough uint64) { + fs.mu.RLock() + defer fs.mu.RUnlock() + + // This can always be last for these purposes. + validThrough = fs.state.LastSeq + + if fs.state.Msgs == 0 || sseq > fs.state.LastSeq { + return 0, validThrough + } + + // If sseq is less then our first set to first. + if sseq < fs.state.FirstSeq { + sseq = fs.state.FirstSeq + } + // Track starting for both block for the sseq and staring block that matches any subject. + var seqStart int + // See if we need to figure out starting block per sseq. + if sseq > fs.state.FirstSeq { + // This should not, but can return -1, so make sure we check to avoid panic below. + if seqStart, _ = fs.selectMsgBlockWithIndex(sseq); seqStart < 0 { + seqStart = 0 + } + } + + isAll := sl == nil + + // See if filter was provided but its the only subject. + if !isAll && fs.psim.Size() == 1 { + fs.psim.Iter(func(subject []byte, _ *psi) bool { + isAll = sl.HasInterest(bytesToString(subject)) + return true + }) + } + // If we are isAll and have no deleted we can do a simpler calculation. + if !lastPerSubject && isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs { + if sseq == 0 { + return fs.state.Msgs, validThrough + } + return fs.state.LastSeq - sseq + 1, validThrough + } + // Setup the isMatch function. + isMatch := func(subj string) bool { + if isAll { + return true + } + return sl.HasInterest(subj) + } + + // Handle last by subject a bit differently. + // We will scan PSIM since we accurately track the last block we have seen the subject in. This + // allows us to only need to load at most one block now. + // For the last block, we need to track the subjects that we know are in that block, and track seen + // while in the block itself, but complexity there worth it. + if lastPerSubject { + // If we want all and our start sequence is equal or less than first return number of subjects. + if isAll && sseq <= fs.state.FirstSeq { + return uint64(fs.psim.Size()), validThrough + } + // If we are here we need to scan. We are going to scan the PSIM looking for lblks that are >= seqStart. + // This will build up a list of all subjects from the selected block onward. + lbm := make(map[string]bool) + mb := fs.blks[seqStart] + bi := mb.index + + subs := make([]*subscription, 0, sl.Count()) + sl.All(&subs) + for _, sub := range subs { + fs.psim.Match(sub.subject, func(subj []byte, psi *psi) { + // If the select blk start is greater than entry's last blk skip. + if bi > psi.lblk { + return + } + total++ + // We will track the subjects that are an exact match to the last block. + // This is needed for last block processing. + if psi.lblk == bi { + lbm[string(subj)] = true + } + }) + } + + // Now check if we need to inspect the seqStart block. + // Grab write lock in case we need to load in msgs. + mb.mu.Lock() + var shouldExpire bool + // We need to walk this block to correct accounting from above. + if sseq > mb.first.seq { + // Track the ones we add back in case more than one. + seen := make(map[string]bool) + // We need to discount the total by subjects seen before sseq, but also add them right back in if they are >= sseq for this blk. + // This only should be subjects we know have the last blk in this block. 
+ if mb.cacheNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } + var smv StoreMsg + for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { + sm, _ := mb.cacheLookup(seq, &smv) + if sm == nil || sm.subj == _EMPTY_ || !lbm[sm.subj] { + continue + } + if isMatch(sm.subj) { + // If less than sseq adjust off of total as long as this subject matched the last block. + if seq < sseq { + if !seen[sm.subj] { + total-- + seen[sm.subj] = true + } + } else if seen[sm.subj] { + // This is equal or more than sseq, so add back in. + total++ + // Make sure to not process anymore. + delete(seen, sm.subj) + } + } + } + } + // If we loaded the block try to force expire. + if shouldExpire { + mb.tryForceExpireCacheLocked() + } + mb.mu.Unlock() + return total, validThrough + } + + // If we would need to scan more from the beginning, revert back to calculating directly here. + if seqStart >= (len(fs.blks) / 2) { + for i := seqStart; i < len(fs.blks); i++ { + var shouldExpire bool + mb := fs.blks[i] + // Hold write lock in case we need to load cache. + mb.mu.Lock() + if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) { + total += mb.msgs + mb.mu.Unlock() + continue + } + // If we are here we need to at least scan the subject fss. + // Make sure we have fss loaded. + if mb.fssNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } + // Mark fss activity. + mb.lsts = time.Now().UnixNano() + + var t uint64 + var havePartial bool + IntersectStree[SimpleState](mb.fss, sl, func(bsubj []byte, ss *SimpleState) { + subj := bytesToString(bsubj) + if havePartial { + // If we already found a partial then don't do anything else. + return + } + if ss.firstNeedsUpdate { + mb.recalculateFirstForSubj(subj, ss.First, ss) + } + if sseq <= ss.First { + t += ss.Msgs + } else if sseq <= ss.Last { + // We matched but its a partial. + havePartial = true + } + }) + + // See if we need to scan msgs here. + if havePartial { + // Make sure we have the cache loaded. + if mb.cacheNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } + // Clear on partial. + t = 0 + start := sseq + if fseq := atomic.LoadUint64(&mb.first.seq); fseq > start { + start = fseq + } + var smv StoreMsg + for seq, lseq := start, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { + if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && isMatch(sm.subj) { + t++ + } + } + } + // If we loaded this block for this operation go ahead and expire it here. + if shouldExpire { + mb.tryForceExpireCacheLocked() + } + mb.mu.Unlock() + total += t + } + return total, validThrough + } + + // If we are here it's better to calculate totals from psim and adjust downward by scanning less blocks. + start := uint32(math.MaxUint32) + subs := make([]*subscription, 0, sl.Count()) + sl.All(&subs) + for _, sub := range subs { + fs.psim.Match(sub.subject, func(_ []byte, psi *psi) { + total += psi.total + // Keep track of start index for this subject. + if psi.fblk < start { + start = psi.fblk + } + }) + } + // See if we were asked for all, if so we are done. + if sseq <= fs.state.FirstSeq { + return total, validThrough + } + + // If we are here we need to calculate partials for the first blocks. + firstSubjBlk := fs.bim[start] + var firstSubjBlkFound bool + // Adjust in case not found. + if firstSubjBlk == nil { + firstSubjBlkFound = true + } + + // Track how many we need to adjust against the total. 
+ var adjust uint64 + for i := 0; i <= seqStart; i++ { + mb := fs.blks[i] + // We can skip blks if we know they are below the first one that has any subject matches. + if !firstSubjBlkFound { + if firstSubjBlkFound = (mb == firstSubjBlk); !firstSubjBlkFound { + continue + } + } + // We need to scan this block. + var shouldExpire bool + mb.mu.Lock() + // Check if we should include all of this block in adjusting. If so work with metadata. + if sseq > atomic.LoadUint64(&mb.last.seq) { + if isAll { + adjust += mb.msgs + } else { + // We need to adjust for all matches in this block. + // Make sure we have fss loaded. This loads whole block now. + if mb.fssNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } + // Mark fss activity. + mb.lsts = time.Now().UnixNano() + IntersectStree(mb.fss, sl, func(bsubj []byte, ss *SimpleState) { + adjust += ss.Msgs + }) + } + } else { + // This is the last block. We need to scan per message here. + if mb.cacheNotLoaded() { + mb.loadMsgsWithLock() + shouldExpire = true + } + var last = atomic.LoadUint64(&mb.last.seq) + if sseq < last { + last = sseq + } + // We need to walk all messages in this block + var smv StoreMsg + for seq := atomic.LoadUint64(&mb.first.seq); seq < last; seq++ { + sm, _ := mb.cacheLookup(seq, &smv) + if sm == nil || sm.subj == _EMPTY_ { + continue + } + // Check if it matches our filter. + if sm.seq < sseq && isMatch(sm.subj) { + adjust++ + } + } + } + // If we loaded the block try to force expire. + if shouldExpire { + mb.tryForceExpireCacheLocked() + } + mb.mu.Unlock() + } + // Make final adjustment. + total -= adjust + + return total, validThrough +} + // SubjectsTotal return message totals per subject. func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 { fs.mu.RLock() @@ -7259,16 +7568,22 @@ func (fs *fileStore) reset() error { } // Return all active tombstones in this msgBlock. -// Write lock should be held. func (mb *msgBlock) tombs() []msgId { - var tombs []msgId + mb.mu.Lock() + defer mb.mu.Unlock() + return mb.tombsLocked() +} - if !mb.cacheAlreadyLoaded() { +// Return all active tombstones in this msgBlock. +// Write lock should be held. +func (mb *msgBlock) tombsLocked() []msgId { + if mb.cacheNotLoaded() { if err := mb.loadMsgsWithLock(); err != nil { return nil } } + var tombs []msgId var le = binary.LittleEndian buf := mb.cache.buf @@ -7349,7 +7664,7 @@ func (fs *fileStore) Truncate(seq uint64) error { for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() { mb.mu.Lock() // We do this to load tombs. - tombs = append(tombs, mb.tombs()...) + tombs = append(tombs, mb.tombsLocked()...) purged += mb.msgs bytes += mb.bytes fs.removeMsgBlock(mb) @@ -7578,6 +7893,9 @@ func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *Si continue } ss.First = seq + if ss.Msgs == 1 { + ss.Last = seq + } return } } @@ -7824,7 +8142,11 @@ func (fs *fileStore) setSyncTimer() { if fs.syncTmr != nil { fs.syncTmr.Reset(fs.fcfg.SyncInterval) } else { - fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) + // First time this fires will be between SyncInterval/2 and SyncInterval, + // so that different stores are spread out, rather than having many of + // them trying to all sync at once, causing blips and contending dios. 
+ start := (fs.fcfg.SyncInterval / 2) + (time.Duration(mrand.Int63n(int64(fs.fcfg.SyncInterval / 2)))) + fs.syncTmr = time.AfterFunc(start, fs.syncBlocks) } } @@ -7847,8 +8169,10 @@ func (fs *fileStore) flushStreamStateLoop(qch, done chan struct{}) { defer close(done) // Make sure we do not try to write these out too fast. + // Spread these out for large numbers on a server restart. const writeThreshold = 2 * time.Minute - t := time.NewTicker(writeThreshold) + writeJitter := time.Duration(mrand.Int63n(int64(30 * time.Second))) + t := time.NewTicker(writeThreshold + writeJitter) defer t.Stop() for { @@ -8037,7 +8361,7 @@ func (fs *fileStore) _writeFullState(force bool) error { // Snapshot prior dirty count. priorDirty := fs.dirty - statesEqual := trackingStatesEqual(&fs.state, &mstate) || len(fs.blks) > 0 + statesEqual := trackingStatesEqual(&fs.state, &mstate) // Release lock. fs.mu.Unlock() @@ -9010,14 +9334,6 @@ func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error { } func (o *consumerFileStore) Update(state *ConsumerState) error { - o.mu.Lock() - defer o.mu.Unlock() - - // Check to see if this is an outdated update. - if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream { - return nil - } - // Sanity checks. if state.AckFloor.Consumer > state.Delivered.Consumer { return fmt.Errorf("bad ack floor for consumer") @@ -9045,6 +9361,15 @@ func (o *consumerFileStore) Update(state *ConsumerState) error { } } + // Replace our state. + o.mu.Lock() + defer o.mu.Unlock() + + // Check to see if this is an outdated update. + if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream { + return fmt.Errorf("old update ignored") + } + o.state.Delivered = state.Delivered o.state.AckFloor = state.AckFloor o.state.Pending = pending @@ -9712,14 +10037,22 @@ func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) { // sets O_SYNC on the open file if SyncAlways is set. The dios semaphore is // handled automatically by this function, so don't wrap calls to it in dios. func (fs *fileStore) writeFileWithOptionalSync(name string, data []byte, perm fs.FileMode) error { + if fs.fcfg.SyncAlways { + return writeFileWithSync(name, data, perm) + } <-dios defer func() { dios <- struct{}{} }() - flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC - if fs.fcfg.SyncAlways { - flags |= os.O_SYNC - } + return os.WriteFile(name, data, perm) +} + +func writeFileWithSync(name string, data []byte, perm fs.FileMode) error { + <-dios + defer func() { + dios <- struct{}{} + }() + flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC | os.O_SYNC f, err := os.OpenFile(name, flags, perm) if err != nil { return err diff --git a/vendor/github.com/nats-io/nats-server/v2/server/gateway.go b/vendor/github.com/nats-io/nats-server/v2/server/gateway.go index 82df196e2f..46dd7260ec 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/gateway.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/gateway.go @@ -1900,7 +1900,7 @@ func (c *client) processGatewayAccountSub(accName string) error { // the sublist if present. 
// func (c *client) processGatewayRUnsub(arg []byte) error { - accName, subject, queue, err := c.parseUnsubProto(arg) + _, accName, subject, queue, err := c.parseUnsubProto(arg, true, false) if err != nil { return fmt.Errorf("processGatewaySubjectUnsub %s", err.Error()) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go index e3f073fa95..2e606e6a6f 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go @@ -461,6 +461,8 @@ func (s *Server) enableJetStream(cfg JetStreamConfig) error { if err := s.enableJetStreamClustering(); err != nil { return err } + // Set our atomic bool to clustered. + s.jsClustered.Store(true) } // Mark when we are up and running. @@ -965,6 +967,8 @@ func (s *Server) shutdownJetStream() { cc.c = nil } cc.meta = nil + // Set our atomic bool to false. + s.jsClustered.Store(false) } js.mu.Unlock() @@ -2103,7 +2107,7 @@ func (js *jetStream) wouldExceedLimits(storeType StorageType, sz int) bool { } else { total, max = &js.storeUsed, js.config.MaxStore } - return atomic.LoadInt64(total) > (max + int64(sz)) + return (atomic.LoadInt64(total) + int64(sz)) > max } func (js *jetStream) limitsExceeded(storeType StorageType) bool { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go index 27e8f4b626..de014e74b7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go @@ -2556,7 +2556,7 @@ func (s *Server) jsLeaderServerStreamMoveRequest(sub *subscription, c *client, _ cfg.Placement = origPlacement s.Noticef("Requested move for stream '%s > %s' R=%d from %+v to %+v", - streamName, accName, cfg.Replicas, s.peerSetToNames(currPeers), s.peerSetToNames(peers)) + accName, streamName, cfg.Replicas, s.peerSetToNames(currPeers), s.peerSetToNames(peers)) // We will always have peers and therefore never do a callout, therefore it is safe to call inline s.jsClusteredStreamUpdateRequest(&ciNew, targetAcc.(*Account), subject, reply, rmsg, &cfg, peers) @@ -2662,7 +2662,7 @@ func (s *Server) jsLeaderServerStreamCancelMoveRequest(sub *subscription, c *cli } s.Noticef("Requested cancel of move: R=%d '%s > %s' to peer set %+v and restore previous peer set %+v", - cfg.Replicas, streamName, accName, s.peerSetToNames(currPeers), s.peerSetToNames(peers)) + cfg.Replicas, accName, streamName, s.peerSetToNames(currPeers), s.peerSetToNames(peers)) // We will always have peers and therefore never do a callout, therefore it is safe to call inline s.jsClusteredStreamUpdateRequest(&ciNew, targetAcc.(*Account), subject, reply, rmsg, &cfg, peers) @@ -3557,7 +3557,7 @@ func (s *Server) processStreamRestore(ci *ClientInfo, acc *Account, cfg *StreamC if err != nil { resp.Error = NewJSStreamRestoreError(err, Unless(err)) s.Warnf("Restore failed for %s for stream '%s > %s' in %v", - friendlyBytes(int64(total)), streamName, acc.Name, end.Sub(start)) + friendlyBytes(int64(total)), acc.Name, streamName, end.Sub(start)) } else { resp.StreamInfo = &StreamInfo{ Created: mset.createdTime(), @@ -3566,7 +3566,7 @@ func (s *Server) processStreamRestore(ci *ClientInfo, acc *Account, cfg *StreamC TimeStamp: time.Now().UTC(), } s.Noticef("Completed restore of %s for stream '%s > %s' in %v", - friendlyBytes(int64(total)), streamName, acc.Name, 
end.Sub(start).Round(time.Millisecond)) + friendlyBytes(int64(total)), acc.Name, streamName, end.Sub(start).Round(time.Millisecond)) } // On the last EOF, send back the stream info or error status. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go index 9d7fc0550d..8f08b1e502 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go @@ -134,14 +134,15 @@ type streamAssignment struct { Config *StreamConfig `json:"stream"` Group *raftGroup `json:"group"` Sync string `json:"sync"` - Subject string `json:"subject"` - Reply string `json:"reply"` + Subject string `json:"subject,omitempty"` + Reply string `json:"reply,omitempty"` Restore *StreamState `json:"restore_state,omitempty"` // Internal - consumers map[string]*consumerAssignment - responded bool - recovering bool - err error + consumers map[string]*consumerAssignment + responded bool + recovering bool + reassigning bool // i.e. due to placement issues, lack of resources, etc. + err error } // consumerAssignment is what the meta controller uses to assign consumers to streams. @@ -152,12 +153,13 @@ type consumerAssignment struct { Stream string `json:"stream"` Config *ConsumerConfig `json:"consumer"` Group *raftGroup `json:"group"` - Subject string `json:"subject"` - Reply string `json:"reply"` + Subject string `json:"subject,omitempty"` + Reply string `json:"reply,omitempty"` State *ConsumerState `json:"state,omitempty"` // Internal responded bool recovering bool + pending bool deleted bool err error } @@ -222,11 +224,7 @@ func (s *Server) getJetStreamCluster() (*jetStream, *jetStreamCluster) { } func (s *Server) JetStreamIsClustered() bool { - js := s.getJetStream() - if js == nil { - return false - } - return js.isClustered() + return s.jsClustered.Load() } func (s *Server) JetStreamIsLeader() bool { @@ -780,10 +778,17 @@ func (js *jetStream) setupMetaGroup() error { // Setup our WAL for the metagroup. sysAcc := s.SystemAccount() + if sysAcc == nil { + return ErrNoSysAccount + } storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, defaultMetaGroupName) + js.srv.optsMu.RLock() + syncAlways := js.srv.opts.SyncAlways + syncInterval := js.srv.opts.SyncInterval + js.srv.optsMu.RUnlock() fs, err := newFileStoreWithCreated( - FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMetaFSBlkSize, AsyncFlush: false, srv: s}, + FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMetaFSBlkSize, AsyncFlush: false, SyncAlways: syncAlways, SyncInterval: syncInterval, srv: s}, StreamConfig{Name: defaultMetaGroupName, Storage: FileStorage}, time.Now().UTC(), s.jsKeyGen(s.getOpts().JetStreamKey, defaultMetaGroupName), @@ -1131,9 +1136,10 @@ func (js *jetStream) isMetaRecovering() bool { // During recovery track any stream and consumer delete and update operations. type recoveryUpdates struct { removeStreams map[string]*streamAssignment - removeConsumers map[string]*consumerAssignment + removeConsumers map[string]map[string]*consumerAssignment + addStreams map[string]*streamAssignment updateStreams map[string]*streamAssignment - updateConsumers map[string]*consumerAssignment + updateConsumers map[string]map[string]*consumerAssignment } // Called after recovery of the cluster on startup to check for any orphans. 
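As an aside to the recoveryUpdates change above: the pending consumer maps move to a two-level layout, keyed first by a per-stream recovery key and then by the consumer's own key, so that when a stream removal is replayed all of that stream's queued consumer ops can be dropped with a single map delete instead of a scan over every consumer. Below is a minimal standalone Go sketch of that idea only; the struct and the key strings are hypothetical stand-ins, not the server's real consumerAssignment or the output of recoveryKey()/streamRecoveryKey().

package main

import "fmt"

// Hypothetical stand-in for the real assignment type in jetstream_cluster.go.
type consumerAssignment struct{ Stream, Name string }

// Pending consumer ops keyed by stream key, then consumer key.
type pendingConsumers map[string]map[string]*consumerAssignment

func (p pendingConsumers) add(streamKey, consumerKey string, ca *consumerAssignment) {
    if _, ok := p[streamKey]; !ok {
        p[streamKey] = make(map[string]*consumerAssignment)
    }
    p[streamKey][consumerKey] = ca
}

func main() {
    // Example keys are made up for illustration.
    update := pendingConsumers{}
    update.add("$G:ORDERS", "$G:ORDERS:C1", &consumerAssignment{Stream: "ORDERS", Name: "C1"})
    update.add("$G:ORDERS", "$G:ORDERS:C2", &consumerAssignment{Stream: "ORDERS", Name: "C2"})
    update.add("$G:EVENTS", "$G:EVENTS:C1", &consumerAssignment{Stream: "EVENTS", Name: "C1"})

    // A stream removal seen during replay clears all of its pending consumer ops at once.
    delete(update, "$G:ORDERS")
    fmt.Println(len(update)) // 1
}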
@@ -1310,7 +1316,7 @@ func (js *jetStream) monitorCluster() { isLeader bool lastSnapTime time.Time compactSizeMin = uint64(8 * 1024 * 1024) // 8MB - minSnapDelta = 10 * time.Second + minSnapDelta = 30 * time.Second ) // Highwayhash key for generating hashes. @@ -1338,9 +1344,10 @@ func (js *jetStream) monitorCluster() { ru := &recoveryUpdates{ removeStreams: make(map[string]*streamAssignment), - removeConsumers: make(map[string]*consumerAssignment), + removeConsumers: make(map[string]map[string]*consumerAssignment), + addStreams: make(map[string]*streamAssignment), updateStreams: make(map[string]*streamAssignment), - updateConsumers: make(map[string]*consumerAssignment), + updateConsumers: make(map[string]map[string]*consumerAssignment), } // Make sure to cancel any pending checkForOrphans calls if the @@ -1351,6 +1358,8 @@ func (js *jetStream) monitorCluster() { for { select { case <-s.quitCh: + // Server shutting down, but we might receive this before qch, so try to snapshot. + doSnapshot() return case <-rqch: return @@ -1364,23 +1373,31 @@ func (js *jetStream) monitorCluster() { ces := aq.pop() for _, ce := range ces { if ce == nil { - // Signals we have replayed all of our metadata. - js.clearMetaRecovering() // Process any removes that are still valid after recovery. - for _, ca := range ru.removeConsumers { - js.processConsumerRemoval(ca) + for _, cas := range ru.removeConsumers { + for _, ca := range cas { + js.processConsumerRemoval(ca) + } } for _, sa := range ru.removeStreams { js.processStreamRemoval(sa) } + // Process stream additions. + for _, sa := range ru.addStreams { + js.processStreamAssignment(sa) + } // Process pending updates. for _, sa := range ru.updateStreams { js.processUpdateStreamAssignment(sa) } // Now consumers. - for _, ca := range ru.updateConsumers { - js.processConsumerAssignment(ca) + for _, cas := range ru.updateConsumers { + for _, ca := range cas { + js.processConsumerAssignment(ca) + } } + // Signals we have replayed all of our metadata. + js.clearMetaRecovering() // Clear. ru = nil s.Debugf("Recovered JetStream cluster metadata") @@ -1389,12 +1406,14 @@ func (js *jetStream) monitorCluster() { go checkHealth() continue } - if didSnap, didStreamRemoval, didConsumerRemoval, err := js.applyMetaEntries(ce.Entries, ru); err == nil { - _, nb := n.Applied(ce.Index) + if didSnap, didStreamRemoval, _, err := js.applyMetaEntries(ce.Entries, ru); err == nil { + var nb uint64 + // Some entries can fail without an error when shutting down, don't move applied forward. + if !js.isShuttingDown() { + _, nb = n.Applied(ce.Index) + } if js.hasPeerEntries(ce.Entries) || didStreamRemoval || (didSnap && !isLeader) { doSnapshot() - } else if didConsumerRemoval && time.Since(lastSnapTime) > minSnapDelta/2 { - doSnapshot() } else if nb > compactSizeMin && time.Since(lastSnapTime) > minSnapDelta { doSnapshot() } @@ -1406,10 +1425,6 @@ func (js *jetStream) monitorCluster() { aq.recycle(&ces) case isLeader = <-lch: - // For meta layer synchronize everyone to our state on becoming leader. - if isLeader && n.ApplyQ().len() == 0 { - n.SendSnapshot(js.metaSnapshot()) - } // Process the change. 
js.processLeaderChange(isLeader) if isLeader { @@ -1514,9 +1529,12 @@ func (js *jetStream) clusterStreamConfig(accName, streamName string) (StreamConf } func (js *jetStream) metaSnapshot() []byte { + start := time.Now() js.mu.RLock() + s := js.srv cc := js.cluster nsa := 0 + nca := 0 for _, asa := range cc.streams { nsa += len(asa) } @@ -1524,7 +1542,7 @@ func (js *jetStream) metaSnapshot() []byte { for _, asa := range cc.streams { for _, sa := range asa { wsa := writeableStreamAssignment{ - Client: sa.Client, + Client: sa.Client.forAssignmentSnap(), Created: sa.Created, Config: sa.Config, Group: sa.Group, @@ -1532,7 +1550,17 @@ func (js *jetStream) metaSnapshot() []byte { Consumers: make([]*consumerAssignment, 0, len(sa.consumers)), } for _, ca := range sa.consumers { - wsa.Consumers = append(wsa.Consumers, ca) + // Skip if the consumer is pending, we can't include it in our snapshot. + // If the proposal fails after we marked it pending, it would result in a ghost consumer. + if ca.pending { + continue + } + cca := *ca + cca.Stream = wsa.Config.Name // Needed for safe roll-backs. + cca.Client = cca.Client.forAssignmentSnap() + cca.Subject, cca.Reply = _EMPTY_, _EMPTY_ + wsa.Consumers = append(wsa.Consumers, &cca) + nca++ } streams = append(streams, wsa) } @@ -1543,10 +1571,23 @@ func (js *jetStream) metaSnapshot() []byte { return nil } + // Track how long it took to marshal the JSON + mstart := time.Now() b, _ := json.Marshal(streams) + mend := time.Since(mstart) + js.mu.RUnlock() - return s2.EncodeBetter(nil, b) + // Track how long it took to compress the JSON + cstart := time.Now() + snap := s2.Encode(nil, b) + cend := time.Since(cstart) + + if took := time.Since(start); took > time.Second { + s.rateLimitFormatWarnf("Metalayer snapshot took %.3fs (streams: %d, consumers: %d, marshal: %.3fs, s2: %.3fs, uncompressed: %d, compressed: %d)", + took.Seconds(), nsa, nca, mend.Seconds(), cend.Seconds(), len(b), len(snap)) + } + return snap } func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecovering bool) error { @@ -1574,6 +1615,9 @@ func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecove if len(wsa.Consumers) > 0 { sa.consumers = make(map[string]*consumerAssignment) for _, ca := range wsa.Consumers { + if ca.Stream == _EMPTY_ { + ca.Stream = sa.Config.Name // Rehydrate from the stream name. 
+ } sa.consumers[ca.Name] = ca } } @@ -1630,7 +1674,10 @@ func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecove if isRecovering { key := sa.recoveryKey() ru.removeStreams[key] = sa + delete(ru.addStreams, key) delete(ru.updateStreams, key) + delete(ru.updateConsumers, key) + delete(ru.removeConsumers, key) } else { js.processStreamRemoval(sa) } @@ -1654,6 +1701,7 @@ func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecove if isRecovering { key := sa.recoveryKey() ru.updateStreams[key] = sa + delete(ru.addStreams, key) delete(ru.removeStreams, key) } else { js.processUpdateStreamAssignment(sa) @@ -1665,8 +1713,14 @@ func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecove js.setConsumerAssignmentRecovering(ca) if isRecovering { key := ca.recoveryKey() - ru.removeConsumers[key] = ca - delete(ru.updateConsumers, key) + skey := ca.streamRecoveryKey() + if _, ok := ru.removeConsumers[skey]; !ok { + ru.removeConsumers[skey] = map[string]*consumerAssignment{} + } + ru.removeConsumers[skey][key] = ca + if consumers, ok := ru.updateConsumers[skey]; ok { + delete(consumers, key) + } } else { js.processConsumerRemoval(ca) } @@ -1675,8 +1729,14 @@ func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecove js.setConsumerAssignmentRecovering(ca) if isRecovering { key := ca.recoveryKey() - delete(ru.removeConsumers, key) - ru.updateConsumers[key] = ca + skey := ca.streamRecoveryKey() + if consumers, ok := ru.removeConsumers[skey]; ok { + delete(consumers, key) + } + if _, ok := ru.updateConsumers[skey]; !ok { + ru.updateConsumers[skey] = map[string]*consumerAssignment{} + } + ru.updateConsumers[skey][key] = ca } else { js.processConsumerAssignment(ca) } @@ -1889,6 +1949,13 @@ func (sa *streamAssignment) recoveryKey() string { return sa.Client.serviceAccount() + ksep + sa.Config.Name } +func (ca *consumerAssignment) streamRecoveryKey() string { + if ca == nil { + return _EMPTY_ + } + return ca.Client.serviceAccount() + ksep + ca.Stream +} + func (ca *consumerAssignment) recoveryKey() string { if ca == nil { return _EMPTY_ @@ -1923,9 +1990,10 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bo } if isRecovering { js.setStreamAssignmentRecovering(sa) - delete(ru.removeStreams, sa.recoveryKey()) - } - if js.processStreamAssignment(sa) { + key := sa.recoveryKey() + ru.addStreams[key] = sa + delete(ru.removeStreams, key) + } else if js.processStreamAssignment(sa) { didRemoveStream = true } case removeStreamOp: @@ -1938,7 +2006,10 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bo js.setStreamAssignmentRecovering(sa) key := sa.recoveryKey() ru.removeStreams[key] = sa + delete(ru.addStreams, key) delete(ru.updateStreams, key) + delete(ru.updateConsumers, key) + delete(ru.removeConsumers, key) } else { js.processStreamRemoval(sa) didRemoveStream = true @@ -1952,8 +2023,14 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bo if isRecovering { js.setConsumerAssignmentRecovering(ca) key := ca.recoveryKey() - delete(ru.removeConsumers, key) - ru.updateConsumers[key] = ca + skey := ca.streamRecoveryKey() + if consumers, ok := ru.removeConsumers[skey]; ok { + delete(consumers, key) + } + if _, ok := ru.updateConsumers[skey]; !ok { + ru.updateConsumers[skey] = map[string]*consumerAssignment{} + } + ru.updateConsumers[skey][key] = ca } else { js.processConsumerAssignment(ca) } @@ -1966,8 +2043,14 @@ func (js *jetStream) 
applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bo if isRecovering { js.setConsumerAssignmentRecovering(ca) key := ca.recoveryKey() - delete(ru.removeConsumers, key) - ru.updateConsumers[key] = ca + skey := ca.streamRecoveryKey() + if consumers, ok := ru.removeConsumers[skey]; ok { + delete(consumers, key) + } + if _, ok := ru.updateConsumers[skey]; !ok { + ru.updateConsumers[skey] = map[string]*consumerAssignment{} + } + ru.updateConsumers[skey][key] = ca } else { js.processConsumerAssignment(ca) } @@ -1980,8 +2063,14 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bo if isRecovering { js.setConsumerAssignmentRecovering(ca) key := ca.recoveryKey() - ru.removeConsumers[key] = ca - delete(ru.updateConsumers, key) + skey := ca.streamRecoveryKey() + if _, ok := ru.removeConsumers[skey]; !ok { + ru.removeConsumers[skey] = map[string]*consumerAssignment{} + } + ru.removeConsumers[skey][key] = ca + if consumers, ok := ru.updateConsumers[skey]; ok { + delete(consumers, key) + } } else { js.processConsumerRemoval(ca) didRemoveConsumer = true @@ -1996,6 +2085,7 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bo js.setStreamAssignmentRecovering(sa) key := sa.recoveryKey() ru.updateStreams[key] = sa + delete(ru.addStreams, key) delete(ru.removeStreams, key) } else { js.processUpdateStreamAssignment(sa) @@ -2053,8 +2143,32 @@ func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage Stor } // Check if we already have this assigned. +retry: if node := s.lookupRaftNode(rg.Name); node != nil { + if node.State() == Closed { + // We're waiting for this node to finish shutting down before we replace it. + js.mu.Unlock() + node.WaitForStop() + js.mu.Lock() + goto retry + } s.Debugf("JetStream cluster already has raft group %q assigned", rg.Name) + // Check and see if the group has the same peers. If not then we + // will update the known peers, which will send a peerstate if leader. + groupPeerIDs := append([]string{}, rg.Peers...) + var samePeers bool + if nodePeers := node.Peers(); len(rg.Peers) == len(nodePeers) { + nodePeerIDs := make([]string, 0, len(nodePeers)) + for _, n := range nodePeers { + nodePeerIDs = append(nodePeerIDs, n.ID) + } + slices.Sort(groupPeerIDs) + slices.Sort(nodePeerIDs) + samePeers = slices.Equal(groupPeerIDs, nodePeerIDs) + } + if !samePeers { + node.UpdateKnownPeers(groupPeerIDs) + } rg.node = node js.mu.Unlock() return nil @@ -2082,8 +2196,13 @@ func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage Stor storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, rg.Name) var store StreamStore if storage == FileStorage { + // If the server is set to sync always, do the same for the Raft log. + js.srv.optsMu.RLock() + syncAlways := js.srv.opts.SyncAlways + syncInterval := js.srv.opts.SyncInterval + js.srv.optsMu.RUnlock() fs, err := newFileStoreWithCreated( - FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncInterval: 5 * time.Minute, srv: s}, + FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncAlways: syncAlways, SyncInterval: syncInterval, srv: s}, StreamConfig{Name: rg.Name, Storage: FileStorage, Metadata: labels}, time.Now().UTC(), s.jsKeyGen(s.getOpts().JetStreamKey, rg.Name), @@ -2324,7 +2443,6 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps // fully recovered from disk. 
isRecovering := true - // Should only to be called from leader. doSnapshot := func() { if mset == nil || isRecovering || isRestore || time.Since(lastSnapTime) < minSnapDelta { return @@ -2834,7 +2952,7 @@ func (mset *stream) resetClusteredState(err error) bool { // If we detect we are shutting down just return. if js != nil && js.isShuttingDown() { - s.Debugf("Will not reset stream, jetstream shutting down") + s.Debugf("Will not reset stream, JetStream shutting down") return false } @@ -3835,6 +3953,14 @@ func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignme // This is an error condition. if err != nil { + // If we're shutting down we could get a variety of errors, for example: + // 'JetStream not enabled for account' when looking up the stream. + // Normally we can continue and delete state, but need to be careful when shutting down. + if js.isShuttingDown() { + s.Debugf("Could not create stream, JetStream shutting down") + return + } + if IsNatsErr(err, JSStreamStoreFailedF) { s.Warnf("Stream create failed for '%s > %s': %v", sa.Client.serviceAccount(), sa.Config.Name, err) err = errStreamStoreFailed @@ -4129,8 +4255,10 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { return } + js.mu.Lock() sa := js.streamAssignment(accName, stream) if sa == nil { + js.mu.Unlock() s.Debugf("Consumer create failed, could not locate stream '%s > %s'", accName, stream) return } @@ -4142,7 +4270,6 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { var wasExisting bool // Check if we have an existing consumer assignment. - js.mu.Lock() if sa.consumers == nil { sa.consumers = make(map[string]*consumerAssignment) } else if oca := sa.consumers[ca.Name]; oca != nil { @@ -4163,6 +4290,7 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { // Place into our internal map under the stream assignment. // Ok to replace an existing one, we check on process call below. sa.consumers[ca.Name] = ca + ca.pending = false js.mu.Unlock() acc, err := s.LookupAccount(accName) @@ -4426,6 +4554,13 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state } if err != nil { + // If we're shutting down we could get a variety of errors. + // Normally we can continue and delete state, but need to be careful when shutting down. + if js.isShuttingDown() { + s.Debugf("Could not create consumer, JetStream shutting down") + return + } + if IsNatsErr(err, JSConsumerStoreFailedErrF) { s.Warnf("Consumer create failed for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err) err = errConsumerStoreFailed @@ -4821,7 +4956,11 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { doSnapshot(true) } } else if err := js.applyConsumerEntries(o, ce, isLeader); err == nil { - ne, nb := n.Applied(ce.Index) + var ne, nb uint64 + // We can't guarantee writes are flushed while we're shutting down. Just rely on replay during recovery. + if !js.isShuttingDown() { + ne, nb = n.Applied(ce.Index) + } ce.ReturnToPool() // If we have at least min entries to compact, go ahead and snapshot/compact. if nb > 0 && ne >= compactNumMin || nb > compactSizeMin { @@ -4838,23 +4977,13 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { } // Process the change. - if err := js.processConsumerLeaderChange(o, isLeader); err == nil && isLeader { + if err := js.processConsumerLeaderChange(o, isLeader); err == nil { // Check our state if we are under an interest based stream. 
if mset := o.getStream(); mset != nil { var ss StreamState mset.store.FastState(&ss) o.checkStateForInterestStream(&ss) } - // Do a snapshot. - doSnapshot(true) - // Synchronize followers to our state. Only send out if we have state and nothing pending. - if n != nil { - if _, _, applied := n.Progress(); applied > 0 && aq.len() == 0 { - if snap, err := o.store.EncodedState(); err == nil { - n.SendSnapshot(snap) - } - } - } } // We may receive a leader change after the consumer assignment which would cancel us @@ -4962,6 +5091,7 @@ func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLea } panic(err.Error()) } + if err = o.store.Update(state); err != nil { o.mu.RLock() s, acc, mset, name := o.srv, o.acc, o.mset, o.name @@ -4974,17 +5104,10 @@ func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLea if mset := o.getStream(); mset != nil { var ss StreamState mset.store.FastState(&ss) - if err := o.checkStateForInterestStream(&ss); err == errAckFloorHigherThanLastSeq { - // Register pre-acks unless no state at all for the stream and we would create alot of pre-acks. - mset.mu.Lock() - // Only register if we have a valid FirstSeq. - if ss.FirstSeq > 0 { - for seq := ss.FirstSeq; seq < state.AckFloor.Stream; seq++ { - mset.registerPreAck(o, seq) - } - } - mset.mu.Unlock() - } + // We used to register preacks here if our ack floor was higher than the last sequence. + // Now when streams catch up they properly call checkInterestState() and periodically run this as well. + // If our states drift this could have allocated lots of pre-acks. + o.checkStateForInterestStream(&ss) } } @@ -5015,25 +5138,22 @@ func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLea buf := e.Data switch entryOp(buf[0]) { case updateDeliveredOp: - // These are handled in place in leaders. - if !isLeader { - dseq, sseq, dc, ts, err := decodeDeliveredUpdate(buf[1:]) - if err != nil { - if mset, node := o.streamAndNode(); mset != nil && node != nil { - s := js.srv - s.Errorf("JetStream cluster could not decode consumer delivered update for '%s > %s > %s' [%s]", - mset.account(), mset.name(), o, node.Group()) - } - panic(err.Error()) - } - // Make sure to update delivered under the lock. - o.mu.Lock() - err = o.store.UpdateDelivered(dseq, sseq, dc, ts) - o.ldt = time.Now() - o.mu.Unlock() - if err != nil { - panic(err.Error()) + dseq, sseq, dc, ts, err := decodeDeliveredUpdate(buf[1:]) + if err != nil { + if mset, node := o.streamAndNode(); mset != nil && node != nil { + s := js.srv + s.Errorf("JetStream cluster could not decode consumer delivered update for '%s > %s > %s' [%s]", + mset.account(), mset.name(), o, node.Group()) } + panic(err.Error()) + } + // Make sure to update delivered under the lock. + o.mu.Lock() + err = o.store.UpdateDelivered(dseq, sseq, dc, ts) + o.ldt = time.Now() + o.mu.Unlock() + if err != nil { + panic(err.Error()) } case updateAcksOp: dseq, sseq, err := decodeAckUpdate(buf[1:]) @@ -5359,8 +5479,7 @@ func (js *jetStream) processStreamAssignmentResults(sub *subscription, c *client // then we will do the proper thing. Otherwise will be a no-op. cc.removeInflightProposal(result.Account, result.Stream) - // FIXME(dlc) - suppress duplicates? 
- if sa := js.streamAssignment(result.Account, result.Stream); sa != nil { + if sa := js.streamAssignment(result.Account, result.Stream); sa != nil && !sa.reassigning { canDelete := !result.Update && time.Since(sa.Created) < 5*time.Second // See if we should retry in case this cluster is full but there are others. @@ -5386,6 +5505,10 @@ func (js *jetStream) processStreamAssignmentResults(sub *subscription, c *client // Propose new. sa.Group, sa.err = rg, nil cc.meta.Propose(encodeAddStreamAssignment(sa)) + // When the new stream assignment is processed, sa.reassigning will be + // automatically set back to false. Until then, don't process any more + // assignment results. + sa.reassigning = true return } } @@ -6185,6 +6308,10 @@ func sysRequest[T any](s *Server, subjFormat string, args ...any) (*T, error) { isubj := fmt.Sprintf(subjFormat, args...) s.mu.Lock() + if s.sys == nil { + s.mu.Unlock() + return nil, ErrNoSysAccount + } inbox := s.newRespInbox() results := make(chan *T, 1) s.sys.replies[inbox] = func(_ *subscription, _ *client, _ *Account, _, _ string, msg []byte) { @@ -7532,14 +7659,15 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec ca = nca } - // Mark this as pending. - if sa.consumers == nil { - sa.consumers = make(map[string]*consumerAssignment) - } - sa.consumers[ca.Name] = ca - // Do formal proposal. - cc.meta.Propose(encodeAddConsumerAssignment(ca)) + if err := cc.meta.Propose(encodeAddConsumerAssignment(ca)); err == nil { + // Mark this as pending. + if sa.consumers == nil { + sa.consumers = make(map[string]*consumerAssignment) + } + ca.pending = true + sa.consumers[ca.Name] = ca + } } func encodeAddConsumerAssignment(ca *consumerAssignment) []byte { @@ -7655,54 +7783,46 @@ const compressThreshold = 8192 // 8k // If allowed and contents over the threshold we will compress. func encodeStreamMsgAllowCompress(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, compressOK bool) []byte { - shouldCompress := compressOK && len(subject)+len(reply)+len(hdr)+len(msg) > compressThreshold + // Clip the subject, reply, header and msgs down. Operate on + // uint64 lengths to avoid overflowing. + slen := min(uint64(len(subject)), math.MaxUint16) + rlen := min(uint64(len(reply)), math.MaxUint16) + hlen := min(uint64(len(hdr)), math.MaxUint16) + mlen := min(uint64(len(msg)), math.MaxUint32) + total := slen + rlen + hlen + mlen - elen := 1 + 8 + 8 + len(subject) + len(reply) + len(hdr) + len(msg) + shouldCompress := compressOK && total > compressThreshold + elen := int(1 + 8 + 8 + total) elen += (2 + 2 + 2 + 4) // Encoded lengths, 4bytes - // TODO(dlc) - check sizes of subject, reply and hdr, make sure uint16 ok. - buf := make([]byte, elen) + + buf := make([]byte, 1, elen) buf[0] = byte(streamMsgOp) + var le = binary.LittleEndian - wi := 1 - le.PutUint64(buf[wi:], lseq) - wi += 8 - le.PutUint64(buf[wi:], uint64(ts)) - wi += 8 - le.PutUint16(buf[wi:], uint16(len(subject))) - wi += 2 - copy(buf[wi:], subject) - wi += len(subject) - le.PutUint16(buf[wi:], uint16(len(reply))) - wi += 2 - copy(buf[wi:], reply) - wi += len(reply) - le.PutUint16(buf[wi:], uint16(len(hdr))) - wi += 2 - if len(hdr) > 0 { - copy(buf[wi:], hdr) - wi += len(hdr) - } - le.PutUint32(buf[wi:], uint32(len(msg))) - wi += 4 - if len(msg) > 0 { - copy(buf[wi:], msg) - wi += len(msg) - } + buf = le.AppendUint64(buf, lseq) + buf = le.AppendUint64(buf, uint64(ts)) + buf = le.AppendUint16(buf, uint16(slen)) + buf = append(buf, subject[:slen]...) 
+ buf = le.AppendUint16(buf, uint16(rlen)) + buf = append(buf, reply[:rlen]...) + buf = le.AppendUint16(buf, uint16(hlen)) + buf = append(buf, hdr[:hlen]...) + buf = le.AppendUint32(buf, uint32(mlen)) + buf = append(buf, msg[:mlen]...) // Check if we should compress. if shouldCompress { nbuf := make([]byte, s2.MaxEncodedLen(elen)) nbuf[0] = byte(compressedStreamMsgOp) - ebuf := s2.Encode(nbuf[1:], buf[1:wi]) - // Only pay cost of decode the other side if we compressed. + ebuf := s2.Encode(nbuf[1:], buf[1:]) + // Only pay the cost of decode on the other side if we compressed. // S2 will allow us to try without major penalty for non-compressable data. - if len(ebuf) < wi { - nbuf = nbuf[:len(ebuf)+1] - buf, wi = nbuf, len(nbuf) + if len(ebuf) < len(buf) { + buf = nbuf[:len(ebuf)+1] } } - return buf[:wi] + return buf } // Determine if all peers in our set support the binary snapshot. @@ -7865,7 +7985,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive. if maxMsgSize >= 0 && (len(hdr)+len(msg)) > maxMsgSize { err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) - s.RateLimitWarnf(err.Error()) + s.RateLimitWarnf("%s", err.Error()) if canRespond { var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} resp.Error = NewJSStreamMessageExceedsMaximumError() @@ -7882,7 +8002,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // Again this works if it goes through but better to be pre-emptive. if len(hdr) > math.MaxUint16 { err := fmt.Errorf("JetStream header size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) - s.RateLimitWarnf(err.Error()) + s.RateLimitWarnf("%s", err.Error()) if canRespond { var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} resp.Error = NewJSStreamHeaderExceedsMaximumError() @@ -8014,7 +8134,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured. if mset.clseq-(lseq+mset.clfs) > streamLagWarnThreshold { lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name) - s.RateLimitWarnf(lerr.Error()) + s.RateLimitWarnf("%s", lerr.Error()) } mset.clMu.Unlock() @@ -8290,7 +8410,16 @@ RETRY: releaseSyncOutSem() if n.GroupLeader() == _EMPTY_ { - return fmt.Errorf("%w for stream '%s > %s'", errCatchupAbortedNoLeader, mset.account(), mset.name()) + // Prevent us from spinning if we've installed a snapshot from a leader but there's no leader online. + // We wait a bit to check if a leader has come online in the meantime, if so we can continue. + var canContinue bool + if numRetries == 0 { + time.Sleep(startInterval) + canContinue = n.GroupLeader() != _EMPTY_ + } + if !canContinue { + return fmt.Errorf("%w for stream '%s > %s'", errCatchupAbortedNoLeader, mset.account(), mset.name()) + } } // If we have a sub clear that here. @@ -8873,17 +9002,6 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { // mset.store never changes after being set, don't need lock. mset.store.FastState(&state) - // Reset notion of first if this request wants sequences before our starting sequence - // and we would have nothing to send. If we have partial messages still need to send skips for those. 
- // We will keep sreq's first sequence to not create sequence mismatches on the follower, but we extend the last to our current state. - if sreq.FirstSeq < state.FirstSeq && state.FirstSeq > sreq.LastSeq { - s.Debugf("Catchup for stream '%s > %s' resetting request first sequence from %d to %d", - mset.account(), mset.name(), sreq.FirstSeq, state.FirstSeq) - if state.LastSeq > sreq.LastSeq { - sreq.LastSeq = state.LastSeq - } - } - // Setup sequences to walk through. seq, last := sreq.FirstSeq, sreq.LastSeq mset.setCatchupPeer(sreq.Peer, last-seq) @@ -8972,20 +9090,26 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs && s.gcbBelowMax(); seq++ { var sm *StoreMsg var err error - // Is we should use load next do so here. + // If we should use load next do so here. if useLoadNext { var nseq uint64 sm, nseq, err = mset.store.LoadNextMsg(fwcs, true, seq, &smv) if err == nil && nseq > seq { + // If we jumped over the requested last sequence, clamp it down. + // Otherwise, we would send too much to the follower. + if nseq > last { + nseq = last + sm = nil + } dr.First, dr.Num = seq, nseq-seq // Jump ahead seq = nseq } else if err == ErrStoreEOF { - dr.First, dr.Num = seq, state.LastSeq-seq + dr.First, dr.Num = seq, last-seq // Clear EOF here for normal processing. err = nil // Jump ahead - seq = state.LastSeq + seq = last } } else { sm, err = mset.store.LoadMsg(seq, &smv) @@ -9047,25 +9171,10 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { if drOk && dr.First > 0 { sendDR() } - // Check for a condition where our state's first is now past the last that we could have sent. - // If so reset last and continue sending. - var state StreamState - mset.mu.RLock() - mset.store.FastState(&state) - mset.mu.RUnlock() - if last < state.FirstSeq { - last = state.LastSeq - } - // Recheck our exit condition. - if seq == last { - if drOk && dr.First > 0 { - sendDR() - } - s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) - // EOF - s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) - return false - } + s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) + // EOF + s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) + return false } select { case <-remoteQuitCh: diff --git a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go index e40cfcab89..26a3f6ec3d 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go @@ -774,7 +774,7 @@ func (s *Server) startLeafNodeAcceptLoop() { } // RegEx to match a creds file with user JWT and Seed. -var credsRe = regexp.MustCompile(`\s*(?:(?:[-]{3,}[^\n]*[-]{3,}\n)(.+)(?:\n\s*[-]{3,}[^\n]*[-]{3,}\n))`) +var credsRe = regexp.MustCompile(`\s*(?:(?:[-]{3,}.*[-]{3,}\r?\n)([\w\-.=]+)(?:\r?\n[-]{3,}.*[-]{3,}(\r?\n|\z)))`) // clusterName is provided as argument to avoid lock ordering issues with the locked client c // Lock should be held entering here. @@ -2271,6 +2271,42 @@ func keyFromSub(sub *subscription) string { return sb.String() } +const ( + keyRoutedSub = "R" + keyRoutedSubByte = 'R' + keyRoutedLeafSub = "L" + keyRoutedLeafSubByte = 'L' +) + +// Helper function to build the key that prevents collisions between normal +// routed subscriptions and routed subscriptions on behalf of a leafnode. 
+// Keys will look like this: +// "R foo" -> plain routed sub on "foo" +// "R foo bar" -> queue routed sub on "foo", queue "bar" +// "L foo bar" -> plain routed leaf sub on "foo", leaf "bar" +// "L foo bar baz" -> queue routed sub on "foo", queue "bar", leaf "baz" +func keyFromSubWithOrigin(sub *subscription) string { + var sb strings.Builder + sb.Grow(2 + len(sub.origin) + 1 + len(sub.subject) + 1 + len(sub.queue)) + leaf := len(sub.origin) > 0 + if leaf { + sb.WriteByte(keyRoutedLeafSubByte) + } else { + sb.WriteByte(keyRoutedSubByte) + } + sb.WriteByte(' ') + sb.Write(sub.subject) + if sub.queue != nil { + sb.WriteByte(' ') + sb.Write(sub.queue) + } + if leaf { + sb.WriteByte(' ') + sb.Write(sub.origin) + } + return sb.String() +} + // Lock should be held. func (c *client) writeLeafSub(w *bytes.Buffer, key string, n int32) { if key == _EMPTY_ { @@ -2321,12 +2357,21 @@ func (c *client) processLeafSub(argo []byte) (err error) { args := splitArg(arg) sub := &subscription{client: c} + delta := int32(1) switch len(args) { case 1: sub.queue = nil case 3: sub.queue = args[1] sub.qw = int32(parseSize(args[2])) + // TODO: (ik) We should have a non empty queue name and a queue + // weight >= 1. For 2.11, we may want to return an error if that + // is not the case, but for now just overwrite `delta` if queue + // weight is greater than 1 (it is possible after a reconnect/ + // server restart to receive a queue weight > 1 for a new sub). + if sub.qw > 1 { + delta = sub.qw + } default: return fmt.Errorf("processLeafSub Parse Error: '%s'", arg) } @@ -2391,7 +2436,6 @@ func (c *client) processLeafSub(argo []byte) (err error) { key := bytesToString(sub.sid) osub := c.subs[key] updateGWs := false - delta := int32(1) if osub == nil { c.subs[key] = sub // Now place into the account sl. @@ -2472,6 +2516,10 @@ func (c *client) processLeafUnsub(arg []byte) error { // We store local subs by account and subject and optionally queue name. // LS- will have the arg exactly as the key. sub, ok := c.subs[string(arg)] + delta := int32(1) + if ok && len(sub.queue) > 0 { + delta = sub.qw + } c.mu.Unlock() if ok { @@ -2481,14 +2529,14 @@ func (c *client) processLeafUnsub(arg []byte) error { if !spoke { // If we are routing subtract from the route map for the associated account. - srv.updateRouteSubscriptionMap(acc, sub, -1) + srv.updateRouteSubscriptionMap(acc, sub, -delta) // Gateways if updateGWs { - srv.gatewayUpdateSubInterest(acc.Name, sub, -1) + srv.gatewayUpdateSubInterest(acc.Name, sub, -delta) } } // Now check on leafnode updates for other leaf nodes. - acc.updateLeafNodes(sub, -1) + acc.updateLeafNodes(sub, -delta) return nil } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go index 8cd9070eb7..e2ca1cae29 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go @@ -84,10 +84,13 @@ func (ms *memStore) UpdateConfig(cfg *StreamConfig) error { ms.ageChk = nil } // Make sure to update MaxMsgsPer + if cfg.MaxMsgsPer < -1 { + cfg.MaxMsgsPer = -1 + } maxp := ms.maxp ms.maxp = cfg.MaxMsgsPer - // If the value is smaller we need to enforce that. - if ms.maxp != 0 && ms.maxp < maxp { + // If the value is smaller, or was unset before, we need to enforce that. 
+ if ms.maxp > 0 && (maxp == 0 || ms.maxp < maxp) { lm := uint64(ms.maxp) ms.fss.Iter(func(subj []byte, ss *SimpleState) bool { if ss.Msgs > lm { @@ -359,15 +362,13 @@ func (ms *memStore) FilteredState(sseq uint64, subj string) SimpleState { } func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubject bool) SimpleState { - var ss SimpleState - if sseq < ms.state.FirstSeq { sseq = ms.state.FirstSeq } // If past the end no results. if sseq > ms.state.LastSeq { - return ss + return SimpleState{} } if filter == _EMPTY_ { @@ -391,9 +392,10 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje _tsa, _fsa := [32]string{}, [32]string{} tsa, fsa := _tsa[:0], _fsa[:0] - fsa = tokenizeSubjectIntoSlice(fsa[:0], filter) wc := subjectHasWildcard(filter) - + if wc { + fsa = tokenizeSubjectIntoSlice(fsa[:0], filter) + } // 1. See if we match any subs from fss. // 2. If we match and the sseq is past ss.Last then we can use meta only. // 3. If we match we need to do a partial, break and clear any totals and do a full scan like num pending. @@ -409,6 +411,7 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje return isSubsetMatchTokenized(tsa, fsa) } + var ss SimpleState update := func(fss *SimpleState) { msgs, first, last := fss.Msgs, fss.First, fss.Last if lastPerSubject { @@ -424,6 +427,7 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje } var havePartial bool + var totalSkipped uint64 // We will track start and end sequences as we go. ms.fss.Match(stringToBytes(filter), func(subj []byte, fss *SimpleState) { if fss.firstNeedsUpdate { @@ -436,6 +440,8 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje havePartial = true // Don't break here, we will update to keep tracking last. update(fss) + } else { + totalSkipped += fss.Msgs } }) @@ -492,6 +498,7 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje } else { // We will adjust from the totals above by scanning what we need to exclude. ss.First = first + ss.Msgs += totalSkipped var adjust uint64 var tss *SimpleState @@ -563,8 +570,9 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje // SubjectsState returns a map of SimpleState for all matching subjects. func (ms *memStore) SubjectsState(subject string) map[string]SimpleState { - ms.mu.RLock() - defer ms.mu.RUnlock() + // This needs to be a write lock, as we can mutate the per-subject state. + ms.mu.Lock() + defer ms.mu.Unlock() if ms.fss.Size() == 0 { return nil @@ -630,6 +638,154 @@ func (ms *memStore) NumPending(sseq uint64, filter string, lastPerSubject bool) return ss.Msgs, ms.state.LastSeq } +// NumPending will return the number of pending messages matching any subject in the sublist starting at sequence. +func (ms *memStore) NumPendingMulti(sseq uint64, sl *Sublist, lastPerSubject bool) (total, validThrough uint64) { + if sl == nil { + return ms.NumPending(sseq, fwcs, lastPerSubject) + } + + // This needs to be a write lock, as we can mutate the per-subject state. + ms.mu.Lock() + defer ms.mu.Unlock() + + var ss SimpleState + if sseq < ms.state.FirstSeq { + sseq = ms.state.FirstSeq + } + // If past the end no results. 
+ if sseq > ms.state.LastSeq { + return 0, ms.state.LastSeq + } + + update := func(fss *SimpleState) { + msgs, first, last := fss.Msgs, fss.First, fss.Last + if lastPerSubject { + msgs, first = 1, last + } + ss.Msgs += msgs + if ss.First == 0 || first < ss.First { + ss.First = first + } + if last > ss.Last { + ss.Last = last + } + } + + var havePartial bool + var totalSkipped uint64 + // We will track start and end sequences as we go. + IntersectStree[SimpleState](ms.fss, sl, func(subj []byte, fss *SimpleState) { + if fss.firstNeedsUpdate { + ms.recalculateFirstForSubj(bytesToString(subj), fss.First, fss) + } + if sseq <= fss.First { + update(fss) + } else if sseq <= fss.Last { + // We matched but it is a partial. + havePartial = true + // Don't break here, we will update to keep tracking last. + update(fss) + } else { + totalSkipped += fss.Msgs + } + }) + + // If we did not encounter any partials we can return here. + if !havePartial { + return ss.Msgs, ms.state.LastSeq + } + + // If we are here we need to scan the msgs. + // Capture first and last sequences for scan and then clear what we had. + first, last := ss.First, ss.Last + // To track if we decide to exclude we need to calculate first. + if first < sseq { + first = sseq + } + + // Now we want to check if it is better to scan inclusive and recalculate that way + // or leave and scan exclusive and adjust our totals. + // ss.Last is always correct here. + toScan, toExclude := last-first, first-ms.state.FirstSeq+ms.state.LastSeq-ss.Last + var seen map[string]bool + if lastPerSubject { + seen = make(map[string]bool) + } + if toScan < toExclude { + ss.Msgs, ss.First = 0, 0 + + update := func(sm *StoreMsg) { + ss.Msgs++ + if ss.First == 0 { + ss.First = sm.seq + } + if seen != nil { + seen[sm.subj] = true + } + } + // Check if easier to just scan msgs vs the sequence range. + // This can happen with lots of interior deletes. + if last-first > uint64(len(ms.msgs)) { + for _, sm := range ms.msgs { + if sm.seq >= first && sm.seq <= last && !seen[sm.subj] && sl.HasInterest(sm.subj) { + update(sm) + } + } + } else { + for seq := first; seq <= last; seq++ { + if sm, ok := ms.msgs[seq]; ok && !seen[sm.subj] && sl.HasInterest(sm.subj) { + update(sm) + } + } + } + } else { + // We will adjust from the totals above by scanning what we need to exclude. + ss.First = first + ss.Msgs += totalSkipped + var adjust uint64 + var tss *SimpleState + + update := func(sm *StoreMsg) { + if lastPerSubject { + tss, _ = ms.fss.Find(stringToBytes(sm.subj)) + } + // If we are last per subject, make sure to only adjust if all messages are before our first. + if tss == nil || tss.Last < first { + adjust++ + } + if seen != nil { + seen[sm.subj] = true + } + } + // Check if easier to just scan msgs vs the sequence range. + if first-ms.state.FirstSeq > uint64(len(ms.msgs)) { + for _, sm := range ms.msgs { + if sm.seq < first && !seen[sm.subj] && sl.HasInterest(sm.subj) { + update(sm) + } + } + } else { + for seq := ms.state.FirstSeq; seq < first; seq++ { + if sm, ok := ms.msgs[seq]; ok && !seen[sm.subj] && sl.HasInterest(sm.subj) { + update(sm) + } + } + } + // Now do range at end. + for seq := last + 1; seq < ms.state.LastSeq; seq++ { + if sm, ok := ms.msgs[seq]; ok && !seen[sm.subj] && sl.HasInterest(sm.subj) { + adjust++ + if seen != nil { + seen[sm.subj] = true + } + } + } + ss.Msgs -= adjust + } + + return ss.Msgs, ms.state.LastSeq +} + // Will check the msg limit for this tracked subject. // Lock should be held. 
func (ms *memStore) enforcePerSubjectLimit(subj string, ss *SimpleState) { @@ -875,7 +1031,9 @@ func (ms *memStore) Compact(seq uint64) (uint64, error) { ms.state.FirstSeq = seq ms.state.FirstTime = time.Time{} ms.state.LastSeq = seq - 1 + // Reset msgs and fss. ms.msgs = make(map[uint64]*StoreMsg) + ms.fss = stree.NewSubjectTree[SimpleState]() } ms.mu.Unlock() @@ -1225,6 +1383,9 @@ func (ms *memStore) recalculateFirstForSubj(subj string, startSeq uint64, ss *Si for ; tseq <= ss.Last; tseq++ { if sm := ms.msgs[tseq]; sm != nil && sm.subj == subj { ss.First = tseq + if ss.Msgs == 1 { + ss.Last = tseq + } ss.firstNeedsUpdate = false return } @@ -1488,8 +1649,6 @@ func (o *consumerMemStore) Update(state *ConsumerState) error { pending = make(map[uint64]*Pending, len(state.Pending)) for seq, p := range state.Pending { pending[seq] = &Pending{p.Sequence, p.Timestamp} - } - for seq := range pending { if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream { return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq) } @@ -1504,10 +1663,10 @@ func (o *consumerMemStore) Update(state *ConsumerState) error { // Replace our state. o.mu.Lock() + defer o.mu.Unlock() // Check to see if this is an outdated update. - if state.Delivered.Consumer < o.state.Delivered.Consumer { - o.mu.Unlock() + if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream { return fmt.Errorf("old update ignored") } @@ -1515,7 +1674,6 @@ func (o *consumerMemStore) Update(state *ConsumerState) error { o.state.AckFloor = state.AckFloor o.state.Pending = pending o.state.Redelivered = redelivered - o.mu.Unlock() return nil } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go index 2bd25f9a7b..77a6c1fe71 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go @@ -3228,10 +3228,11 @@ func (s *Server) HandleHealthz(w http.ResponseWriter, r *http.Request) { Details: includeDetails, }) - code := http.StatusOK + code := hs.StatusCode if hs.Error != _EMPTY_ { s.Warnf("Healthcheck failed: %q", hs.Error) - code = hs.StatusCode + } else if len(hs.Errors) != 0 { + s.Warnf("Healthcheck failed: %d errors", len(hs.Errors)) } // Remove StatusCode from JSON representation when responding via HTTP // since this is already in the response. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/opts.go b/vendor/github.com/nats-io/nats-server/v2/server/opts.go index 0b4ed483dc..c73127e530 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/opts.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/opts.go @@ -1,4 +1,4 @@ -// Copyright 2012-2023 The NATS Authors +// Copyright 2012-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -657,26 +657,28 @@ type authorization struct { // TLSConfigOpts holds the parsed tls config information, // used with flag parsing type TLSConfigOpts struct { - CertFile string - KeyFile string - CaFile string - Verify bool - Insecure bool - Map bool - TLSCheckKnownURLs bool - HandshakeFirst bool // Indicate that the TLS handshake should occur first, before sending the INFO protocol. - FallbackDelay time.Duration // Where supported, indicates how long to wait for the handshake before falling back to sending the INFO protocol first. 
- Timeout float64 - RateLimit int64 - Ciphers []uint16 - CurvePreferences []tls.CurveID - PinnedCerts PinnedCertSet - CertStore certstore.StoreType - CertMatchBy certstore.MatchByType - CertMatch string - OCSPPeerConfig *certidp.OCSPPeerConfig - Certificates []*TLSCertPairOpt - MinVersion uint16 + CertFile string + KeyFile string + CaFile string + Verify bool + Insecure bool + Map bool + TLSCheckKnownURLs bool + HandshakeFirst bool // Indicate that the TLS handshake should occur first, before sending the INFO protocol. + FallbackDelay time.Duration // Where supported, indicates how long to wait for the handshake before falling back to sending the INFO protocol first. + Timeout float64 + RateLimit int64 + Ciphers []uint16 + CurvePreferences []tls.CurveID + PinnedCerts PinnedCertSet + CertStore certstore.StoreType + CertMatchBy certstore.MatchByType + CertMatch string + CertMatchSkipInvalid bool + CaCertsMatch []string + OCSPPeerConfig *certidp.OCSPPeerConfig + Certificates []*TLSCertPairOpt + MinVersion uint16 } // TLSCertPairOpt are the paths to a certificate and private key. @@ -4419,6 +4421,28 @@ func parseTLS(v any, isClientCtx bool) (t *TLSConfigOpts, retErr error) { return nil, &configErr{tk, certstore.ErrBadCertMatchField.Error()} } tc.CertMatch = certMatch + case "ca_certs_match": + rv := []string{} + switch mv := mv.(type) { + case string: + rv = append(rv, mv) + case []string: + rv = append(rv, mv...) + case []interface{}: + for _, t := range mv { + if token, ok := t.(token); ok { + if ts, ok := token.Value().(string); ok { + rv = append(rv, ts) + continue + } else { + return nil, &configErr{tk, fmt.Sprintf("error parsing ca_cert_match: unsupported type %T where string is expected", token)} + } + } else { + return nil, &configErr{tk, fmt.Sprintf("error parsing ca_cert_match: unsupported type %T", t)} + } + } + } + tc.CaCertsMatch = rv case "handshake_first", "first", "immediate": switch mv := mv.(type) { case bool: @@ -4444,6 +4468,12 @@ func parseTLS(v any, isClientCtx bool) (t *TLSConfigOpts, retErr error) { default: return nil, &configErr{tk, fmt.Sprintf("field %q should be a boolean or a string, got %T", mk, mv)} } + case "cert_match_skip_invalid": + certMatchSkipInvalid, ok := mv.(bool) + if !ok { + return nil, &configErr{tk, certstore.ErrBadCertMatchSkipInvalidField.Error()} + } + tc.CertMatchSkipInvalid = certMatchSkipInvalid case "ocsp_peer": switch vv := mv.(type) { case bool: @@ -4819,7 +4849,7 @@ func GenTLSConfig(tc *TLSConfigOpts) (*tls.Config, error) { } config.Certificates = []tls.Certificate{cert} case tc.CertStore != certstore.STOREEMPTY: - err := certstore.TLSConfig(tc.CertStore, tc.CertMatchBy, tc.CertMatch, &config) + err := certstore.TLSConfig(tc.CertStore, tc.CertMatchBy, tc.CertMatch, tc.CaCertsMatch, tc.CertMatchSkipInvalid, &config) if err != nil { return nil, err } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/parser.go b/vendor/github.com/nats-io/nats-server/v2/server/parser.go index 74f55f576d..663a1dc126 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/parser.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/parser.go @@ -788,7 +788,8 @@ func (c *client) parse(buf []byte) error { c.traceInOp("LS-", arg) } } - err = c.processRemoteUnsub(arg) + leafUnsub := c.op == 'L' || c.op == 'l' + err = c.processRemoteUnsub(arg, leafUnsub) case GATEWAY: if trace { c.traceInOp("RS-", arg) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/raft.go b/vendor/github.com/nats-io/nats-server/v2/server/raft.go index 
cd8d2d1158..64d8e4df3c 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/raft.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/raft.go @@ -74,8 +74,8 @@ type RaftNode interface { QuitC() <-chan struct{} Created() time.Time Stop() + WaitForStop() Delete() - Wipe() } type WAL interface { @@ -127,11 +127,12 @@ func (state RaftState) String() string { type raft struct { sync.RWMutex - created time.Time // Time that the group was created - accName string // Account name of the asset this raft group is for - group string // Raft group - sd string // Store directory - id string // Node ID + created time.Time // Time that the group was created + accName string // Account name of the asset this raft group is for + group string // Raft group + sd string // Store directory + id string // Node ID + wg sync.WaitGroup // Wait for running goroutines to exit on shutdown wal WAL // WAL store (filestore or memstore) wtype StorageType // WAL type, e.g. FileStorage or MemoryStorage @@ -198,15 +199,19 @@ type raft struct { hcommit uint64 // The commit at the time that applies were paused pobserver bool // Whether we were an observer at the time that applies were paused - prop *ipQueue[*Entry] // Proposals - entry *ipQueue[*appendEntry] // Append entries - resp *ipQueue[*appendEntryResponse] // Append entries responses - apply *ipQueue[*CommittedEntry] // Apply queue (committed entries to be passed to upper layer) - reqs *ipQueue[*voteRequest] // Vote requests - votes *ipQueue[*voteResponse] // Vote responses - stepdown *ipQueue[string] // Stepdown requests - leadc chan bool // Leader changes - quit chan struct{} // Raft group shutdown + prop *ipQueue[*proposedEntry] // Proposals + entry *ipQueue[*appendEntry] // Append entries + resp *ipQueue[*appendEntryResponse] // Append entries responses + apply *ipQueue[*CommittedEntry] // Apply queue (committed entries to be passed to upper layer) + reqs *ipQueue[*voteRequest] // Vote requests + votes *ipQueue[*voteResponse] // Vote responses + leadc chan bool // Leader changes + quit chan struct{} // Raft group shutdown +} + +type proposedEntry struct { + *Entry + reply string // Optional, to respond once proposal handled } // cacthupState structure that holds our subscription, and catchup term and index @@ -342,8 +347,8 @@ func (s *Server) bootstrapRaftNode(cfg *RaftConfig, knownPeers []string, allPeer return writePeerState(cfg.Store, &peerState{knownPeers, expected, extUndetermined}) } -// startRaftNode will start the raft node. -func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabels) (RaftNode, error) { +// initRaftNode will initialize the raft node, to be used by startRaftNode or when testing to not run the Go routine. 
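The proposal queue now carries a small wrapper type so that a forwarded proposal can optionally be acknowledged once the leader has handled it. A minimal sketch of that idea, using illustrative names (a plain channel stands in for the server's ipQueue):

package main

import "fmt"

// Entry is the payload being proposed (illustrative).
type Entry struct{ Data []byte }

// proposedEntry pairs an entry with an optional reply subject so the
// leader can confirm once the proposal has been sent out.
type proposedEntry struct {
	*Entry
	reply string // empty when no confirmation is requested
}

func main() {
	queue := make(chan *proposedEntry, 8)
	// A local Propose pushes with no reply subject...
	queue <- &proposedEntry{Entry: &Entry{Data: []byte("local")}, reply: ""}
	// ...while a forwarded proposal can carry the requester's inbox.
	queue <- &proposedEntry{Entry: &Entry{Data: []byte("forwarded")}, reply: "_INBOX.abc"}
	close(queue)
	for pe := range queue {
		if pe.reply != "" {
			fmt.Printf("would ack %q after appending %q\n", pe.reply, pe.Data)
		}
	}
}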
+func (s *Server) initRaftNode(accName string, cfg *RaftConfig, labels pprofLabels) (*raft, error) { if cfg == nil { return nil, errNilCfg } @@ -387,11 +392,10 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe quit: make(chan struct{}), reqs: newIPQueue[*voteRequest](s, qpfx+"vreq"), votes: newIPQueue[*voteResponse](s, qpfx+"vresp"), - prop: newIPQueue[*Entry](s, qpfx+"entry"), + prop: newIPQueue[*proposedEntry](s, qpfx+"entry"), entry: newIPQueue[*appendEntry](s, qpfx+"appendEntry"), resp: newIPQueue[*appendEntryResponse](s, qpfx+"appendEntryResponse"), apply: newIPQueue[*CommittedEntry](s, qpfx+"committedEntry"), - stepdown: newIPQueue[string](s, qpfx+"stepdown"), accName: accName, leadc: make(chan bool, 32), observer: cfg.Observer, @@ -415,20 +419,20 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe n.vote = vote } - // Make sure that the snapshots directory exists. - if err := os.MkdirAll(filepath.Join(n.sd, snapshotsDir), defaultDirPerms); err != nil { - return nil, fmt.Errorf("could not create snapshots directory - %v", err) - } - // Can't recover snapshots if memory based since wal will be reset. // We will inherit from the current leader. if _, ok := n.wal.(*memStore); ok { - os.Remove(filepath.Join(n.sd, snapshotsDir, "*")) + _ = os.RemoveAll(filepath.Join(n.sd, snapshotsDir)) } else { // See if we have any snapshots and if so load and process on startup. n.setupLastSnapshot() } + // Make sure that the snapshots directory exists. + if err := os.MkdirAll(filepath.Join(n.sd, snapshotsDir), defaultDirPerms); err != nil { + return nil, fmt.Errorf("could not create snapshots directory - %v", err) + } + truncateAndErr := func(index uint64) { if err := n.wal.Truncate(index); err != nil { n.setWriteErr(err) @@ -477,11 +481,6 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe } } } - } else if n.pterm == 0 && n.pindex == 0 { - // We have recovered no state, either through our WAL or snapshots, - // so inherit from term from our tav.idx file and pindex from our last sequence. - n.pterm = n.term - n.pindex = state.LastSeq } // Make sure to track ourselves. @@ -499,7 +498,7 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe // If we fail to do this for some reason then this is fatal — we cannot // continue setting up or the Raft node may be partially/totally isolated. if err := n.createInternalSubs(); err != nil { - n.shutdown(false) + n.shutdown() return nil, err } @@ -525,7 +524,18 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe labels["group"] = n.group s.registerRaftNode(n.group, n) + return n, nil +} + +// startRaftNode will start the raft node. +func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabels) (RaftNode, error) { + n, err := s.initRaftNode(accName, cfg, labels) + if err != nil { + return nil, err + } + // Start the run goroutine for the Raft state machine. + n.wg.Add(1) s.startGoRoutine(n.run, labels) return n, nil @@ -578,8 +588,8 @@ func (s *Server) unregisterRaftNode(group string) { // Returns how many Raft nodes are running in this server instance. 
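Splitting construction (initRaftNode) from starting the run goroutine (startRaftNode) makes the node usable in tests without a background goroutine, and the added WaitGroup lets shutdown wait for that goroutine to exit. A rough sketch of the pattern under those assumptions, with hypothetical names:

package main

import (
	"fmt"
	"sync"
)

type node struct {
	wg   sync.WaitGroup
	quit chan struct{}
}

// initNode only builds state; nothing is running yet.
func initNode() *node { return &node{quit: make(chan struct{})} }

// startNode builds the node, registers with the WaitGroup, and only
// then launches the run loop.
func startNode() *node {
	n := initNode()
	n.wg.Add(1)
	go n.run()
	return n
}

func (n *node) run() {
	defer n.wg.Done()
	<-n.quit // stand-in for the real state machine loop
}

func (n *node) stopAndWait() {
	close(n.quit)
	n.wg.Wait()
}

func main() {
	n := startNode()
	n.stopAndWait()
	fmt.Println("stopped cleanly")
}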
func (s *Server) numRaftNodes() int { - s.rnMu.Lock() - defer s.rnMu.Unlock() + s.rnMu.RLock() + defer s.rnMu.RUnlock() return len(s.raftNodes) } @@ -706,7 +716,7 @@ func (n *raft) Propose(data []byte) error { if werr := n.werr; werr != nil { return werr } - n.prop.push(newEntry(EntryNormal, data)) + n.prop.push(newProposedEntry(newEntry(EntryNormal, data), _EMPTY_)) return nil } @@ -725,20 +735,21 @@ func (n *raft) ProposeMulti(entries []*Entry) error { return werr } for _, e := range entries { - n.prop.push(e) + n.prop.push(newProposedEntry(e, _EMPTY_)) } return nil } // ForwardProposal will forward the proposal to the leader if known. // If we are the leader this is the same as calling propose. -// FIXME(dlc) - We could have a reply subject and wait for a response -// for retries, but would need to not block and be in separate Go routine. func (n *raft) ForwardProposal(entry []byte) error { if n.Leader() { return n.Propose(entry) } + // TODO: Currently we do not set a reply subject, even though we are + // now capable of responding. Do this once enough time has passed, + // i.e. maybe in 2.12. n.sendRPC(n.psubj, _EMPTY_, entry) return nil } @@ -757,7 +768,7 @@ func (n *raft) ProposeAddPeer(peer string) error { prop := n.prop n.RUnlock() - prop.push(newEntry(EntryAddPeer, []byte(peer))) + prop.push(newProposedEntry(newEntry(EntryAddPeer, []byte(peer)), _EMPTY_)) return nil } @@ -793,7 +804,7 @@ func (n *raft) ProposeRemovePeer(peer string) error { // peer remove and then notifying the rest of the group that the // peer was removed. if isLeader { - prop.push(newEntry(EntryRemovePeer, []byte(peer))) + prop.push(newProposedEntry(newEntry(EntryRemovePeer, []byte(peer)), _EMPTY_)) n.doRemovePeerAsLeader(peer) return nil } @@ -868,7 +879,7 @@ func (n *raft) PauseApply() error { // If we are currently a candidate make sure we step down. if n.State() == Candidate { - n.stepdown.push(noLeader) + n.stepdownLocked(noLeader) } n.debug("Pausing our apply channel") @@ -1026,36 +1037,28 @@ func (n *raft) InstallSnapshot(data []byte) error { // Check that a catchup isn't already taking place. If it is then we won't // allow installing snapshots until it is done. - if len(n.progress) > 0 { + if len(n.progress) > 0 || n.paused { return errCatchupsRunning } if n.applied == 0 { + n.debug("Not snapshotting as there are no applied entries") return errNoSnapAvailable } + term := n.pterm + if ae, _ := n.loadEntry(n.applied); ae != nil { + term = ae.term + } + n.debug("Installing snapshot of %d bytes", len(data)) - var term uint64 - if ae, _ := n.loadEntry(n.applied); ae != nil { - // Use the term from the most recently applied entry if possible. - term = ae.term - } else if ae, _ = n.loadFirstEntry(); ae != nil { - // Otherwise see if we can find the term from the first entry. - term = ae.term - } else { - // Last resort is to use the last pterm that we knew of. - term = n.pterm - } - - snap := &snapshot{ + return n.installSnapshot(&snapshot{ lastTerm: term, lastIndex: n.applied, peerstate: encodePeerState(&peerState{n.peerNames(), n.csz, n.extSt}), data: data, - } - - return n.installSnapshot(snap) + }) } // Install the snapshot. 
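InstallSnapshot now refuses to run while a catchup is in progress or applies are paused, bails out when nothing has been applied yet, and takes the snapshot term from the last applied entry when it can be loaded, falling back to pterm. A condensed sketch of that guard logic; the type and field names here are assumptions for illustration, not the server's struct:

package main

import (
	"errors"
	"fmt"
)

type snapGuard struct {
	catchups int    // outstanding catchup runs
	paused   bool   // applies currently paused
	applied  uint64 // highest applied index
}

func (g *snapGuard) canInstallSnapshot() error {
	if g.catchups > 0 || g.paused {
		return errors.New("catchups running or applies paused")
	}
	if g.applied == 0 {
		return errors.New("no applied entries to snapshot")
	}
	return nil
}

func main() {
	fmt.Println((&snapGuard{paused: true}).canInstallSnapshot())
	fmt.Println((&snapGuard{applied: 10}).canInstallSnapshot())
}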
@@ -1065,11 +1068,7 @@ func (n *raft) installSnapshot(snap *snapshot) error { sn := fmt.Sprintf(snapFileT, snap.lastTerm, snap.lastIndex) sfile := filepath.Join(snapDir, sn) - <-dios - err := os.WriteFile(sfile, n.encodeSnapshot(snap), defaultFilePerms) - dios <- struct{}{} - - if err != nil { + if err := writeFileWithSync(sfile, n.encodeSnapshot(snap), defaultFilePerms); err != nil { // We could set write err here, but if this is a temporary situation, too many open files etc. // we want to retry and snapshots are not fatal. return err @@ -1256,6 +1255,21 @@ func (n *raft) Leader() bool { return n.State() == Leader } +// stepdown immediately steps down the Raft node to the +// follower state. This will take the lock itself. +func (n *raft) stepdown(newLeader string) { + n.Lock() + defer n.Unlock() + n.stepdownLocked(newLeader) +} + +// stepdownLocked immediately steps down the Raft node to the +// follower state. This requires the lock is already held. +func (n *raft) stepdownLocked(newLeader string) { + n.debug("Stepping down") + n.switchToFollowerLocked(newLeader) +} + // isCatchingUp returns true if a catchup is currently taking place. func (n *raft) isCatchingUp() bool { n.RLock() @@ -1297,11 +1311,6 @@ func (n *raft) isCurrent(includeForwardProgress bool) bool { return true } - // Check here on catchup status. - if cs := n.catchup; cs != nil && n.pterm >= cs.cterm && n.pindex >= cs.cindex { - n.cancelCatchup() - } - // Check to see that we have heard from the current leader lately. if n.leader != noLeader && n.leader != n.id && n.catchup == nil { okInterval := int64(hbInterval) * 2 @@ -1312,7 +1321,9 @@ func (n *raft) isCurrent(includeForwardProgress bool) bool { } } if cs := n.catchup; cs != nil { + // We're actively catching up, can't mark current even if commit==applied. n.debug("Not current, still catching up pindex=%d, cindex=%d", n.pindex, cs.cindex) + return false } if n.commit == n.applied { @@ -1463,8 +1474,6 @@ func (n *raft) StepDown(preferred ...string) error { n.vote = noVote n.writeTermVote() - stepdown := n.stepdown - prop := n.prop n.Unlock() if len(preferred) > 0 && maybeLeader == noLeader { @@ -1472,15 +1481,18 @@ func (n *raft) StepDown(preferred ...string) error { } // If we have a new leader selected, transfer over to them. + // Send the append entry directly rather than via the proposals queue, + // as we will switch to follower state immediately and will blow away + // the contents of the proposal queue in the process. if maybeLeader != noLeader { - n.debug("Selected %q for new leader", maybeLeader) - prop.push(newEntry(EntryLeaderTransfer, []byte(maybeLeader))) - } else { - // Force us to stepdown here. - n.debug("Stepping down") - stepdown.push(noLeader) + n.debug("Selected %q for new leader, stepping down due to leadership transfer", maybeLeader) + ae := newEntry(EntryLeaderTransfer, []byte(maybeLeader)) + n.sendAppendEntry([]*Entry{ae}) } + // Force us to stepdown here. + n.stepdown(noLeader) + return nil } @@ -1609,95 +1621,35 @@ func (n *raft) Created() time.Time { } func (n *raft) Stop() { - n.shutdown(false) + n.shutdown() +} + +func (n *raft) WaitForStop() { + if n.state.Load() == int32(Closed) { + n.wg.Wait() + } } func (n *raft) Delete() { - n.shutdown(true) -} + n.shutdown() + n.wg.Wait() -func (n *raft) shutdown(shouldDelete bool) { n.Lock() + defer n.Unlock() - // Returned swap value is the previous state. 
It looks counter-intuitive - // to do this atomic operation with the lock held, but we have to do so in - // order to make sure that switchState() is not already running. If it is - // then it can potentially update the n.state back to a non-closed state, - // allowing shutdown() to be called again. If that happens then the below - // close(n.quit) will panic from trying to close an already-closed channel. - if n.state.Swap(int32(Closed)) == int32(Closed) { - // If we get called again with shouldDelete, in case we were called first with Stop() cleanup - if shouldDelete { - if wal := n.wal; wal != nil { - wal.Delete() - } - os.RemoveAll(n.sd) - } - n.Unlock() - return - } - - close(n.quit) - if c := n.c; c != nil { - var subs []*subscription - c.mu.Lock() - for _, sub := range c.subs { - subs = append(subs, sub) - } - c.mu.Unlock() - for _, sub := range subs { - n.unsubscribe(sub) - } - c.closeConnection(InternalClient) - n.c = nil - } - - s, g, wal := n.s, n.group, n.wal - - // Unregistering ipQueues do not prevent them from push/pop - // just will remove them from the central monitoring map - queues := []interface { - unregister() - drain() - }{n.reqs, n.votes, n.prop, n.entry, n.resp, n.apply, n.stepdown} - for _, q := range queues { - q.drain() - q.unregister() - } - sd := n.sd - n.Unlock() - - s.unregisterRaftNode(g) - - if wal != nil { - if shouldDelete { - wal.Delete() - } else { - wal.Stop() - } - } - - if shouldDelete { - // Delete all our peer state and vote state and any snapshots. - os.RemoveAll(sd) - n.debug("Deleted") - } else { - n.debug("Shutdown") - } -} - -// Wipe will force an on disk state reset and then call Delete(). -// Useful in case we have been stopped before this point. -func (n *raft) Wipe() { - n.RLock() - wal := n.wal - n.RUnlock() - // Delete our underlying storage. - if wal != nil { + if wal := n.wal; wal != nil { wal.Delete() } - // Now call delete. - n.Delete() + os.RemoveAll(n.sd) + n.debug("Deleted") +} + +func (n *raft) shutdown() { + // First call to Stop or Delete should close the quit chan + // to notify the runAs goroutines to stop what they're doing. + if n.state.Swap(int32(Closed)) != int32(Closed) { + close(n.quit) + } } const ( @@ -1818,6 +1770,7 @@ func (n *raft) resetElectWithLock(et time.Duration) { func (n *raft) run() { s := n.s defer s.grWG.Done() + defer n.wg.Done() // We want to wait for some routing to be enabled, so we will wait for // at least a route, leaf or gateway connection to be established before @@ -1850,6 +1803,7 @@ func (n *raft) run() { // Send nil entry to signal the upper layers we are done doing replay/restore. n.apply.push(nil) +runner: for s.isRunning() { switch n.State() { case Follower: @@ -1859,9 +1813,47 @@ func (n *raft) run() { case Leader: n.runAsLeader() case Closed: - return + break runner } } + + // If we've reached this point then we're shutting down, either because + // the server is stopping or because the Raft group is closing/closed. 
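Stop and Delete now funnel through a single shutdown() that atomically swaps the state to Closed and closes the quit channel exactly once; the heavier teardown (unsubscribing, draining queues, stopping the WAL) happens on the run goroutine's way out, and Delete additionally waits on the WaitGroup before removing storage. A minimal sketch of the close-once idiom, assuming only a state word and a quit channel:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	stateOpen int32 = iota
	stateClosed
)

type group struct {
	state atomic.Int32
	quit  chan struct{}
}

// shutdown is safe to call multiple times: only the caller that first
// moves the state to Closed gets to close the quit channel.
func (g *group) shutdown() {
	if g.state.Swap(stateClosed) != stateClosed {
		close(g.quit)
	}
}

func main() {
	g := &group{quit: make(chan struct{})}
	g.shutdown()
	g.shutdown() // second call is a no-op, no panic on double close
	<-g.quit
	fmt.Println("closed once")
}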
+ n.Lock() + defer n.Unlock() + + if c := n.c; c != nil { + var subs []*subscription + c.mu.Lock() + for _, sub := range c.subs { + subs = append(subs, sub) + } + c.mu.Unlock() + for _, sub := range subs { + n.unsubscribe(sub) + } + c.closeConnection(InternalClient) + n.c = nil + } + + // Unregistering ipQueues do not prevent them from push/pop + // just will remove them from the central monitoring map + queues := []interface { + unregister() + drain() + }{n.reqs, n.votes, n.prop, n.entry, n.resp, n.apply} + for _, q := range queues { + q.drain() + q.unregister() + } + + n.s.unregisterRaftNode(n.group) + + if wal := n.wal; wal != nil { + wal.Stop() + } + + n.debug("Shutdown") } func (n *raft) debug(format string, args ...any) { @@ -1947,7 +1939,7 @@ func (n *raft) processAppendEntries() { // runAsFollower is called by run and will block for as long as the node is // running in the follower state. func (n *raft) runAsFollower() { - for { + for n.State() == Follower { elect := n.electTimer() select { @@ -1956,7 +1948,6 @@ func (n *raft) runAsFollower() { n.processAppendEntries() case <-n.s.quitCh: // The server is shutting down. - n.shutdown(false) return case <-n.quit: // The Raft node is shutting down. @@ -1989,22 +1980,17 @@ func (n *raft) runAsFollower() { n.debug("Ignoring old vote response, we have stepped down") n.votes.popOne() case <-n.resp.ch: - // We're receiving append entry responses from the network, probably because - // we have only just stepped down and they were already in flight. Ignore them. - n.resp.popOne() + // Ignore append entry responses received from before the state change. + n.resp.drain() + case <-n.prop.ch: + // Ignore proposals received from before the state change. + n.prop.drain() case <-n.reqs.ch: // We've just received a vote request from the network. // Because of drain() it is possible that we get nil from popOne(). if voteReq, ok := n.reqs.popOne(); ok { n.processVoteRequest(voteReq) } - case <-n.stepdown.ch: - // We've received a stepdown request, start following the new leader if - // we can. - if newLeader, ok := n.stepdown.popOne(); ok { - n.switchToFollower(newLeader) - return - } } } } @@ -2095,6 +2081,26 @@ func (ae *appendEntry) returnToPool() { aePool.Put(ae) } +// Pool for proposedEntry re-use. +var pePool = sync.Pool{ + New: func() any { + return &proposedEntry{} + }, +} + +// Create a new proposedEntry. +func newProposedEntry(entry *Entry, reply string) *proposedEntry { + pe := pePool.Get().(*proposedEntry) + pe.Entry, pe.reply = entry, reply + return pe +} + +// Will return this proosed entry. +func (pe *proposedEntry) returnToPool() { + pe.Entry, pe.reply = nil, _EMPTY_ + pePool.Put(pe) +} + type EntryType uint8 const ( @@ -2304,7 +2310,7 @@ func (n *raft) handleForwardedRemovePeerProposal(sub *subscription, c *client, _ // Need to copy since this is underlying client/route buffer. peer := copyBytes(msg) - prop.push(newEntry(EntryRemovePeer, peer)) + prop.push(newProposedEntry(newEntry(EntryRemovePeer, peer), reply)) } // Called when a peer has forwarded a proposal. 
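proposedEntry values are recycled through a sync.Pool, with both fields cleared before being returned so a pooled object never leaks a stale reply subject into the next proposal. A small sketch of that reuse pattern with an illustrative wrapper type:

package main

import (
	"fmt"
	"sync"
)

type item struct {
	data  []byte
	reply string
}

var itemPool = sync.Pool{New: func() any { return &item{} }}

func newItem(data []byte, reply string) *item {
	it := itemPool.Get().(*item)
	it.data, it.reply = data, reply
	return it
}

// returnToPool clears the fields so the next Get starts clean.
func (it *item) returnToPool() {
	it.data, it.reply = nil, ""
	itemPool.Put(it)
}

func main() {
	it := newItem([]byte("payload"), "_INBOX.reply")
	fmt.Println(string(it.data), it.reply)
	it.returnToPool()
}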
@@ -2325,7 +2331,7 @@ func (n *raft) handleForwardedProposal(sub *subscription, c *client, _ *Account, return } - prop.push(newEntry(EntryNormal, msg)) + prop.push(newProposedEntry(newEntry(EntryNormal, msg), reply)) } func (n *raft) runAsLeader() { @@ -2340,7 +2346,7 @@ func (n *raft) runAsLeader() { fsub, err := n.subscribe(psubj, n.handleForwardedProposal) if err != nil { n.warn("Error subscribing to forwarded proposals: %v", err) - n.stepdown.push(noLeader) + n.stepdownLocked(noLeader) n.Unlock() return } @@ -2348,7 +2354,7 @@ func (n *raft) runAsLeader() { if err != nil { n.warn("Error subscribing to forwarded remove peer proposals: %v", err) n.unsubscribe(fsub) - n.stepdown.push(noLeader) + n.stepdownLocked(noLeader) n.Unlock() return } @@ -2374,7 +2380,6 @@ func (n *raft) runAsLeader() { for n.State() == Leader { select { case <-n.s.quitCh: - n.shutdown(false) return case <-n.quit: return @@ -2394,16 +2399,7 @@ func (n *raft) runAsLeader() { if b.Type == EntryRemovePeer { n.doRemovePeerAsLeader(string(b.Data)) } - entries = append(entries, b) - // If this is us sending out a leadership transfer stepdown inline here. - if b.Type == EntryLeaderTransfer { - // Send out what we have and switch to follower. - n.sendAppendEntry(entries) - n.prop.recycle(&es) - n.debug("Stepping down due to leadership transfer") - n.switchToFollower(noLeader) - return - } + entries = append(entries, b.Entry) // Increment size. sz += len(b.Data) + 1 // If below thresholds go ahead and send. @@ -2419,6 +2415,13 @@ func (n *raft) runAsLeader() { if len(entries) > 0 { n.sendAppendEntry(entries) } + // Respond to any proposals waiting for a confirmation. + for _, pe := range es { + if pe.reply != _EMPTY_ { + n.sendReply(pe.reply, nil) + } + pe.returnToPool() + } n.prop.recycle(&es) case <-hb.C: @@ -2427,7 +2430,7 @@ func (n *raft) runAsLeader() { } case <-lq.C: if n.lostQuorum() { - n.switchToFollower(noLeader) + n.stepdown(noLeader) return } case <-n.votes.ch: @@ -2437,7 +2440,7 @@ func (n *raft) runAsLeader() { continue } if vresp.term > n.Term() { - n.switchToFollower(noLeader) + n.stepdown(noLeader) return } n.trackPeer(vresp.peer) @@ -2446,11 +2449,6 @@ func (n *raft) runAsLeader() { if voteReq, ok := n.reqs.popOne(); ok { n.processVoteRequest(voteReq) } - case <-n.stepdown.ch: - if newLeader, ok := n.stepdown.popOne(); ok { - n.switchToFollower(newLeader) - return - } case <-n.entry.ch: n.processAppendEntries() } @@ -2584,7 +2582,6 @@ func (n *raft) runCatchup(ar *appendEntryResponse, indexUpdatesQ *ipQueue[uint64 for n.Leader() { select { case <-n.s.quitCh: - n.shutdown(false) return case <-n.quit: return @@ -2621,7 +2618,7 @@ func (n *raft) sendSnapshotToFollower(subject string) (uint64, error) { snap, err := n.loadLastSnapshot() if err != nil { // We need to stepdown here when this happens. - n.stepdown.push(noLeader) + n.stepdownLocked(noLeader) // We need to reset our state here as well. n.resetWAL() return 0, err @@ -2687,7 +2684,7 @@ func (n *raft) catchupFollower(ar *appendEntryResponse) { n.warn("Request from follower for entry at index [%d] errored for state %+v - %v", start, state, err) if err == ErrStoreEOF { // If we are here we are seeing a request for an item beyond our state, meaning we should stepdown. - n.stepdown.push(noLeader) + n.stepdownLocked(noLeader) n.Unlock() arPool.Put(ar) return @@ -2699,7 +2696,7 @@ func (n *raft) catchupFollower(ar *appendEntryResponse) { // If we are here we are seeing a request for an item we do not have, meaning we should stepdown. 
// This is possible on a reset of our WAL but the other side has a snapshot already. // If we do not stepdown this can cycle. - n.stepdown.push(noLeader) + n.stepdownLocked(noLeader) n.Unlock() arPool.Put(ar) return @@ -2713,7 +2710,11 @@ func (n *raft) catchupFollower(ar *appendEntryResponse) { n.progress[ar.peer] = indexUpdates n.Unlock() - n.s.startGoRoutine(func() { n.runCatchup(ar, indexUpdates) }) + n.wg.Add(1) + n.s.startGoRoutine(func() { + defer n.wg.Done() + n.runCatchup(ar, indexUpdates) + }) } func (n *raft) loadEntry(index uint64) (*appendEntry, error) { @@ -2752,7 +2753,7 @@ func (n *raft) applyCommit(index uint64) error { if err != ErrStoreClosed && err != ErrStoreEOF { n.warn("Got an error loading %d index: %v - will reset", index, err) if n.State() == Leader { - n.stepdown.push(n.selectNextLeader()) + n.stepdownLocked(n.selectNextLeader()) } // Reset and cancel any catchup. n.resetWAL() @@ -2829,7 +2830,7 @@ func (n *raft) applyCommit(index uint64) error { // If this is us and we are the leader we should attempt to stepdown. if peer == n.id && n.State() == Leader { - n.stepdown.push(n.selectNextLeader()) + n.stepdownLocked(n.selectNextLeader()) } // Remove from string intern map. @@ -2960,16 +2961,18 @@ func (n *raft) runAsCandidate() { n.ID(): {}, } - for { + for n.State() == Candidate { elect := n.electTimer() select { case <-n.entry.ch: n.processAppendEntries() case <-n.resp.ch: - // Ignore - n.resp.popOne() + // Ignore append entry responses received from before the state change. + n.resp.drain() + case <-n.prop.ch: + // Ignore proposals received from before the state change. + n.prop.drain() case <-n.s.quitCh: - n.shutdown(false) return case <-n.quit: return @@ -3003,8 +3006,8 @@ func (n *raft) runAsCandidate() { n.term = vresp.term n.vote = noVote n.writeTermVote() - n.stepdown.push(noLeader) n.lxfer = false + n.stepdownLocked(noLeader) n.Unlock() } case <-n.reqs.ch: @@ -3012,11 +3015,6 @@ func (n *raft) runAsCandidate() { if voteReq, ok := n.reqs.popOne(); ok { n.processVoteRequest(voteReq) } - case <-n.stepdown.ch: - if newLeader, ok := n.stepdown.popOne(); ok { - n.switchToFollower(newLeader) - return - } } } } @@ -3098,7 +3096,7 @@ func (n *raft) truncateWAL(term, index uint64) { defer func() { // Check to see if we invalidated any snapshots that might have held state // from the entries we are truncating. - if snap, _ := n.loadLastSnapshot(); snap != nil && snap.lastIndex >= index { + if snap, _ := n.loadLastSnapshot(); snap != nil && snap.lastIndex > index { os.Remove(n.snapfile) n.snapfile = _EMPTY_ } @@ -3128,7 +3126,7 @@ func (n *raft) truncateWAL(term, index uint64) { } } // Set after we know we have truncated properly. - n.term, n.pterm, n.pindex = term, term, index + n.pterm, n.pindex = term, index } // Reset our WAL. This is equivalent to truncating all data from the log. @@ -3177,7 +3175,7 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { n.writeTermVote() } n.debug("Received append entry from another leader, stepping down to %q", ae.leader) - n.stepdown.push(ae.leader) + n.stepdownLocked(ae.leader) } else { // Let them know we are the leader. ar := newAppendEntryResponse(n.term, n.pindex, n.id, false) @@ -3194,19 +3192,18 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { // another node has taken on the leader role already, so we should convert // to a follower of that node instead. if n.State() == Candidate { - // Ignore old terms, otherwise we might end up stepping down incorrectly. 
- // Needs to be ahead of our pterm (last log index), as an isolated node - // could have bumped its vote term up considerably past this point. - if ae.term >= n.pterm { + // If we have a leader in the current term or higher, we should stepdown, + // write the term and vote if the term of the request is higher. + if ae.term >= n.term { // If the append entry term is newer than the current term, erase our // vote. if ae.term > n.term { + n.term = ae.term n.vote = noVote + n.writeTermVote() } n.debug("Received append entry in candidate state from %q, converting to follower", ae.leader) - n.term = ae.term - n.writeTermVote() - n.stepdown.push(ae.leader) + n.stepdownLocked(ae.leader) } } @@ -3261,7 +3258,6 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { // If this term is greater than ours. if ae.term > n.term { - n.pterm = ae.pterm n.term = ae.term n.vote = noVote if isNew { @@ -3269,8 +3265,15 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { } if n.State() != Follower { n.debug("Term higher than ours and we are not a follower: %v, stepping down to %q", n.State(), ae.leader) - n.stepdown.push(ae.leader) + n.stepdownLocked(ae.leader) } + } else if ae.term < n.term && !catchingUp && isNew { + n.debug("Rejected AppendEntry from a leader (%s) with term %d which is less than ours", ae.leader, ae.term) + ar := newAppendEntryResponse(n.term, n.pindex, n.id, false) + n.Unlock() + n.sendRPC(ae.reply, _EMPTY_, ar.encode(arbuf)) + arPool.Put(ar) + return } if isNew && n.leader != ae.leader && n.State() == Follower { @@ -3281,29 +3284,46 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { n.updateLeadChange(false) } - if (isNew && ae.pterm != n.pterm) || ae.pindex != n.pindex { + if ae.pterm != n.pterm || ae.pindex != n.pindex { // Check if this is a lower or equal index than what we were expecting. if ae.pindex <= n.pindex { - n.debug("AppendEntry detected pindex less than ours: %d:%d vs %d:%d", ae.pterm, ae.pindex, n.pterm, n.pindex) + n.debug("AppendEntry detected pindex less than/equal to ours: %d:%d vs %d:%d", ae.pterm, ae.pindex, n.pterm, n.pindex) var ar *appendEntryResponse - var success bool - if eae, _ := n.loadEntry(ae.pindex); eae == nil { + + if ae.pindex < n.commit { + // If we have already committed this entry, just mark success. + success = true + } else if eae, _ := n.loadEntry(ae.pindex); eae == nil { // If terms are equal, and we are not catching up, we have simply already processed this message. // So we will ACK back to the leader. This can happen on server restarts based on timings of snapshots. if ae.pterm == n.pterm && !catchingUp { success = true + } else if ae.pindex == n.pindex { + // Check if only our terms do not match here. + // Make sure pterms match and we take on the leader's. + // This prevents constant spinning. + n.truncateWAL(ae.pterm, ae.pindex) } else { n.resetWAL() } + } else if eae.term == ae.pterm { + // If terms match we can delete all entries past this one, and then continue storing the current entry. + n.truncateWAL(ae.pterm, ae.pindex) + // Only continue if truncation was successful, and we ended up such that we can safely continue. + if ae.pterm == n.pterm && ae.pindex == n.pindex { + goto CONTINUE + } } else { - // If terms mismatched, or we got an error loading, delete that entry and all others past it. + // If terms mismatched, delete that entry and all others past it. // Make sure to cancel any catchups in progress. // Truncate will reset our pterm and pindex. 
Only do so if we have an entry. n.truncateWAL(eae.pterm, eae.pindex) } - // Cancel regardless. - n.cancelCatchup() + // Cancel regardless if unsuccessful. + if !success { + n.cancelCatchup() + } // Create response. ar = newAppendEntryResponse(ae.pterm, ae.pindex, n.id, success) @@ -3326,16 +3346,6 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { return } - // Check if only our terms do not match here. - if ae.pindex == n.pindex { - // Make sure pterms match and we take on the leader's. - // This prevents constant spinning. - n.truncateWAL(ae.pterm, ae.pindex) - n.cancelCatchup() - n.Unlock() - return - } - if ps, err := decodePeerState(ae.entries[1].Data); err == nil { n.processPeerState(ps) // Also need to copy from client's buffer. @@ -3375,23 +3385,19 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { n.apply.push(newCommittedEntry(n.commit, ae.entries[:1])) n.Unlock() return - - } else { - n.debug("AppendEntry did not match %d %d with %d %d", ae.pterm, ae.pindex, n.pterm, n.pindex) - // Reset our term. - n.term = n.pterm - if ae.pindex > n.pindex { - // Setup our state for catching up. - inbox := n.createCatchup(ae) - ar := newAppendEntryResponse(n.pterm, n.pindex, n.id, false) - n.Unlock() - n.sendRPC(ae.reply, inbox, ar.encode(arbuf)) - arPool.Put(ar) - return - } } + + // Setup our state for catching up. + n.debug("AppendEntry did not match %d %d with %d %d", ae.pterm, ae.pindex, n.pterm, n.pindex) + inbox := n.createCatchup(ae) + ar := newAppendEntryResponse(n.pterm, n.pindex, n.id, false) + n.Unlock() + n.sendRPC(ae.reply, inbox, ar.encode(arbuf)) + arPool.Put(ar) + return } +CONTINUE: // Save to our WAL if we have entries. if ae.shouldStore() { // Only store if an original which will have sub != nil @@ -3528,9 +3534,8 @@ func (n *raft) processAppendEntryResponse(ar *appendEntryResponse) { n.term = ar.term n.vote = noVote n.writeTermVote() - n.warn("Detected another leader with higher term, will stepdown and reset") - n.stepdown.push(noLeader) - n.resetWAL() + n.warn("Detected another leader with higher term, will stepdown") + n.stepdownLocked(noLeader) n.Unlock() arPool.Put(ar) } else if ar.reply != _EMPTY_ { @@ -3577,7 +3582,7 @@ func (n *raft) storeToWAL(ae *appendEntry) error { if index := ae.pindex + 1; index != seq { n.warn("Wrong index, ae is %+v, index stored was %d, n.pindex is %d, will reset", ae, seq, n.pindex) if n.State() == Leader { - n.stepdown.push(n.selectNextLeader()) + n.stepdownLocked(n.selectNextLeader()) } // Reset and cancel any catchup. n.resetWAL() @@ -3771,12 +3776,7 @@ func writePeerState(sd string, ps *peerState) error { if _, err := os.Stat(psf); err != nil && !os.IsNotExist(err) { return err } - - <-dios - err := os.WriteFile(psf, encodePeerState(ps), defaultFilePerms) - dios <- struct{}{} - - return err + return writeFileWithSync(psf, encodePeerState(ps), defaultFilePerms) } func readPeerState(sd string) (ps *peerState, err error) { @@ -3800,12 +3800,7 @@ func writeTermVote(sd string, wtv []byte) error { if _, err := os.Stat(psf); err != nil && !os.IsNotExist(err) { return err } - - <-dios - err := os.WriteFile(psf, wtv, defaultFilePerms) - dios <- struct{}{} - - return err + return writeFileWithSync(psf, wtv, defaultFilePerms) } // readTermVote will read the largest term and who we voted from to stable storage. 
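The raw os.WriteFile calls guarded by the dios semaphore are replaced with a writeFileWithSync helper. Its implementation is not part of this hunk; the intent is a write that reaches stable storage before the call returns, so peer and term/vote state survive a crash immediately after the write. A plausible sketch, assuming that behaviour:

package main

import (
	"log"
	"os"
)

// writeFileWithSyncSketch writes data and fsyncs before returning.
// This is an assumption about the helper's behaviour, not its source.
func writeFileWithSyncSketch(name string, data []byte, perm os.FileMode) error {
	f, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm)
	if err != nil {
		return err
	}
	defer f.Close()
	if _, err := f.Write(data); err != nil {
		return err
	}
	return f.Sync()
}

func main() {
	if err := writeFileWithSyncSketch("peers.tmp", []byte("state"), 0o644); err != nil {
		log.Fatal(err)
	}
}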
@@ -3977,9 +3972,10 @@ func (n *raft) processVoteRequest(vr *voteRequest) error { if n.State() != Follower { n.debug("Stepping down from %s, detected higher term: %d vs %d", strings.ToLower(n.State().String()), vr.term, n.term) - n.stepdown.push(noLeader) - n.term = vr.term + n.stepdownLocked(noLeader) } + n.cancelCatchup() + n.term = vr.term n.vote = noVote n.writeTermVote() } @@ -4081,20 +4077,26 @@ func (n *raft) updateLeadChange(isLeader bool) { // Lock should be held. func (n *raft) switchState(state RaftState) { +retry: pstate := n.State() if pstate == Closed { return } + // Set our state. If something else has changed our state + // then retry, this will either be a Stop or Delete call. + if !n.state.CompareAndSwap(int32(pstate), int32(state)) { + goto retry + } + // Reset the election timer. n.resetElectionTimeout() - // Set our state. - n.state.Store(int32(state)) if pstate == Leader && state != Leader { n.updateLeadChange(false) - // Drain the response queue. + // Drain the append entry response and proposal queues. n.resp.drain() + n.prop.drain() } else if state == Leader && pstate != Leader { if len(n.pae) > 0 { n.pae = make(map[uint64]*appendEntry) @@ -4111,13 +4113,17 @@ const ( ) func (n *raft) switchToFollower(leader string) { + n.Lock() + defer n.Unlock() + + n.switchToFollowerLocked(leader) +} + +func (n *raft) switchToFollowerLocked(leader string) { if n.State() == Closed { return } - n.Lock() - defer n.Unlock() - n.debug("Switching to follower") n.lxfer = false @@ -4134,7 +4140,9 @@ func (n *raft) switchToCandidate() { defer n.Unlock() // If we are catching up or are in observer mode we can not switch. - if n.observer || n.paused { + // Avoid petitioning to become leader if we're behind on applies. + if n.observer || n.paused || n.applied < n.commit { + n.resetElect(minElectionTimeout / 4) return } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/reload.go b/vendor/github.com/nats-io/nats-server/v2/server/reload.go index 347fcfd8b7..07e5d021ad 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/reload.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/reload.go @@ -2172,15 +2172,22 @@ func (s *Server) reloadClusterPermissions(oldPerms *RoutePermissions) { } deleteRoutedSubs = deleteRoutedSubs[:0] route.mu.Lock() + pa, _, hasSubType := route.getRoutedSubKeyInfo() for key, sub := range route.subs { - if an := strings.Fields(key)[0]; an != accName { - continue + // If this is not a pinned-account route, we need to get the + // account name from the key to see if we collect this sub. + if !pa { + if an := getAccNameFromRoutedSubKey(sub, key, hasSubType); an != accName { + continue + } } // If we can't export, we need to drop the subscriptions that // we have on behalf of this route. + // Need to make a string cast here since canExport call sl.Match() subj := string(sub.subject) if !route.canExport(subj) { - delete(route.subs, string(sub.sid)) + // We can use bytesToString() here. 
+ delete(route.subs, bytesToString(sub.sid)) deleteRoutedSubs = append(deleteRoutedSubs, sub) } } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/route.go b/vendor/github.com/nats-io/nats-server/v2/server/route.go index 0341f79868..0c455547c9 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/route.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/route.go @@ -74,6 +74,7 @@ type route struct { didSolicit bool retry bool lnoc bool + lnocu bool routeType RouteType url *url.URL authRequired bool @@ -112,6 +113,7 @@ type connectInfo struct { Cluster string `json:"cluster"` Dynamic bool `json:"cluster_dynamic,omitempty"` LNOC bool `json:"lnoc,omitempty"` + LNOCU bool `json:"lnocu,omitempty"` // Support for LS- with origin cluster name Gateway string `json:"gateway,omitempty"` } @@ -767,6 +769,7 @@ func (c *client) processRouteInfo(info *Info) { c.route.gatewayURL = info.GatewayURL c.route.remoteName = info.Name c.route.lnoc = info.LNOC + c.route.lnocu = info.LNOCU c.route.jetstream = info.JetStream // When sent through route INFO, if the field is set, it should be of size 1. @@ -1169,6 +1172,36 @@ type asubs struct { subs []*subscription } +// Returns the account name from the subscription's key. +// This is invoked knowing that the key contains an account name, so for a sub +// that is not from a pinned-account route. +// The `keyHasSubType` boolean indicates that the key starts with the indicator +// for leaf or regular routed subscriptions. +func getAccNameFromRoutedSubKey(sub *subscription, key string, keyHasSubType bool) string { + var accIdx int + if keyHasSubType { + // Start after the sub type indicator. + accIdx = 1 + // But if there is an origin, bump its index. + if len(sub.origin) > 0 { + accIdx = 2 + } + } + return strings.Fields(key)[accIdx] +} + +// Returns if the route is dedicated to an account, its name, and a boolean +// that indicates if this route uses the routed subscription indicator at +// the beginning of the subscription key. +// Lock held on entry. +func (c *client) getRoutedSubKeyInfo() (bool, string, bool) { + var accName string + if an := c.route.accName; len(an) > 0 { + accName = string(an) + } + return accName != _EMPTY_, accName, c.route.lnocu +} + // removeRemoteSubs will walk the subs and remove them from the appropriate account. func (c *client) removeRemoteSubs() { // We need to gather these on a per account basis. @@ -1178,14 +1211,18 @@ func (c *client) removeRemoteSubs() { srv := c.srv subs := c.subs c.subs = nil + pa, accountName, hasSubType := c.getRoutedSubKeyInfo() c.mu.Unlock() for key, sub := range subs { c.mu.Lock() sub.max = 0 c.mu.Unlock() - // Grab the account - accountName := strings.Fields(key)[0] + // If not a pinned-account route, we need to find the account + // name from the sub's key. + if !pa { + accountName = getAccNameFromRoutedSubKey(sub, key, hasSubType) + } ase := as[accountName] if ase == nil { if v, ok := srv.accounts.Load(accountName); ok { @@ -1197,10 +1234,14 @@ func (c *client) removeRemoteSubs() { } else { ase.subs = append(ase.subs, sub) } - if srv.gateway.enabled { - srv.gatewayUpdateSubInterest(accountName, sub, -1) + delta := int32(1) + if len(sub.queue) > 0 { + delta = sub.qw } - ase.acc.updateLeafNodes(sub, -1) + if srv.gateway.enabled { + srv.gatewayUpdateSubInterest(accountName, sub, -delta) + } + ase.acc.updateLeafNodes(sub, -delta) } // Now remove the subs by batch for each account sublist. 
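With the LNOCU changes, routed subscription keys gain a leading type indicator ('R' for regular routed subs, 'L' for routed leaf subs) and, for leaf subs, an origin, so the account name is no longer always the first field of the key. A sketch of the key shapes and of pulling the account out of them, mirroring the layouts described in the diff's comments:

package main

import (
	"fmt"
	"strings"
)

// Key shapes assumed here, following the comments in the change:
//   "R <account> <subject> [<queue>]"           regular routed sub
//   "L <origin> <account> <subject> [<queue>]"  routed leaf sub
func accountFromKey(key string, hasSubType, hasOrigin bool) string {
	idx := 0
	if hasSubType {
		idx = 1
		if hasOrigin {
			idx = 2
		}
	}
	return strings.Fields(key)[idx]
}

func main() {
	fmt.Println(accountFromKey("R ACC foo.bar", true, false))       // ACC
	fmt.Println(accountFromKey("L hub ACC foo.bar q1", true, true)) // ACC
	fmt.Println(accountFromKey("ACC foo.bar", false, false))        // old-style key
}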
@@ -1217,8 +1258,9 @@ func (c *client) removeRemoteSubs() { // Lock is held on entry func (c *client) removeRemoteSubsForAcc(name string) []*subscription { var subs []*subscription + _, _, hasSubType := c.getRoutedSubKeyInfo() for key, sub := range c.subs { - an := strings.Fields(key)[0] + an := getAccNameFromRoutedSubKey(sub, key, hasSubType) if an == name { sub.max = 0 subs = append(subs, sub) @@ -1228,46 +1270,69 @@ func (c *client) removeRemoteSubsForAcc(name string) []*subscription { return subs } -func (c *client) parseUnsubProto(arg []byte) (string, []byte, []byte, error) { +func (c *client) parseUnsubProto(arg []byte, accInProto, hasOrigin bool) ([]byte, string, []byte, []byte, error) { // Indicate any activity, so pub and sub or unsubs. c.in.subs++ args := splitArg(arg) - var queue []byte - var accountName string - subjIdx := 1 - c.mu.Lock() - if c.kind == ROUTER && c.route != nil { - if accountName = string(c.route.accName); accountName != _EMPTY_ { - subjIdx = 0 - } + var ( + origin []byte + accountName string + queue []byte + subjIdx int + ) + // If `hasOrigin` is true, then it means this is a LS- with origin in proto. + if hasOrigin { + // We would not be here if there was not at least 1 field. + origin = args[0] + subjIdx = 1 + } + // If there is an account in the protocol, bump the subject index. + if accInProto { + subjIdx++ } - c.mu.Unlock() switch len(args) { case subjIdx + 1: case subjIdx + 2: queue = args[subjIdx+1] default: - return _EMPTY_, nil, nil, fmt.Errorf("parse error: '%s'", arg) + return nil, _EMPTY_, nil, nil, fmt.Errorf("parse error: '%s'", arg) } - if accountName == _EMPTY_ { - accountName = string(args[0]) + if accInProto { + // If there is an account in the protocol, it is before the subject. + accountName = string(args[subjIdx-1]) } - return accountName, args[subjIdx], queue, nil + return origin, accountName, args[subjIdx], queue, nil } // Indicates no more interest in the given account/subject for the remote side. -func (c *client) processRemoteUnsub(arg []byte) (err error) { +func (c *client) processRemoteUnsub(arg []byte, leafUnsub bool) (err error) { srv := c.srv if srv == nil { return nil } - accountName, subject, _, err := c.parseUnsubProto(arg) + + var accountName string + // Assume the account will be in the protocol. + accInProto := true + + c.mu.Lock() + originSupport := c.route.lnocu + if c.route != nil && len(c.route.accName) > 0 { + accountName, accInProto = string(c.route.accName), false + } + c.mu.Unlock() + + hasOrigin := leafUnsub && originSupport + _, accNameFromProto, subject, _, err := c.parseUnsubProto(arg, accInProto, hasOrigin) if err != nil { return fmt.Errorf("processRemoteUnsub %s", err.Error()) } + if accInProto { + accountName = accNameFromProto + } // Lookup the account var acc *Account if v, ok := srv.accounts.Load(accountName); ok { @@ -1284,28 +1349,43 @@ func (c *client) processRemoteUnsub(arg []byte) (err error) { } updateGWs := false - // We store local subs by account and subject and optionally queue name. - // RS- will have the arg exactly as the key. + + _keya := [128]byte{} + _key := _keya[:0] + var key string - if c.kind == ROUTER && c.route != nil && len(c.route.accName) > 0 { - key = accountName + " " + bytesToString(arg) - } else { + if !originSupport { + // If it is an LS- or RS-, we use the protocol as-is as the key. key = bytesToString(arg) + } else { + // We need to prefix with the sub type. 
+ if leafUnsub { + _key = append(_key, keyRoutedLeafSubByte) + } else { + _key = append(_key, keyRoutedSubByte) + } + _key = append(_key, ' ') + _key = append(_key, arg...) + key = bytesToString(_key) } + delta := int32(1) sub, ok := c.subs[key] if ok { delete(c.subs, key) acc.sl.Remove(sub) updateGWs = srv.gateway.enabled + if len(sub.queue) > 0 { + delta = sub.qw + } } c.mu.Unlock() if updateGWs { - srv.gatewayUpdateSubInterest(accountName, sub, -1) + srv.gatewayUpdateSubInterest(accountName, sub, -delta) } // Now check on leafnode updates. - acc.updateLeafNodes(sub, -1) + acc.updateLeafNodes(sub, -delta) if c.opts.Verbose { c.sendOK() @@ -1322,35 +1402,78 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { return nil } - // Copy so we do not reference a potentially large buffer - arg := make([]byte, len(argo)) - copy(arg, argo) - - args := splitArg(arg) - sub := &subscription{client: c} - - // This value indicate what is the mandatory subject offset in the args - // slice. It varies based on the optional presence of origin or account name - // fields (tha latter would not be present for "per-account" routes). - var subjIdx int - // If account is present, this is its "char" position in arg slice. - var accPos int - if hasOrigin { - // Set to 1, will be adjusted if the account is also expected. - subjIdx = 1 - sub.origin = args[0] - // The account would start after the origin and trailing space. - accPos = len(sub.origin) + 1 - } + // We copy `argo` to not reference the read buffer. However, we will + // prefix with a code that says if the remote sub is for a leaf + // (hasOrigin == true) or not to prevent key collisions. Imagine: + // "RS+ foo bar baz 1\r\n" => "foo bar baz" (a routed queue sub) + // "LS+ foo bar baz\r\n" => "foo bar baz" (a route leaf sub on "baz", + // for account "bar" with origin "foo"). + // + // The sub.sid/key will be set respectively to "R foo bar baz" and + // "L foo bar baz". + // + // We also no longer add the account if it was not present (due to + // pinned-account route) since there is no need really. + // + // For routes to older server, we will still create the "arg" with + // the above layout, but we will create the sub.sid/key as before, + // that is, not including the origin for LS+ because older server + // only send LS- without origin, so we would not be able to find + // the sub in the map. c.mu.Lock() accountName := string(c.route.accName) + oldStyle := !c.route.lnocu c.mu.Unlock() - // If the route is dedicated to an account, accountName will not - // be empty. If it is, then the account must be in the protocol. - var accInProto bool - if accountName == _EMPTY_ { + + // Indicate if the account name should be in the protocol. It would be the + // case if accountName is empty. + accInProto := accountName == _EMPTY_ + + // Copy so we do not reference a potentially large buffer. + // Add 2 more bytes for the routed sub type. + arg := make([]byte, 0, 2+len(argo)) + if hasOrigin { + arg = append(arg, keyRoutedLeafSubByte) + } else { + arg = append(arg, keyRoutedSubByte) + } + arg = append(arg, ' ') + arg = append(arg, argo...) + + // Now split to get all fields. Unroll splitArgs to avoid runtime/heap issues. 
+ a := [MAX_RSUB_ARGS][]byte{} + args := a[:0] + start := -1 + for i, b := range arg { + switch b { + case ' ', '\t', '\r', '\n': + if start >= 0 { + args = append(args, arg[start:i]) + start = -1 + } + default: + if start < 0 { + start = i + } + } + } + if start >= 0 { + args = append(args, arg[start:]) + } + + delta := int32(1) + sub := &subscription{client: c} + + // There will always be at least a subject, but its location will depend + // on if there is an origin, an account name, etc.. Since we know that + // we have added the sub type indicator as the first field, the subject + // position will be at minimum at index 1. + subjIdx := 1 + if hasOrigin { + subjIdx++ + } + if accInProto { subjIdx++ - accInProto = true } switch len(args) { case subjIdx + 1: @@ -1358,15 +1481,50 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { case subjIdx + 3: sub.queue = args[subjIdx+1] sub.qw = int32(parseSize(args[subjIdx+2])) + // TODO: (ik) We should have a non empty queue name and a queue + // weight >= 1. For 2.11, we may want to return an error if that + // is not the case, but for now just overwrite `delta` if queue + // weight is greater than 1 (it is possible after a reconnect/ + // server restart to receive a queue weight > 1 for a new sub). + if sub.qw > 1 { + delta = sub.qw + } default: return fmt.Errorf("processRemoteSub Parse Error: '%s'", arg) } + // We know that the number of fields is correct. So we can access args[] based + // on where we expect the fields to be. + + // If there is an origin, it will be at index 1. + if hasOrigin { + sub.origin = args[1] + } + // For subject, use subjIdx. sub.subject = args[subjIdx] - // If the account name is empty (not a "per-account" route), the account - // is at the index prior to the subject. - if accountName == _EMPTY_ { + // If the account name is in the protocol, it will be before the subject. + if accInProto { accountName = bytesToString(args[subjIdx-1]) } + // Now set the sub.sid from the arg slice. However, we will have a different + // one if we use the origin or not. + start = 0 + end := len(arg) + if sub.queue != nil { + // Remove the ' ' from the arg length. + end -= 1 + len(args[subjIdx+2]) + } + if oldStyle { + // We will start at the account (if present) or at the subject. + // We first skip the "R " or "L " + start = 2 + // And if there is an origin skip that. + if hasOrigin { + start += len(sub.origin) + 1 + } + // Here we are pointing at the account (if present), or at the subject. + } + sub.sid = arg[start:end] + // Lookup account while avoiding fetch. // A slow fetch delays subsequent remote messages. It also avoids the expired check (see below). // With all but memory resolver lookup can be delayed or fail. @@ -1424,33 +1582,6 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { return nil } - // We store local subs by account and subject and optionally queue name. - // If we have a queue it will have a trailing weight which we do not want. - if sub.queue != nil { - // if the account is in the protocol, we can reference directly "arg", - // otherwise, we need to allocate/construct the sid. - if accInProto { - sub.sid = arg[accPos : accPos+len(accountName)+1+len(sub.subject)+1+len(sub.queue)] - } else { - // It is unfortunate that we have to do this, but the gain of not - // having the account name in message protocols outweight the - // penalty of having to do this here for the processing of a - // subscription. - sub.sid = append(sub.sid, accountName...) 
- sub.sid = append(sub.sid, ' ') - sub.sid = append(sub.sid, sub.subject...) - sub.sid = append(sub.sid, ' ') - sub.sid = append(sub.sid, sub.queue...) - } - } else if accInProto { - sub.sid = arg[accPos:] - } else { - sub.sid = append(sub.sid, accountName...) - sub.sid = append(sub.sid, ' ') - sub.sid = append(sub.sid, sub.subject...) - } - key := bytesToString(sub.sid) - acc.mu.RLock() // For routes (this can be called by leafnodes), check if the account is // transitioning (from pool to dedicated route) and this route is not a @@ -1465,9 +1596,11 @@ func (c *client) processRemoteSub(argo []byte, hasOrigin bool) (err error) { } sl := acc.sl acc.mu.RUnlock() + + // We use the sub.sid for the key of the c.subs map. + key := bytesToString(sub.sid) osub := c.subs[key] updateGWs := false - delta := int32(1) if osub == nil { c.subs[key] = sub // Now place into the account sl. @@ -1509,10 +1642,14 @@ func (c *client) addRouteSubOrUnsubProtoToBuf(buf []byte, accName string, sub *s if isSubProto { buf = append(buf, lSubBytes...) buf = append(buf, sub.origin...) + buf = append(buf, ' ') } else { buf = append(buf, lUnsubBytes...) + if c.route.lnocu { + buf = append(buf, sub.origin...) + buf = append(buf, ' ') + } } - buf = append(buf, ' ') } else { if isSubProto { buf = append(buf, rSubBytes...) @@ -1613,18 +1750,27 @@ func (s *Server) sendSubsToRoute(route *client, idx int, account string) { for _, a := range accs { a.mu.RLock() for key, n := range a.rm { - var subj, qn []byte - s := strings.Split(key, " ") - subj = []byte(s[0]) - if len(s) > 1 { - qn = []byte(s[1]) + var origin, qn []byte + s := strings.Fields(key) + // Subject will always be the second field (index 1). + subj := stringToBytes(s[1]) + // Check if the key is for a leaf (will be field 0). + forLeaf := s[0] == keyRoutedLeafSub + // For queue, if not for a leaf, we need 3 fields "R foo bar", + // but if for a leaf, we need 4 fields "L foo bar leaf_origin". + if l := len(s); (!forLeaf && l == 3) || (forLeaf && l == 4) { + qn = stringToBytes(s[2]) } - // s[0] is the subject and already as a string, so use that + if forLeaf { + // The leaf origin will be the last field. + origin = stringToBytes(s[len(s)-1]) + } + // s[1] is the subject and already as a string, so use that // instead of converting back `subj` to a string. - if !route.canImport(s[0]) { + if !route.canImport(s[1]) { continue } - sub := subscription{subject: subj, queue: qn, qw: n} + sub := subscription{origin: origin, subject: subj, queue: qn, qw: n} buf = route.addRouteSubOrUnsubProtoToBuf(buf, a.Name, &sub, true) } a.mu.RUnlock() @@ -2286,8 +2432,9 @@ func (s *Server) updateRouteSubscriptionMap(acc *Account, sub *subscription, del return } - // Create the fast key which will use the subject or 'subjectqueue' for queue subscribers. - key := keyFromSub(sub) + // Create the subscription key which will prevent collisions between regular + // and leaf routed subscriptions. See keyFromSubWithOrigin() for details. + key := keyFromSubWithOrigin(sub) // Decide whether we need to send an update out to all the routes. update := isq @@ -2481,6 +2628,7 @@ func (s *Server) startRouteAcceptLoop() { Domain: s.info.Domain, Dynamic: s.isClusterNameDynamic(), LNOC: true, + LNOCU: true, } // For tests that want to simulate old servers, do not set the compression // on the INFO protocol if configured with CompressionNotSupported. 
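Because older servers never sent the origin on LS-, the new behaviour is gated on a capability flag exchanged in INFO/CONNECT (LNOCU): the origin is always written for LS+, but only added to LS- when the peer has advertised support. A sketch of that gating when building the protocol line; the function and parameter names are illustrative:

package main

import "fmt"

// buildUnsub appends an LS- line, adding the origin only when the remote
// route advertised origin-aware unsubs; otherwise the old layout is kept.
func buildUnsub(buf []byte, origin, acc, subj string, peerSupportsOrigin bool) []byte {
	buf = append(buf, "LS- "...)
	if peerSupportsOrigin && origin != "" {
		buf = append(buf, origin...)
		buf = append(buf, ' ')
	}
	buf = append(buf, acc...)
	buf = append(buf, ' ')
	buf = append(buf, subj...)
	return append(buf, "\r\n"...)
}

func main() {
	fmt.Printf("%q\n", buildUnsub(nil, "hub", "ACC", "foo", true))
	fmt.Printf("%q\n", buildUnsub(nil, "hub", "ACC", "foo", false))
}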
@@ -2795,6 +2943,7 @@ func (c *client) processRouteConnect(srv *Server, arg []byte, lang string) error c.mu.Lock() c.route.remoteID = c.opts.Name c.route.lnoc = proto.LNOC + c.route.lnocu = proto.LNOCU c.setRoutePermissions(perms) c.headers = supportsHeaders && proto.Headers c.mu.Unlock() diff --git a/vendor/github.com/nats-io/nats-server/v2/server/sendq.go b/vendor/github.com/nats-io/nats-server/v2/server/sendq.go index 0287c5548a..e567d7aeee 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/sendq.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/sendq.go @@ -56,6 +56,8 @@ func (sq *sendq) internalLoop() { rply [256]byte szb [10]byte hdb [10]byte + _msg [4096]byte + msg = _msg[:0] ) for s.isRunning() { @@ -73,16 +75,18 @@ func (sq *sendq) internalLoop() { } else { c.pa.reply = nil } - var msg []byte + msg = msg[:0] if len(pm.hdr) > 0 { c.pa.hdr = len(pm.hdr) c.pa.hdb = append(hdb[:0], strconv.Itoa(c.pa.hdr)...) - msg = append(pm.hdr, pm.msg...) + msg = append(msg, pm.hdr...) + msg = append(msg, pm.msg...) msg = append(msg, _CRLF_...) } else { c.pa.hdr = -1 c.pa.hdb = nil - msg = append(pm.msg, _CRLF_...) + msg = append(msg, pm.msg...) + msg = append(msg, _CRLF_...) } c.processInboundClientMsg(msg) c.pa.szb = nil @@ -107,16 +111,7 @@ func (sq *sendq) send(subj, rply string, hdr, msg []byte) { } out := outMsgPool.Get().(*outMsg) out.subj, out.rply = subj, rply - out.hdr, out.msg = nil, nil - - // We will copy these for now. - if len(hdr) > 0 { - hdr = copyBytes(hdr) - out.hdr = hdr - } - if len(msg) > 0 { - msg = copyBytes(msg) - out.msg = msg - } + out.hdr = append(out.hdr[:0], hdr...) + out.msg = append(out.msg[:0], msg...) sq.q.push(out) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/server.go b/vendor/github.com/nats-io/nats-server/v2/server/server.go index 099a466ca8..81013d1e1b 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/server.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/server.go @@ -94,6 +94,7 @@ type Info struct { Import *SubjectPermission `json:"import,omitempty"` Export *SubjectPermission `json:"export,omitempty"` LNOC bool `json:"lnoc,omitempty"` + LNOCU bool `json:"lnocu,omitempty"` InfoOnConnect bool `json:"info_on_connect,omitempty"` // When true the server will respond to CONNECT with an INFO ConnectInfo bool `json:"connect_info,omitempty"` // When true this is the server INFO response to CONNECT RoutePoolSize int `json:"route_pool_size,omitempty"` @@ -140,8 +141,10 @@ type Server struct { listenerErr error gacc *Account sys *internal + sysAcc atomic.Pointer[Account] js atomic.Pointer[jetStream] isMetaLeader atomic.Bool + jsClustered atomic.Bool accounts sync.Map tmpAccounts sync.Map // Temporarily stores accounts that are being built activeAccounts int32 @@ -1280,6 +1283,7 @@ func (s *Server) configureAccounts(reloading bool) (map[string]struct{}, error) if err == nil && s.sys != nil && acc != s.sys.account { // sys.account.clients (including internal client)/respmap/etc... are transferred separately s.sys.account = acc + s.sysAcc.Store(acc) } if err != nil { return awcsti, fmt.Errorf("error resolving system account: %v", err) @@ -1635,13 +1639,7 @@ func (s *Server) SetSystemAccount(accName string) error { // SystemAccount returns the system account if set. func (s *Server) SystemAccount() *Account { - var sacc *Account - s.mu.RLock() - if s.sys != nil { - sacc = s.sys.account - } - s.mu.RUnlock() - return sacc + return s.sysAcc.Load() } // GlobalAccount returns the global account. 
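SystemAccount() becomes a lock-free read from an atomic.Pointer that is refreshed whenever the system account is (re)configured, avoiding the server read lock on a hot path. A minimal sketch of the caching pattern:

package main

import (
	"fmt"
	"sync/atomic"
)

type Account struct{ Name string }

type Server struct {
	sysAcc atomic.Pointer[Account]
}

// setSystemAccount stores the pointer for fast, lock-free lookups.
func (s *Server) setSystemAccount(a *Account) { s.sysAcc.Store(a) }

// SystemAccount returns the cached system account, or nil if unset.
func (s *Server) SystemAccount() *Account { return s.sysAcc.Load() }

func main() {
	s := &Server{}
	fmt.Println(s.SystemAccount() == nil) // true until configured
	s.setSystemAccount(&Account{Name: "$SYS"})
	fmt.Println(s.SystemAccount().Name)
}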
@@ -1713,6 +1711,9 @@ func (s *Server) setSystemAccount(acc *Account) error { s.sys.wg.Add(1) s.mu.Unlock() + // Store in atomic for fast lookup. + s.sysAcc.Store(acc) + // Register with the account. s.sys.client.registerWithAccount(acc) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/store.go b/vendor/github.com/nats-io/nats-server/v2/server/store.go index 661959d172..72e039816e 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/store.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/store.go @@ -101,6 +101,7 @@ type StreamStore interface { SubjectsState(filterSubject string) map[string]SimpleState SubjectsTotals(filterSubject string) map[string]uint64 NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) + NumPendingMulti(sseq uint64, sl *Sublist, lastPerSubject bool) (total, validThrough uint64) State() StreamState FastState(*StreamState) EncodedStreamState(failed uint64) (enc []byte, err error) @@ -291,12 +292,16 @@ type DeleteRange struct { } func (dr *DeleteRange) State() (first, last, num uint64) { - return dr.First, dr.First + dr.Num, dr.Num + deletesAfterFirst := dr.Num + if deletesAfterFirst > 0 { + deletesAfterFirst-- + } + return dr.First, dr.First + deletesAfterFirst, dr.Num } // Range will range over all the deleted sequences represented by this block. func (dr *DeleteRange) Range(f func(uint64) bool) { - for seq := dr.First; seq <= dr.First+dr.Num; seq++ { + for seq := dr.First; seq < dr.First+dr.Num; seq++ { if !f(seq) { return } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stream.go b/vendor/github.com/nats-io/nats-server/v2/server/stream.go index bfc75b3c1c..a3a7c8fdc7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stream.go @@ -1580,8 +1580,8 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account) (StreamConfi // Config returns the stream's configuration. func (mset *stream) config() StreamConfig { - mset.mu.RLock() - defer mset.mu.RUnlock() + mset.cfgMu.RLock() + defer mset.cfgMu.RUnlock() return mset.cfg } @@ -3536,7 +3536,6 @@ func (mset *stream) resetSourceInfo() { } } -// Lock should be held. // This will do a reverse scan on startup or leader election // searching for the starting sequence number. // This can be slow in degenerative cases. @@ -3575,6 +3574,15 @@ func (mset *stream) startingSequenceForSources() { } }() + update := func(iName string, seq uint64) { + // Only update active in case we have older ones in here that got configured out. + if si := mset.sources[iName]; si != nil { + if _, ok := seqs[iName]; !ok { + seqs[iName] = seq + } + } + } + var smv StoreMsg for seq := state.LastSeq; seq >= state.FirstSeq; seq-- { sm, err := mset.store.LoadMsg(seq, &smv) @@ -3586,15 +3594,6 @@ func (mset *stream) startingSequenceForSources() { continue } - var update = func(iName string, seq uint64) { - // Only update active in case we have older ones in here that got configured out. 
- if si := mset.sources[iName]; si != nil { - if _, ok := seqs[iName]; !ok { - seqs[iName] = seq - } - } - } - streamName, iName, sseq := streamAndSeq(string(ss)) if iName == _EMPTY_ { // Pre-2.10 message header means it's a match for any source using that stream name for _, ssi := range mset.cfg.Sources { @@ -3676,12 +3675,17 @@ func (mset *stream) subscribeToStream() error { } else if len(mset.cfg.Sources) > 0 && mset.sourcesConsumerSetup == nil { // Setup the initial source infos for the sources mset.resetSourceInfo() - // Delay the actual source consumer(s) creation(s) for after a delay - mset.sourcesConsumerSetup = time.AfterFunc(time.Duration(rand.Intn(int(500*time.Millisecond)))+100*time.Millisecond, func() { - mset.mu.Lock() + // Delay the actual source consumer(s) creation(s) for after a delay if a replicated stream. + // If it's an R1, this is done at startup and we will do inline. + if mset.cfg.Replicas == 1 { mset.setupSourceConsumers() - mset.mu.Unlock() - }) + } else { + mset.sourcesConsumerSetup = time.AfterFunc(time.Duration(rand.Intn(int(500*time.Millisecond)))+100*time.Millisecond, func() { + mset.mu.Lock() + mset.setupSourceConsumers() + mset.mu.Unlock() + }) + } } // Check for direct get access. // We spin up followers for clustered streams in monitorStream(). @@ -4676,11 +4680,14 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, // Check for republish. if republish { + const ht = "NATS/1.0\r\nNats-Stream: %s\r\nNats-Subject: %s\r\nNats-Sequence: %d\r\nNats-Time-Stamp: %s\r\nNats-Last-Sequence: %d\r\n\r\n" + const htho = "NATS/1.0\r\nNats-Stream: %s\r\nNats-Subject: %s\r\nNats-Sequence: %d\r\nNats-Time-Stamp: %s\r\nNats-Last-Sequence: %d\r\nNats-Msg-Size: %d\r\n\r\n" + // When adding to existing headers, will use the fmt.Append version so this skips the headers from above. + const hoff = 10 + tsStr := time.Unix(0, ts).UTC().Format(time.RFC3339Nano) var rpMsg []byte if len(hdr) == 0 { - const ht = "NATS/1.0\r\nNats-Stream: %s\r\nNats-Subject: %s\r\nNats-Sequence: %d\r\nNats-Time-Stamp: %s\r\nNats-Last-Sequence: %d\r\n\r\n" - const htho = "NATS/1.0\r\nNats-Stream: %s\r\nNats-Subject: %s\r\nNats-Sequence: %d\r\nNats-Time-Stamp: %s\r\nNats-Last-Sequence: %d\r\nNats-Msg-Size: %d\r\n\r\n" if !thdrsOnly { hdr = fmt.Appendf(nil, ht, name, subject, seq, tsStr, tlseq) rpMsg = copyBytes(msg) @@ -4688,19 +4695,16 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, hdr = fmt.Appendf(nil, htho, name, subject, seq, tsStr, tlseq, len(msg)) } } else { - // Slow path. - hdr = genHeader(hdr, JSStream, name) - hdr = genHeader(hdr, JSSubject, subject) - hdr = genHeader(hdr, JSSequence, strconv.FormatUint(seq, 10)) - hdr = genHeader(hdr, JSTimeStamp, tsStr) - hdr = genHeader(hdr, JSLastSequence, strconv.FormatUint(tlseq, 10)) + // use hdr[:end:end] to make sure as we add we copy the original hdr. + end := len(hdr) - LEN_CR_LF if !thdrsOnly { + hdr = fmt.Appendf(hdr[:end:end], ht[hoff:], name, subject, seq, tsStr, tlseq) rpMsg = copyBytes(msg) } else { - hdr = genHeader(hdr, JSMsgSize, strconv.Itoa(len(msg))) + hdr = fmt.Appendf(hdr[:end:end], htho[hoff:], name, subject, seq, tsStr, tlseq, len(msg)) } } - mset.outq.send(newJSPubMsg(tsubj, _EMPTY_, _EMPTY_, copyBytes(hdr), rpMsg, nil, seq)) + mset.outq.send(newJSPubMsg(tsubj, _EMPTY_, _EMPTY_, hdr, rpMsg, nil, seq)) } // Send response here. 
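The republish fast path above reuses one format string for both the empty-header and existing-header cases: hoff = 10 skips the 10-byte "NATS/1.0\r\n" prefix when fields are appended to headers that already exist, and the full slice expression hdr[:end:end] pins the capacity so fmt.Appendf is forced to copy into a fresh buffer rather than write over the caller's bytes beyond the truncated trailing CRLF. A small illustration of that slice trick using a subset of the republish fields:

package main

import "fmt"

func main() {
	// Existing headers always end with a blank line (CRLF CRLF).
	hdr := []byte("NATS/1.0\r\nFoo: bar\r\n\r\n")

	const crlfLen = 2
	// Drop the trailing blank line so new fields can be appended; the format
	// string below ends with CRLF CRLF and restores the terminator.
	end := len(hdr) - crlfLen

	// hdr[:end:end] caps the slice at end, so this append must allocate a new
	// backing array instead of overwriting bytes the caller still holds.
	out := fmt.Appendf(hdr[:end:end], "Nats-Stream: %s\r\nNats-Sequence: %d\r\n\r\n", "ORDERS", 42)

	fmt.Printf("%q\n", hdr) // original buffer is untouched
	fmt.Printf("%q\n", out) // original fields plus the republish fields
}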
@@ -4819,6 +4823,9 @@ func newJSPubMsg(dsubj, subj, reply string, hdr, msg []byte, o *consumer, seq ui if pm != nil { m = pm.(*jsPubMsg) buf = m.buf[:0] + if hdr != nil { + hdr = append(m.hdr[:0], hdr...) + } } else { m = new(jsPubMsg) } @@ -4847,6 +4854,9 @@ func (pm *jsPubMsg) returnToPool() { if len(pm.buf) > 0 { pm.buf = pm.buf[:0] } + if len(pm.hdr) > 0 { + pm.hdr = pm.hdr[:0] + } jsPubMsgPool.Put(pm) } @@ -5178,8 +5188,6 @@ func (mset *stream) stop(deleteFlag, advisory bool) error { n.Delete() sa = mset.sa } else { - // Always attempt snapshot on clean exit. - n.InstallSnapshot(mset.stateSnapshotLocked()) n.Stop() } } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stree/dump.go b/vendor/github.com/nats-io/nats-server/v2/server/stree/dump.go index 60f03e4aad..12c62f3bef 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stree/dump.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stree/dump.go @@ -50,6 +50,7 @@ func (t *SubjectTree[T]) dump(w io.Writer, n node, depth int) { // For individual node/leaf dumps. func (n *leaf[T]) kind() string { return "LEAF" } func (n *node4) kind() string { return "NODE4" } +func (n *node10) kind() string { return "NODE10" } func (n *node16) kind() string { return "NODE16" } func (n *node48) kind() string { return "NODE48" } func (n *node256) kind() string { return "NODE256" } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stree/node10.go b/vendor/github.com/nats-io/nats-server/v2/server/stree/node10.go new file mode 100644 index 0000000000..37cd2cc946 --- /dev/null +++ b/vendor/github.com/nats-io/nats-server/v2/server/stree/node10.go @@ -0,0 +1,106 @@ +// Copyright 2023-2024 The NATS Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stree + +// Node with 10 children +// This node size is for the particular case that a part of the subject is numeric +// in nature, i.e. it only needs to satisfy the range 0-9 without wasting bytes +// Order of struct fields for best memory alignment (as per govet/fieldalignment) +type node10 struct { + child [10]node + meta + key [10]byte +} + +func newNode10(prefix []byte) *node10 { + nn := &node10{} + nn.setPrefix(prefix) + return nn +} + +// Currently we do not keep node10 sorted or use bitfields for traversal so just add to the end. +// TODO(dlc) - We should revisit here with more detailed benchmarks. +func (n *node10) addChild(c byte, nn node) { + if n.size >= 10 { + panic("node10 full!") + } + n.key[n.size] = c + n.child[n.size] = nn + n.size++ +} + +func (n *node10) findChild(c byte) *node { + for i := uint16(0); i < n.size; i++ { + if n.key[i] == c { + return &n.child[i] + } + } + return nil +} + +func (n *node10) isFull() bool { return n.size >= 10 } + +func (n *node10) grow() node { + nn := newNode16(n.prefix) + for i := 0; i < 10; i++ { + nn.addChild(n.key[i], n.child[i]) + } + return nn +} + +// Deletes a child from the node. 
+func (n *node10) deleteChild(c byte) { + for i, last := uint16(0), n.size-1; i < n.size; i++ { + if n.key[i] == c { + // Unsorted so just swap in last one here, else nil if last. + if i < last { + n.key[i] = n.key[last] + n.child[i] = n.child[last] + n.key[last] = 0 + n.child[last] = nil + } else { + n.key[i] = 0 + n.child[i] = nil + } + n.size-- + return + } + } +} + +// Shrink if needed and return new node, otherwise return nil. +func (n *node10) shrink() node { + if n.size > 4 { + return nil + } + nn := newNode4(nil) + for i := uint16(0); i < n.size; i++ { + nn.addChild(n.key[i], n.child[i]) + } + return nn +} + +// Iterate over all children calling func f. +func (n *node10) iter(f func(node) bool) { + for i := uint16(0); i < n.size; i++ { + if !f(n.child[i]) { + return + } + } +} + +// Return our children as a slice. +func (n *node10) children() []node { + return n.child[:n.size] +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stree/node16.go b/vendor/github.com/nats-io/nats-server/v2/server/stree/node16.go index c0c12aafd5..e2dc97908d 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stree/node16.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stree/node16.go @@ -79,10 +79,10 @@ func (n *node16) deleteChild(c byte) { // Shrink if needed and return new node, otherwise return nil. func (n *node16) shrink() node { - if n.size > 4 { + if n.size > 10 { return nil } - nn := newNode4(nil) + nn := newNode10(nil) for i := uint16(0); i < n.size; i++ { nn.addChild(n.key[i], n.child[i]) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stree/node4.go b/vendor/github.com/nats-io/nats-server/v2/server/stree/node4.go index 6aeb024abf..4eddf11b83 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stree/node4.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stree/node4.go @@ -49,7 +49,7 @@ func (n *node4) findChild(c byte) *node { func (n *node4) isFull() bool { return n.size >= 4 } func (n *node4) grow() node { - nn := newNode16(n.prefix) + nn := newNode10(n.prefix) for i := 0; i < 4; i++ { nn.addChild(n.key[i], n.child[i]) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stree/stree.go b/vendor/github.com/nats-io/nats-server/v2/server/stree/stree.go index a289a62974..828631888f 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stree/stree.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stree/stree.go @@ -283,7 +283,7 @@ func (t *SubjectTree[T]) delete(np *node, subject []byte, si int) (*T, bool) { func (t *SubjectTree[T]) match(n node, parts [][]byte, pre []byte, cb func(subject []byte, val *T)) { // Capture if we are sitting on a terminal fwc. 
var hasFWC bool - if lp := len(parts); lp > 0 && parts[lp-1][0] == fwc { + if lp := len(parts); lp > 0 && len(parts[lp-1]) > 0 && parts[lp-1][0] == fwc { hasFWC = true } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/sublist.go b/vendor/github.com/nats-io/nats-server/v2/server/sublist.go index 5c1325cc68..b7650ede6f 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/sublist.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/sublist.go @@ -20,6 +20,8 @@ import ( "sync" "sync/atomic" "unicode/utf8" + + "github.com/nats-io/nats-server/v2/server/stree" ) // Sublist is a routing mechanism to handle subject distribution and @@ -1731,3 +1733,44 @@ func getAllNodes(l *level, results *SublistResult) { getAllNodes(n.next, results) } } + +// IntersectStree will match all items in the given subject tree that +// have interest expressed in the given sublist. The callback will only be called +// once for each subject, regardless of overlapping subscriptions in the sublist. +func IntersectStree[T any](st *stree.SubjectTree[T], sl *Sublist, cb func(subj []byte, entry *T)) { + var _subj [255]byte + intersectStree(st, sl.root, _subj[:0], cb) +} + +func intersectStree[T any](st *stree.SubjectTree[T], r *level, subj []byte, cb func(subj []byte, entry *T)) { + if r.numNodes() == 0 { + st.Match(subj, cb) + return + } + nsubj := subj + if len(nsubj) > 0 { + nsubj = append(subj, '.') + } + switch { + case r.fwc != nil: + // We've reached a full wildcard, do a FWC match on the stree at this point + // and don't keep iterating downward. + nsubj := append(nsubj, '>') + st.Match(nsubj, cb) + case r.pwc != nil: + // We've found a partial wildcard. We'll keep iterating downwards, but first + // check whether there's interest at this level (without triggering dupes) and + // match if so. + nsubj := append(nsubj, '*') + if len(r.pwc.psubs)+len(r.pwc.qsubs) > 0 && r.pwc.next != nil && r.pwc.next.numNodes() > 0 { + st.Match(nsubj, cb) + } + intersectStree(st, r.pwc.next, nsubj, cb) + case r.numNodes() > 0: + // Normal node with subject literals, keep iterating. + for t, n := range r.nodes { + nsubj := append(nsubj, t...) + intersectStree(st, n.next, nsubj, cb) + } + } +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/websocket.go b/vendor/github.com/nats-io/nats-server/v2/server/websocket.go index 6fce09dd9f..69e6e1a9a7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/websocket.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/websocket.go @@ -67,7 +67,6 @@ const ( wsCloseStatusProtocolError = 1002 wsCloseStatusUnsupportedData = 1003 wsCloseStatusNoStatusReceived = 1005 - wsCloseStatusAbnormalClosure = 1006 wsCloseStatusInvalidPayloadData = 1007 wsCloseStatusPolicyViolation = 1008 wsCloseStatusMessageTooBig = 1009 @@ -458,9 +457,21 @@ func (c *client) wsHandleControlFrame(r *wsReadInfo, frameType wsOpCode, nc io.R } } } - clm := wsCreateCloseMessage(status, body) + // If the status indicates that nothing was received, then we don't + // send anything back. + // From https://datatracker.ietf.org/doc/html/rfc6455#section-7.4 + // it says that code 1005 is a reserved value and MUST NOT be set as a + // status code in a Close control frame by an endpoint. It is + // designated for use in applications expecting a status code to indicate + // that no status code was actually present. 
+ var clm []byte + if status != wsCloseStatusNoStatusReceived { + clm = wsCreateCloseMessage(status, body) + } c.wsEnqueueControlMessage(wsCloseMessage, clm) - nbPoolPut(clm) // wsEnqueueControlMessage has taken a copy. + if len(clm) > 0 { + nbPoolPut(clm) // wsEnqueueControlMessage has taken a copy. + } // Return io.EOF so that readLoop will close the connection as ClientClosed // after processing pending buffers. return pos, io.EOF @@ -647,10 +658,11 @@ func (c *client) wsEnqueueCloseMessage(reason ClosedState) { status = wsCloseStatusProtocolError case MaxPayloadExceeded: status = wsCloseStatusMessageTooBig - case ServerShutdown: + case WriteError, ReadError, StaleConnection, ServerShutdown: + // We used to have WriteError, ReadError and StaleConnection result in + // code 1006, which the spec says that it must not be used to set the + // status in the close message. So using this one instead. status = wsCloseStatusGoingAway - case WriteError, ReadError, StaleConnection: - status = wsCloseStatusAbnormalClosure default: status = wsCloseStatusInternalSrvError } @@ -1316,7 +1328,19 @@ func (c *client) wsCollapsePtoNB() (net.Buffers, int64) { } var csz int for _, b := range nb { - cp.Write(b) + for len(b) > 0 { + n, err := cp.Write(b) + if err != nil { + if err == io.EOF { + break + } + c.Errorf("Error during compression: %v", err) + c.markConnAsClosed(WriteError) + nbPoolPut(b) + return nil, 0 + } + b = b[n:] + } nbPoolPut(b) // No longer needed as contents written to compressor. } if err := cp.Flush(); err != nil { diff --git a/vendor/github.com/nats-io/nkeys/keypair.go b/vendor/github.com/nats-io/nkeys/keypair.go index 9d05518069..69ebe21f75 100644 --- a/vendor/github.com/nats-io/nkeys/keypair.go +++ b/vendor/github.com/nats-io/nkeys/keypair.go @@ -1,4 +1,4 @@ -// Copyright 2018-2022 The NATS Authors +// Copyright 2018-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -15,10 +15,9 @@ package nkeys import ( "bytes" + "crypto/ed25519" "crypto/rand" "io" - - "golang.org/x/crypto/ed25519" ) // kp is the internal struct for a kepypair using seed. @@ -31,7 +30,7 @@ const seedLen = 32 // CreatePair will create a KeyPair based on the rand entropy and a type/prefix byte. func CreatePair(prefix PrefixByte) (KeyPair, error) { - return CreatePairWithRand(prefix, rand.Reader) + return CreatePairWithRand(prefix, nil) } // CreatePair will create a KeyPair based on the rand reader and a type/prefix byte. rand can be nil. @@ -39,17 +38,12 @@ func CreatePairWithRand(prefix PrefixByte, rr io.Reader) (KeyPair, error) { if prefix == PrefixByteCurve { return CreateCurveKeysWithRand(rr) } - if rr == nil { - rr = rand.Reader - } - var rawSeed [seedLen]byte - - _, err := io.ReadFull(rr, rawSeed[:]) + _, priv, err := ed25519.GenerateKey(rr) if err != nil { return nil, err } - seed, err := EncodeSeed(prefix, rawSeed[:]) + seed, err := EncodeSeed(prefix, priv.Seed()) if err != nil { return nil, err } diff --git a/vendor/github.com/nats-io/nkeys/public.go b/vendor/github.com/nats-io/nkeys/public.go index c3cd21edb7..a6e88c9ba5 100644 --- a/vendor/github.com/nats-io/nkeys/public.go +++ b/vendor/github.com/nats-io/nkeys/public.go @@ -1,4 +1,4 @@ -// Copyright 2018 The NATS Authors +// Copyright 2018-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -14,10 +14,9 @@ package nkeys import ( + "crypto/ed25519" "crypto/rand" "io" - - "golang.org/x/crypto/ed25519" ) // A KeyPair from a public key capable of verifying only. diff --git a/vendor/github.com/nats-io/nkeys/xkeys.go b/vendor/github.com/nats-io/nkeys/xkeys.go index 78f8b99e1d..7951fb713d 100644 --- a/vendor/github.com/nats-io/nkeys/xkeys.go +++ b/vendor/github.com/nats-io/nkeys/xkeys.go @@ -1,4 +1,4 @@ -// Copyright 2022-2023 The NATS Authors +// Copyright 2022-2024 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -15,6 +15,7 @@ package nkeys import ( "bytes" + "crypto/ed25519" "crypto/rand" "encoding/binary" "io" @@ -40,17 +41,18 @@ type ckp struct { // CreateCurveKeys will create a Curve typed KeyPair. func CreateCurveKeys() (KeyPair, error) { - return CreateCurveKeysWithRand(rand.Reader) + return CreateCurveKeysWithRand(nil) } // CreateCurveKeysWithRand will create a Curve typed KeyPair // with specified rand source. func CreateCurveKeysWithRand(rr io.Reader) (KeyPair, error) { var kp ckp - _, err := io.ReadFull(rr, kp.seed[:]) + _, priv, err := ed25519.GenerateKey(rr) if err != nil { return nil, err } + kp.seed = [curveKeyLen]byte(priv.Seed()) return &kp, nil } diff --git a/vendor/github.com/rogpeppe/go-internal/semver/forward.go b/vendor/github.com/rogpeppe/go-internal/semver/forward.go deleted file mode 100644 index ad55780155..0000000000 --- a/vendor/github.com/rogpeppe/go-internal/semver/forward.go +++ /dev/null @@ -1,39 +0,0 @@ -// Package semver is a thin forwarding layer on top of -// [golang.org/x/mod/semver]. See that package for documentation. -// -// Deprecated: use [golang.org/x/mod/semver] instead. 
-package semver - -import "golang.org/x/mod/semver" - -func IsValid(v string) bool { - return semver.IsValid(v) -} - -func Canonical(v string) string { - return semver.Canonical(v) -} - -func Major(v string) string { - return semver.Major(v) -} - -func MajorMinor(v string) string { - return semver.MajorMinor(v) -} - -func Prerelease(v string) string { - return semver.Prerelease(v) -} - -func Build(v string) string { - return semver.Build(v) -} - -func Compare(v, w string) int { - return semver.Compare(v, w) -} - -func Max(v, w string) string { - return semver.Max(v, w) -} diff --git a/vendor/modules.txt b/vendor/modules.txt index ff2ec133c7..e5cf0484cd 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1006,11 +1006,11 @@ github.com/mschoch/smat # github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 ## explicit github.com/munnerz/goautoneg -# github.com/nats-io/jwt/v2 v2.5.8 -## explicit; go 1.18 +# github.com/nats-io/jwt/v2 v2.7.3 +## explicit; go 1.22 github.com/nats-io/jwt/v2 -# github.com/nats-io/nats-server/v2 v2.10.22 -## explicit; go 1.21.0 +# github.com/nats-io/nats-server/v2 v2.10.24 +## explicit; go 1.22 github.com/nats-io/nats-server/v2/conf github.com/nats-io/nats-server/v2/internal/fastrand github.com/nats-io/nats-server/v2/internal/ldap @@ -1028,7 +1028,7 @@ github.com/nats-io/nats.go github.com/nats-io/nats.go/encoders/builtin github.com/nats-io/nats.go/internal/parser github.com/nats-io/nats.go/util -# github.com/nats-io/nkeys v0.4.7 +# github.com/nats-io/nkeys v0.4.9 ## explicit; go 1.20 github.com/nats-io/nkeys # github.com/nats-io/nuid v1.0.1 @@ -1679,7 +1679,6 @@ github.com/rogpeppe/go-internal/internal/syscall/windows github.com/rogpeppe/go-internal/internal/syscall/windows/sysdll github.com/rogpeppe/go-internal/lockedfile github.com/rogpeppe/go-internal/lockedfile/internal/filelock -github.com/rogpeppe/go-internal/semver # github.com/rs/cors v1.11.1 ## explicit; go 1.13 github.com/rs/cors
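On the nkeys side, CreatePair and CreateCurveKeys now route key generation through crypto/ed25519 and simply pass a nil reader; that is safe because ed25519.GenerateKey is documented to fall back to crypto/rand.Reader when rand is nil. A quick sketch of what the new call path relies on (standard library only, not nkeys internals):

package main

import (
	"crypto/ed25519"
	"fmt"
)

func main() {
	// A nil reader means "use crypto/rand.Reader", which is why the nkeys
	// constructors above can pass nil straight through.
	pub, priv, err := ed25519.GenerateKey(nil)
	if err != nil {
		panic(err)
	}
	// nkeys encodes this 32-byte seed (plus a prefix byte and checksum)
	// into its textual seed format.
	fmt.Printf("public key:  %x\n", pub)
	fmt.Printf("seed length: %d\n", len(priv.Seed()))
}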