diff --git a/go.mod b/go.mod index 839a08a52..a9d2d9748 100644 --- a/go.mod +++ b/go.mod @@ -142,6 +142,7 @@ require ( github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op // indirect github.com/armon/go-radix v1.0.0 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect + github.com/aymerick/douceur v0.2.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bitly/go-simplejson v0.5.0 // indirect github.com/bits-and-blooms/bitset v1.22.0 // indirect @@ -256,6 +257,7 @@ require ( github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/renameio/v2 v2.0.1 // indirect github.com/gookit/goutil v0.7.1 // indirect + github.com/gorilla/css v1.0.1 // indirect github.com/gorilla/handlers v1.5.1 // indirect github.com/gorilla/schema v1.4.1 // indirect github.com/gorilla/websocket v1.5.3 // indirect @@ -301,6 +303,7 @@ require ( github.com/mattn/go-sqlite3 v1.14.33 // indirect github.com/maxymania/go-system v0.0.0-20170110133659-647cc364bf0b // indirect github.com/mendsley/gojwk v0.0.0-20141217222730-4d5ec6e58103 // indirect + github.com/microcosm-cc/bluemonday v1.0.27 // indirect github.com/miekg/dns v1.1.57 // indirect github.com/mileusna/useragent v1.3.5 // indirect github.com/minio/crc64nvme v1.1.1 // indirect diff --git a/go.sum b/go.sum index 95f9ec314..2d79bf545 100644 --- a/go.sum +++ b/go.sum @@ -138,6 +138,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkY github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= github.com/aws/aws-sdk-go v1.37.27/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= +github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= +github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= github.com/bbalet/stopwords v1.0.0 h1:0TnGycCtY0zZi4ltKoOGRFIlZHv0WqpoIGUsObjztfo= github.com/bbalet/stopwords v1.0.0/go.mod h1:sAWrQoDMfqARGIn4s6dp7OW7ISrshUD8IP2q3KoqPjc= github.com/beevik/etree v1.6.0 h1:u8Kwy8pp9D9XeITj2Z0XtA5qqZEmtJtuXZRQi+j03eE= @@ -632,6 +634,8 @@ github.com/gophercloud/gophercloud v0.16.0/go.mod h1:wRtmUelyIIv3CSSDI47aUwbs075 github.com/gophercloud/utils v0.0.0-20210216074907-f6de111f2eae/go.mod h1:wx8HMD8oQD0Ryhz6+6ykq75PJ79iPyEqYHfwZ4l7OsA= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= +github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= +github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= github.com/gorilla/handlers v1.5.1 h1:9lRY6j8DEeeBT10CvO9hGW0gmky0BprnvDI5vfhUHH4= github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv/4g96P1Q= github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= @@ -876,6 +880,8 @@ github.com/mendsley/gojwk v0.0.0-20141217222730-4d5ec6e58103 h1:Z/i1e+gTZrmcGeZy github.com/mendsley/gojwk v0.0.0-20141217222730-4d5ec6e58103/go.mod h1:o9YPB5aGP8ob35Vy6+vyq3P3bWe7NQWzf+JLiXCiMaE= github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= +github.com/microcosm-cc/bluemonday v1.0.27 
h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk= +github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/miekg/dns v1.1.40/go.mod h1:KNUDUusw/aVsxyTYZM1oqvCicbwhgbNgztCETuNZ7xM= github.com/miekg/dns v1.1.57 h1:Jzi7ApEIzwEPLHWRcafCN9LZSBbqQpxjt/wpgvg7wcM= diff --git a/services/groupware/pkg/groupware/groupware_api_emails.go b/services/groupware/pkg/groupware/groupware_api_emails.go index 5576885aa..1ab735c2e 100644 --- a/services/groupware/pkg/groupware/groupware_api_emails.go +++ b/services/groupware/pkg/groupware/groupware_api_emails.go @@ -11,6 +11,7 @@ import ( "time" "github.com/go-chi/chi/v5" + "github.com/microcosm-cc/bluemonday" "github.com/rs/zerolog" "github.com/opencloud-eu/opencloud/pkg/jmap" @@ -77,12 +78,12 @@ func (g *Groupware) GetAllEmailsInMailbox(w http.ResponseWriter, r *http.Request logger := log.From(req.logger.With().Str(HeaderSince, log.SafeString(since)).Str(logAccountId, log.SafeString(accountId))) - emails, sessionState, lang, jerr := g.jmap.GetMailboxChanges(accountId, req.session, req.ctx, logger, req.language(), mailboxId, since, true, g.maxBodyValueBytes, maxChanges) + changes, sessionState, lang, jerr := g.jmap.GetMailboxChanges(accountId, req.session, req.ctx, logger, req.language(), mailboxId, since, true, g.maxBodyValueBytes, maxChanges) if jerr != nil { return req.errorResponseFromJmap(jerr) } - return etagResponse(emails, sessionState, emails.State, lang) + return etagResponse(changes, sessionState, changes.State, lang) }) } else { g.respond(w, r, func(req Request) Response { @@ -119,7 +120,15 @@ func (g *Groupware) GetAllEmailsInMailbox(w http.ResponseWriter, r *http.Request return req.errorResponseFromJmap(jerr) } - return etagResponse(emails, sessionState, emails.State, lang) + safe := jmap.Emails{ + Emails: g.sanitizeEmails(emails.Emails), + Total: emails.Total, + Limit: emails.Limit, + Offset: emails.Offset, + State: emails.State, + } + + return etagResponse(safe, sessionState, emails.State, lang) }) } } @@ -147,7 +156,7 @@ func (g *Groupware) GetEmailsById(w http.ResponseWriter, r *http.Request) { if len(emails.Emails) < 1 { return notFoundResponse(sessionState) } else { - return etagResponse(emails.Emails[0], sessionState, emails.State, lang) + return etagResponse(g.sanitizeEmail(emails.Emails[0]), sessionState, emails.State, lang) } } else { logger := log.From(l.Array("ids", log.SafeStringArray(ids))) @@ -158,7 +167,7 @@ func (g *Groupware) GetEmailsById(w http.ResponseWriter, r *http.Request) { if len(emails.Emails) < 1 { return notFoundResponse(sessionState) } else { - return etagResponse(emails.Emails, sessionState, emails.State, lang) + return etagResponse(g.sanitizeEmails(emails.Emails), sessionState, emails.State, lang) } } }) @@ -203,7 +212,7 @@ func (g *Groupware) GetEmailAttachments(w http.ResponseWriter, r *http.Request) if len(emails.Emails) < 1 { return notFoundResponse(sessionState) } - email := emails.Emails[0] + email := g.sanitizeEmail(emails.Emails[0]) return etagResponse(email.Attachments, sessionState, emails.State, lang) }) } else { @@ -229,7 +238,7 @@ func (g *Groupware) GetEmailAttachments(w http.ResponseWriter, r *http.Request) return nil } - email := emails.Emails[0] + email := g.sanitizeEmail(emails.Emails[0]) var attachment *jmap.EmailBodyPart = nil for _, part := range email.Attachments { if attachmentSelector(part) { @@ -302,12 +311,12 @@ func (g *Groupware) getEmailsSince(w 
http.ResponseWriter, r *http.Request, since logger := log.From(l) - emails, sessionState, lang, jerr := g.jmap.GetEmailsSince(accountId, req.session, req.ctx, logger, req.language(), since, true, g.maxBodyValueBytes, maxChanges) + changes, sessionState, lang, jerr := g.jmap.GetEmailsSince(accountId, req.session, req.ctx, logger, req.language(), since, true, g.maxBodyValueBytes, maxChanges) if jerr != nil { return req.errorResponseFromJmap(jerr) } - return etagResponse(emails, sessionState, emails.State, lang) + return etagResponse(changes, sessionState, changes.State, lang) }) } @@ -518,8 +527,6 @@ func (g *Groupware) searchEmails(w http.ResponseWriter, r *http.Request) { } logger = log.From(logger.With().Str(logAccountId, log.SafeString(accountId))) - g.jmap.QueryEmails([]string{accountId}, filter, req.session, req.ctx, logger, req.language(), offset, limit, fetchBodies, g.maxBodyValueBytes) - resultsByAccount, sessionState, lang, jerr := g.jmap.QueryEmailsWithSnippets([]string{accountId}, filter, req.session, req.ctx, logger, req.language(), offset, limit, fetchBodies, g.maxBodyValueBytes) if jerr != nil { return req.errorResponseFromJmap(jerr) @@ -542,7 +549,7 @@ func (g *Groupware) searchEmails(w http.ResponseWriter, r *http.Request) { } flattened[i] = EmailWithSnippets{ // AccountId: accountId, - Email: result.Email, + Email: g.sanitizeEmail(result.Email), Snippets: snippets, } } @@ -653,7 +660,7 @@ func (g *Groupware) GetEmailsForAllAccounts(w http.ResponseWriter, r *http.Reque }) flattened[i] = EmailWithSnippets{ AccountId: accountId, - Email: result.Email, + Email: g.sanitizeEmail(result.Email), Snippets: snippets, } } @@ -701,7 +708,7 @@ func (g *Groupware) GetEmailsForAllAccounts(w http.ResponseWriter, r *http.Reque i := 0 for _, list := range resultsByAccountId { for _, e := range list.Emails { - flattened[i] = e + flattened[i] = g.sanitizeEmail(e) i++ } } @@ -1196,7 +1203,7 @@ type AboutEmailResponse struct { Language jmap.Language `json:"lang"` } -func relatedEmails(email jmap.Email, beacon time.Time, days uint) jmap.EmailFilterElement { +func relatedEmailsFilter(email jmap.Email, beacon time.Time, days uint) jmap.EmailFilterElement { filters := []jmap.EmailFilterElement{} for _, from := range email.From { if from.Email != "" { @@ -1283,7 +1290,7 @@ func (g *Groupware) RelatedToEmail(w http.ResponseWriter, r *http.Request) { beacon := email.ReceivedAt // TODO configurable: either relative to when the email was received, or relative to now //beacon := time.Now() - filter := relatedEmails(email, beacon, days) + filter := relatedEmailsFilter(email, beacon, days) // bgctx, _ := context.WithTimeout(context.Background(), time.Duration(30)*time.Second) // TODO configurable bgctx := context.Background() @@ -1298,7 +1305,7 @@ func (g *Groupware) RelatedToEmail(w http.ResponseWriter, r *http.Request) { l.Error().Err(jerr).Msgf("failed to query %v emails", RelationTypeSameSender) } else { req.observe(g.metrics.EmailSameSenderDuration.WithLabelValues(req.session.JmapEndpoint), duration.Seconds()) - related := filterEmails(results.Emails, email) + related := g.sanitizeEmails(filterEmails(results.Emails, email)) l.Trace().Msgf("'%v' found %v other emails", RelationTypeSameSender, len(related)) if len(related) > 0 { req.push(RelationEntityEmail, AboutEmailsEvent{Id: reqId, Emails: related, Source: RelationTypeSameSender, Language: lang}) @@ -1316,7 +1323,7 @@ func (g *Groupware) RelatedToEmail(w http.ResponseWriter, r *http.Request) { l.Error().Err(jerr).Msgf("failed to list %v emails", 
RelationTypeSameThread) } else { req.observe(g.metrics.EmailSameThreadDuration.WithLabelValues(req.session.JmapEndpoint), duration.Seconds()) - related := filterEmails(emails, email) + related := g.sanitizeEmails(filterEmails(emails, email)) l.Trace().Msgf("'%v' found %v other emails", RelationTypeSameThread, len(related)) if len(related) > 0 { req.push(RelationEntityEmail, AboutEmailsEvent{Id: reqId, Emails: related, Source: RelationTypeSameThread, Language: lang}) @@ -1325,7 +1332,7 @@ func (g *Groupware) RelatedToEmail(w http.ResponseWriter, r *http.Request) { }) return etagResponse(AboutEmailResponse{ - Email: email, + Email: g.sanitizeEmail(email), RequestId: reqId, }, sessionState, emails.State, lang) }) @@ -1703,3 +1710,45 @@ func squashQueryState[V any](all map[string]V, mapper func(V) jmap.State) jmap.S } return jmap.State(strings.Join(parts, ",")) } + +var sanitizationPolicy *bluemonday.Policy = bluemonday.UGCPolicy() + +func (g *Groupware) sanitizeEmail(source jmap.Email) jmap.Email { + if !g.sanitize { + return source + } + memory := map[string]int{} + for _, ref := range []*[]jmap.EmailBodyPart{&source.HtmlBody, &source.TextBody} { + newBody := make([]jmap.EmailBodyPart, len(*ref)) + for i, p := range *ref { + if p.Type == "text/html" { + if already, done := memory[p.PartId]; !done { + if part, ok := source.BodyValues[p.PartId]; ok { + safe := sanitizationPolicy.Sanitize(part.Value) + part.Value = safe + source.BodyValues[p.PartId] = part + newLen := len(safe) + memory[p.PartId] = newLen + p.Size = newLen + } + } else { + p.Size = already + } + } + newBody[i] = p + } + *ref = newBody + } + return source +} + +func (g *Groupware) sanitizeEmails(source []jmap.Email) []jmap.Email { + if !g.sanitize { + return source + } + result := make([]jmap.Email, len(source)) + for i, email := range source { + result[i] = g.sanitizeEmail(email) + } + return result +} diff --git a/services/groupware/pkg/groupware/groupware_framework.go b/services/groupware/pkg/groupware/groupware_framework.go index 33613521e..33334257d 100644 --- a/services/groupware/pkg/groupware/groupware_framework.go +++ b/services/groupware/pkg/groupware/groupware_framework.go @@ -90,6 +90,7 @@ type Groupware struct { logger *log.Logger defaultEmailLimit uint maxBodyValueBytes uint + sanitize bool // Caches successful and failed Sessions by the username. 
sessionCache sessionCache jmap *jmap.Client @@ -192,6 +193,8 @@ func NewGroupware(config *config.Config, logger *log.Logger, mux *chi.Mux, prome insecureTls := true // TODO make configurable + sanitize := true // TODO make configurable + m := metrics.New(prometheusRegistry, logger) // TODO add timeouts and other meaningful configuration settings for the HTTP client @@ -339,6 +342,7 @@ func NewGroupware(config *config.Config, logger *log.Logger, mux *chi.Mux, prome jmap: &jmapClient, defaultEmailLimit: defaultEmailLimit, maxBodyValueBytes: maxBodyValueBytes, + sanitize: sanitize, eventChannel: eventChannel, jobsChannel: jobsChannel, jobCounter: atomic.Uint64{}, diff --git a/services/groupware/pkg/groupware/groupware_test.go b/services/groupware/pkg/groupware/groupware_test.go new file mode 100644 index 000000000..1017a1af3 --- /dev/null +++ b/services/groupware/pkg/groupware/groupware_test.go @@ -0,0 +1,34 @@ +package groupware + +import ( + "testing" + + "github.com/opencloud-eu/opencloud/pkg/jmap" + "github.com/stretchr/testify/require" +) + +func TestSanitizeEmail(t *testing.T) { + email := jmap.Email{ + Subject: "test", + BodyValues: map[string]jmap.EmailBodyValue{ + "koze92I1": { + Value: `Google`, + }, + }, + HtmlBody: []jmap.EmailBodyPart{ + { + PartId: "koze92I1", + Type: "text/html", + Size: 65, + }, + }, + } + + g := &Groupware{sanitize: true} + + safe := g.sanitizeEmail(email) + + require := require.New(t) + require.Equal(`Google`, safe.BodyValues["koze92I1"].Value) + require.Equal(57, safe.HtmlBody[0].Size) +} diff --git a/vendor/github.com/aymerick/douceur/LICENSE b/vendor/github.com/aymerick/douceur/LICENSE new file mode 100644 index 000000000..6ce87cd37 --- /dev/null +++ b/vendor/github.com/aymerick/douceur/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Aymerick JEHANNE + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
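For context: the test above exercises the same policy the patch wires into sanitizeEmail, bluemonday's UGCPolicy. Below is a minimal, self-contained sketch of that sanitization step. The input literal follows bluemonday's canonical documentation example and is an assumption, since the HTML literals in the test are not reproduced verbatim here; it is chosen because its 65-byte input and 57-byte output line up with the Size assertions in TestSanitizeEmail.

package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

func main() {
	// The same policy the patch stores in the package-level sanitizationPolicy.
	p := bluemonday.UGCPolicy()

	// 65 bytes of HTML carrying an event-handler attribute (assumed input).
	in := `<a onblur="alert(secret)" href="http://www.google.com">Google</a>`

	// UGCPolicy keeps user-generated markup (links, basic formatting) but
	// drops event-handler attributes and <script> elements, and adds
	// rel="nofollow" to links. The sanitized string is 57 bytes, which is
	// why sanitizeEmail also rewrites p.Size for each sanitized body part.
	fmt.Println(p.Sanitize(in))
	// => <a href="http://www.google.com" rel="nofollow">Google</a>
}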
+ diff --git a/vendor/github.com/aymerick/douceur/css/declaration.go b/vendor/github.com/aymerick/douceur/css/declaration.go new file mode 100644 index 000000000..61d29d335 --- /dev/null +++ b/vendor/github.com/aymerick/douceur/css/declaration.go @@ -0,0 +1,60 @@ +package css + +import "fmt" + +// Declaration represents a parsed style property +type Declaration struct { + Property string + Value string + Important bool +} + +// NewDeclaration instanciates a new Declaration +func NewDeclaration() *Declaration { + return &Declaration{} +} + +// Returns string representation of the Declaration +func (decl *Declaration) String() string { + return decl.StringWithImportant(true) +} + +// StringWithImportant returns string representation with optional !important part +func (decl *Declaration) StringWithImportant(option bool) string { + result := fmt.Sprintf("%s: %s", decl.Property, decl.Value) + + if option && decl.Important { + result += " !important" + } + + result += ";" + + return result +} + +// Equal returns true if both Declarations are equals +func (decl *Declaration) Equal(other *Declaration) bool { + return (decl.Property == other.Property) && (decl.Value == other.Value) && (decl.Important == other.Important) +} + +// +// DeclarationsByProperty +// + +// DeclarationsByProperty represents sortable style declarations +type DeclarationsByProperty []*Declaration + +// Implements sort.Interface +func (declarations DeclarationsByProperty) Len() int { + return len(declarations) +} + +// Implements sort.Interface +func (declarations DeclarationsByProperty) Swap(i, j int) { + declarations[i], declarations[j] = declarations[j], declarations[i] +} + +// Implements sort.Interface +func (declarations DeclarationsByProperty) Less(i, j int) bool { + return declarations[i].Property < declarations[j].Property +} diff --git a/vendor/github.com/aymerick/douceur/css/rule.go b/vendor/github.com/aymerick/douceur/css/rule.go new file mode 100644 index 000000000..b5a44b542 --- /dev/null +++ b/vendor/github.com/aymerick/douceur/css/rule.go @@ -0,0 +1,230 @@ +package css + +import ( + "fmt" + "strings" +) + +const ( + indentSpace = 2 +) + +// RuleKind represents a Rule kind +type RuleKind int + +// Rule kinds +const ( + QualifiedRule RuleKind = iota + AtRule +) + +// At Rules than have Rules inside their block instead of Declarations +var atRulesWithRulesBlock = []string{ + "@document", "@font-feature-values", "@keyframes", "@media", "@supports", +} + +// Rule represents a parsed CSS rule +type Rule struct { + Kind RuleKind + + // At Rule name (eg: "@media") + Name string + + // Raw prelude + Prelude string + + // Qualified Rule selectors parsed from prelude + Selectors []string + + // Style properties + Declarations []*Declaration + + // At Rule embedded rules + Rules []*Rule + + // Current rule embedding level + EmbedLevel int +} + +// NewRule instanciates a new Rule +func NewRule(kind RuleKind) *Rule { + return &Rule{ + Kind: kind, + } +} + +// Returns string representation of rule kind +func (kind RuleKind) String() string { + switch kind { + case QualifiedRule: + return "Qualified Rule" + case AtRule: + return "At Rule" + default: + return "WAT" + } +} + +// EmbedsRules returns true if this rule embeds another rules +func (rule *Rule) EmbedsRules() bool { + if rule.Kind == AtRule { + for _, atRuleName := range atRulesWithRulesBlock { + if rule.Name == atRuleName { + return true + } + } + } + + return false +} + +// Equal returns true if both rules are equals +func (rule *Rule) Equal(other *Rule) bool { + if 
(rule.Kind != other.Kind) || + (rule.Prelude != other.Prelude) || + (rule.Name != other.Name) { + return false + } + + if (len(rule.Selectors) != len(other.Selectors)) || + (len(rule.Declarations) != len(other.Declarations)) || + (len(rule.Rules) != len(other.Rules)) { + return false + } + + for i, sel := range rule.Selectors { + if sel != other.Selectors[i] { + return false + } + } + + for i, decl := range rule.Declarations { + if !decl.Equal(other.Declarations[i]) { + return false + } + } + + for i, rule := range rule.Rules { + if !rule.Equal(other.Rules[i]) { + return false + } + } + + return true +} + +// Diff returns a string representation of rules differences +func (rule *Rule) Diff(other *Rule) []string { + result := []string{} + + if rule.Kind != other.Kind { + result = append(result, fmt.Sprintf("Kind: %s | %s", rule.Kind.String(), other.Kind.String())) + } + + if rule.Prelude != other.Prelude { + result = append(result, fmt.Sprintf("Prelude: \"%s\" | \"%s\"", rule.Prelude, other.Prelude)) + } + + if rule.Name != other.Name { + result = append(result, fmt.Sprintf("Name: \"%s\" | \"%s\"", rule.Name, other.Name)) + } + + if len(rule.Selectors) != len(other.Selectors) { + result = append(result, fmt.Sprintf("Selectors: %v | %v", strings.Join(rule.Selectors, ", "), strings.Join(other.Selectors, ", "))) + } else { + for i, sel := range rule.Selectors { + if sel != other.Selectors[i] { + result = append(result, fmt.Sprintf("Selector: \"%s\" | \"%s\"", sel, other.Selectors[i])) + } + } + } + + if len(rule.Declarations) != len(other.Declarations) { + result = append(result, fmt.Sprintf("Declarations Nb: %d | %d", len(rule.Declarations), len(other.Declarations))) + } else { + for i, decl := range rule.Declarations { + if !decl.Equal(other.Declarations[i]) { + result = append(result, fmt.Sprintf("Declaration: \"%s\" | \"%s\"", decl.String(), other.Declarations[i].String())) + } + } + } + + if len(rule.Rules) != len(other.Rules) { + result = append(result, fmt.Sprintf("Rules Nb: %d | %d", len(rule.Rules), len(other.Rules))) + } else { + + for i, rule := range rule.Rules { + if !rule.Equal(other.Rules[i]) { + result = append(result, fmt.Sprintf("Rule: \"%s\" | \"%s\"", rule.String(), other.Rules[i].String())) + } + } + } + + return result +} + +// Returns the string representation of a rule +func (rule *Rule) String() string { + result := "" + + if rule.Kind == QualifiedRule { + for i, sel := range rule.Selectors { + if i != 0 { + result += ", " + } + result += sel + } + } else { + // AtRule + result += fmt.Sprintf("%s", rule.Name) + + if rule.Prelude != "" { + if result != "" { + result += " " + } + result += fmt.Sprintf("%s", rule.Prelude) + } + } + + if (len(rule.Declarations) == 0) && (len(rule.Rules) == 0) { + result += ";" + } else { + result += " {\n" + + if rule.EmbedsRules() { + for _, subRule := range rule.Rules { + result += fmt.Sprintf("%s%s\n", rule.indent(), subRule.String()) + } + } else { + for _, decl := range rule.Declarations { + result += fmt.Sprintf("%s%s\n", rule.indent(), decl.String()) + } + } + + result += fmt.Sprintf("%s}", rule.indentEndBlock()) + } + + return result +} + +// Returns identation spaces for declarations and rules +func (rule *Rule) indent() string { + result := "" + + for i := 0; i < ((rule.EmbedLevel + 1) * indentSpace); i++ { + result += " " + } + + return result +} + +// Returns identation spaces for end of block character +func (rule *Rule) indentEndBlock() string { + result := "" + + for i := 0; i < (rule.EmbedLevel * indentSpace); i++ { + 
result += " " + } + + return result +} diff --git a/vendor/github.com/aymerick/douceur/css/stylesheet.go b/vendor/github.com/aymerick/douceur/css/stylesheet.go new file mode 100644 index 000000000..6b32c2ec9 --- /dev/null +++ b/vendor/github.com/aymerick/douceur/css/stylesheet.go @@ -0,0 +1,25 @@ +package css + +// Stylesheet represents a parsed stylesheet +type Stylesheet struct { + Rules []*Rule +} + +// NewStylesheet instanciate a new Stylesheet +func NewStylesheet() *Stylesheet { + return &Stylesheet{} +} + +// Returns string representation of the Stylesheet +func (sheet *Stylesheet) String() string { + result := "" + + for _, rule := range sheet.Rules { + if result != "" { + result += "\n" + } + result += rule.String() + } + + return result +} diff --git a/vendor/github.com/aymerick/douceur/parser/parser.go b/vendor/github.com/aymerick/douceur/parser/parser.go new file mode 100644 index 000000000..6c4917ccf --- /dev/null +++ b/vendor/github.com/aymerick/douceur/parser/parser.go @@ -0,0 +1,409 @@ +package parser + +import ( + "errors" + "fmt" + "regexp" + "strings" + + "github.com/gorilla/css/scanner" + + "github.com/aymerick/douceur/css" +) + +const ( + importantSuffixRegexp = `(?i)\s*!important\s*$` +) + +var ( + importantRegexp *regexp.Regexp +) + +// Parser represents a CSS parser +type Parser struct { + scan *scanner.Scanner // Tokenizer + + // Tokens parsed but not consumed yet + tokens []*scanner.Token + + // Rule embedding level + embedLevel int +} + +func init() { + importantRegexp = regexp.MustCompile(importantSuffixRegexp) +} + +// NewParser instanciates a new parser +func NewParser(txt string) *Parser { + return &Parser{ + scan: scanner.New(txt), + } +} + +// Parse parses a whole stylesheet +func Parse(text string) (*css.Stylesheet, error) { + result, err := NewParser(text).ParseStylesheet() + if err != nil { + return nil, err + } + + return result, nil +} + +// ParseDeclarations parses CSS declarations +func ParseDeclarations(text string) ([]*css.Declaration, error) { + result, err := NewParser(text).ParseDeclarations() + if err != nil { + return nil, err + } + + return result, nil +} + +// ParseStylesheet parses a stylesheet +func (parser *Parser) ParseStylesheet() (*css.Stylesheet, error) { + result := css.NewStylesheet() + + // Parse BOM + if _, err := parser.parseBOM(); err != nil { + return result, err + } + + // Parse list of rules + rules, err := parser.ParseRules() + if err != nil { + return result, err + } + + result.Rules = rules + + return result, nil +} + +// ParseRules parses a list of rules +func (parser *Parser) ParseRules() ([]*css.Rule, error) { + result := []*css.Rule{} + + inBlock := false + if parser.tokenChar("{") { + // parsing a block of rules + inBlock = true + parser.embedLevel++ + + parser.shiftToken() + } + + for parser.tokenParsable() { + if parser.tokenIgnorable() { + parser.shiftToken() + } else if parser.tokenChar("}") { + if !inBlock { + errMsg := fmt.Sprintf("Unexpected } character: %s", parser.nextToken().String()) + return result, errors.New(errMsg) + } + + parser.shiftToken() + parser.embedLevel-- + + // finished + break + } else { + rule, err := parser.ParseRule() + if err != nil { + return result, err + } + + rule.EmbedLevel = parser.embedLevel + result = append(result, rule) + } + } + + return result, parser.err() +} + +// ParseRule parses a rule +func (parser *Parser) ParseRule() (*css.Rule, error) { + if parser.tokenAtKeyword() { + return parser.parseAtRule() + } + + return parser.parseQualifiedRule() +} + +// ParseDeclarations 
parses a list of declarations +func (parser *Parser) ParseDeclarations() ([]*css.Declaration, error) { + result := []*css.Declaration{} + + if parser.tokenChar("{") { + parser.shiftToken() + } + + for parser.tokenParsable() { + if parser.tokenIgnorable() { + parser.shiftToken() + } else if parser.tokenChar("}") { + // end of block + parser.shiftToken() + break + } else { + declaration, err := parser.ParseDeclaration() + if err != nil { + return result, err + } + + result = append(result, declaration) + } + } + + return result, parser.err() +} + +// ParseDeclaration parses a declaration +func (parser *Parser) ParseDeclaration() (*css.Declaration, error) { + result := css.NewDeclaration() + curValue := "" + + for parser.tokenParsable() { + if parser.tokenChar(":") { + result.Property = strings.TrimSpace(curValue) + curValue = "" + + parser.shiftToken() + } else if parser.tokenChar(";") || parser.tokenChar("}") { + if result.Property == "" { + errMsg := fmt.Sprintf("Unexpected ; character: %s", parser.nextToken().String()) + return result, errors.New(errMsg) + } + + if importantRegexp.MatchString(curValue) { + result.Important = true + curValue = importantRegexp.ReplaceAllString(curValue, "") + } + + result.Value = strings.TrimSpace(curValue) + + if parser.tokenChar(";") { + parser.shiftToken() + } + + // finished + break + } else { + token := parser.shiftToken() + curValue += token.Value + } + } + + // log.Printf("[parsed] Declaration: %s", result.String()) + + return result, parser.err() +} + +// Parse an At Rule +func (parser *Parser) parseAtRule() (*css.Rule, error) { + // parse rule name (eg: "@import") + token := parser.shiftToken() + + result := css.NewRule(css.AtRule) + result.Name = token.Value + + for parser.tokenParsable() { + if parser.tokenChar(";") { + parser.shiftToken() + + // finished + break + } else if parser.tokenChar("{") { + if result.EmbedsRules() { + // parse rules block + rules, err := parser.ParseRules() + if err != nil { + return result, err + } + + result.Rules = rules + } else { + // parse declarations block + declarations, err := parser.ParseDeclarations() + if err != nil { + return result, err + } + + result.Declarations = declarations + } + + // finished + break + } else { + // parse prelude + prelude, err := parser.parsePrelude() + if err != nil { + return result, err + } + + result.Prelude = prelude + } + } + + // log.Printf("[parsed] Rule: %s", result.String()) + + return result, parser.err() +} + +// Parse a Qualified Rule +func (parser *Parser) parseQualifiedRule() (*css.Rule, error) { + result := css.NewRule(css.QualifiedRule) + + for parser.tokenParsable() { + if parser.tokenChar("{") { + if result.Prelude == "" { + errMsg := fmt.Sprintf("Unexpected { character: %s", parser.nextToken().String()) + return result, errors.New(errMsg) + } + + // parse declarations block + declarations, err := parser.ParseDeclarations() + if err != nil { + return result, err + } + + result.Declarations = declarations + + // finished + break + } else { + // parse prelude + prelude, err := parser.parsePrelude() + if err != nil { + return result, err + } + + result.Prelude = prelude + } + } + + result.Selectors = strings.Split(result.Prelude, ",") + for i, sel := range result.Selectors { + result.Selectors[i] = strings.TrimSpace(sel) + } + + // log.Printf("[parsed] Rule: %s", result.String()) + + return result, parser.err() +} + +// Parse Rule prelude +func (parser *Parser) parsePrelude() (string, error) { + result := "" + + for parser.tokenParsable() && 
!parser.tokenEndOfPrelude() { + token := parser.shiftToken() + result += token.Value + } + + result = strings.TrimSpace(result) + + // log.Printf("[parsed] prelude: %s", result) + + return result, parser.err() +} + +// Parse BOM +func (parser *Parser) parseBOM() (bool, error) { + if parser.nextToken().Type == scanner.TokenBOM { + parser.shiftToken() + return true, nil + } + + return false, parser.err() +} + +// Returns next token without removing it from tokens buffer +func (parser *Parser) nextToken() *scanner.Token { + if len(parser.tokens) == 0 { + // fetch next token + nextToken := parser.scan.Next() + + // log.Printf("[token] %s => %v", nextToken.Type.String(), nextToken.Value) + + // queue it + parser.tokens = append(parser.tokens, nextToken) + } + + return parser.tokens[0] +} + +// Returns next token and remove it from the tokens buffer +func (parser *Parser) shiftToken() *scanner.Token { + var result *scanner.Token + + result, parser.tokens = parser.tokens[0], parser.tokens[1:] + return result +} + +// Returns tokenizer error, or nil if no error +func (parser *Parser) err() error { + if parser.tokenError() { + token := parser.nextToken() + return fmt.Errorf("Tokenizer error: %s", token.String()) + } + + return nil +} + +// Returns true if next token is Error +func (parser *Parser) tokenError() bool { + return parser.nextToken().Type == scanner.TokenError +} + +// Returns true if next token is EOF +func (parser *Parser) tokenEOF() bool { + return parser.nextToken().Type == scanner.TokenEOF +} + +// Returns true if next token is a whitespace +func (parser *Parser) tokenWS() bool { + return parser.nextToken().Type == scanner.TokenS +} + +// Returns true if next token is a comment +func (parser *Parser) tokenComment() bool { + return parser.nextToken().Type == scanner.TokenComment +} + +// Returns true if next token is a CDO or a CDC +func (parser *Parser) tokenCDOorCDC() bool { + switch parser.nextToken().Type { + case scanner.TokenCDO, scanner.TokenCDC: + return true + default: + return false + } +} + +// Returns true if next token is ignorable +func (parser *Parser) tokenIgnorable() bool { + return parser.tokenWS() || parser.tokenComment() || parser.tokenCDOorCDC() +} + +// Returns true if next token is parsable +func (parser *Parser) tokenParsable() bool { + return !parser.tokenEOF() && !parser.tokenError() +} + +// Returns true if next token is an At Rule keyword +func (parser *Parser) tokenAtKeyword() bool { + return parser.nextToken().Type == scanner.TokenAtKeyword +} + +// Returns true if next token is given character +func (parser *Parser) tokenChar(value string) bool { + token := parser.nextToken() + return (token.Type == scanner.TokenChar) && (token.Value == value) +} + +// Returns true if next token marks the end of a prelude +func (parser *Parser) tokenEndOfPrelude() bool { + return parser.tokenChar(";") || parser.tokenChar("{") +} diff --git a/vendor/github.com/gorilla/css/LICENSE b/vendor/github.com/gorilla/css/LICENSE new file mode 100644 index 000000000..ee0d53cef --- /dev/null +++ b/vendor/github.com/gorilla/css/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2023 The Gorilla Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/vendor/github.com/gorilla/css/scanner/doc.go b/vendor/github.com/gorilla/css/scanner/doc.go new file mode 100644 index 000000000..f19850e15 --- /dev/null +++ b/vendor/github.com/gorilla/css/scanner/doc.go @@ -0,0 +1,33 @@ +// Copyright 2012 The Gorilla Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package gorilla/css/scanner generates tokens for a CSS3 input. + +It follows the CSS3 specification located at: + + http://www.w3.org/TR/css3-syntax/ + +To use it, create a new scanner for a given CSS string and call Next() until +the token returned has type TokenEOF or TokenError: + + s := scanner.New(myCSS) + for { + token := s.Next() + if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError { + break + } + // Do something with the token... + } + +Following the CSS3 specification, an error can only occur when the scanner +finds an unclosed quote or unclosed comment. In these cases the text becomes +"untokenizable". Everything else is tokenizable and it is up to a parser +to make sense of the token stream (or ignore nonsensical token sequences). + +Note: the scanner doesn't perform lexical analysis or, in other words, it +doesn't care about the token context. It is intended to be used by a +lexer or parser. +*/ +package scanner diff --git a/vendor/github.com/gorilla/css/scanner/scanner.go b/vendor/github.com/gorilla/css/scanner/scanner.go new file mode 100644 index 000000000..25a7c6576 --- /dev/null +++ b/vendor/github.com/gorilla/css/scanner/scanner.go @@ -0,0 +1,360 @@ +// Copyright 2012 The Gorilla Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scanner + +import ( + "fmt" + "regexp" + "strings" + "unicode" + "unicode/utf8" +) + +// tokenType identifies the type of lexical tokens. +type tokenType int + +// String returns a string representation of the token type. +func (t tokenType) String() string { + return tokenNames[t] +} + +// Token represents a token and the corresponding string. +type Token struct { + Type tokenType + Value string + Line int + Column int +} + +// String returns a string representation of the token. 
+func (t *Token) String() string { + if len(t.Value) > 10 { + return fmt.Sprintf("%s (line: %d, column: %d): %.10q...", + t.Type, t.Line, t.Column, t.Value) + } + return fmt.Sprintf("%s (line: %d, column: %d): %q", + t.Type, t.Line, t.Column, t.Value) +} + +// All tokens ----------------------------------------------------------------- + +// The complete list of tokens in CSS3. +const ( + // Scanner flags. + TokenError tokenType = iota + TokenEOF + // From now on, only tokens from the CSS specification. + TokenIdent + TokenAtKeyword + TokenString + TokenHash + TokenNumber + TokenPercentage + TokenDimension + TokenURI + TokenUnicodeRange + TokenCDO + TokenCDC + TokenS + TokenComment + TokenFunction + TokenIncludes + TokenDashMatch + TokenPrefixMatch + TokenSuffixMatch + TokenSubstringMatch + TokenChar + TokenBOM +) + +// tokenNames maps tokenType's to their names. Used for conversion to string. +var tokenNames = map[tokenType]string{ + TokenError: "error", + TokenEOF: "EOF", + TokenIdent: "IDENT", + TokenAtKeyword: "ATKEYWORD", + TokenString: "STRING", + TokenHash: "HASH", + TokenNumber: "NUMBER", + TokenPercentage: "PERCENTAGE", + TokenDimension: "DIMENSION", + TokenURI: "URI", + TokenUnicodeRange: "UNICODE-RANGE", + TokenCDO: "CDO", + TokenCDC: "CDC", + TokenS: "S", + TokenComment: "COMMENT", + TokenFunction: "FUNCTION", + TokenIncludes: "INCLUDES", + TokenDashMatch: "DASHMATCH", + TokenPrefixMatch: "PREFIXMATCH", + TokenSuffixMatch: "SUFFIXMATCH", + TokenSubstringMatch: "SUBSTRINGMATCH", + TokenChar: "CHAR", + TokenBOM: "BOM", +} + +// Macros and productions ----------------------------------------------------- +// http://www.w3.org/TR/css3-syntax/#tokenization + +var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`) + +// macros maps macro names to patterns to be expanded. +var macros = map[string]string{ + // must be escaped: `\.+*?()|[]{}^$` + "ident": `-?{nmstart}{nmchar}*`, + "name": `{nmchar}+`, + "nmstart": `[a-zA-Z_]|{nonascii}|{escape}`, + "nonascii": "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", + "unicode": `\\[0-9a-fA-F]{1,6}{wc}?`, + "escape": "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", + "nmchar": `[a-zA-Z0-9_-]|{nonascii}|{escape}`, + "num": `[0-9]*\.[0-9]+|[0-9]+`, + "string": `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`, + "stringchar": `{urlchar}|[ ]|\\{nl}`, + "nl": `[\n\r\f]|\r\n`, + "w": `{wc}*`, + "wc": `[\t\n\f\r ]`, + + // urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}] + // ASCII characters range = `[\u0020-\u007e]` + // Skip space \u0020 = `[\u0021-\u007e]` + // Skip quotation mark \0022 = `[\u0021\u0023-\u007e]` + // Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]` + // Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d\u007e]` + // Finally, the left square bracket (\u005b) and right (\u005d) needs escaping themselves + "urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}", +} + +// productions maps the list of tokens to patterns to be expanded. +var productions = map[tokenType]string{ + // Unused regexps (matched using other methods) are commented out. 
+ TokenIdent: `{ident}`, + TokenAtKeyword: `@{ident}`, + TokenString: `{string}`, + TokenHash: `#{name}`, + TokenNumber: `{num}`, + TokenPercentage: `{num}%`, + TokenDimension: `{num}{ident}`, + TokenURI: `url\({w}(?:{string}|{urlchar}*?){w}\)`, + TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`, + //TokenCDO: ``, + TokenS: `{wc}+`, + TokenComment: `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`, + TokenFunction: `{ident}\(`, + //TokenIncludes: `~=`, + //TokenDashMatch: `\|=`, + //TokenPrefixMatch: `\^=`, + //TokenSuffixMatch: `\$=`, + //TokenSubstringMatch: `\*=`, + //TokenChar: `[^"']`, + //TokenBOM: "\uFEFF", +} + +// matchers maps the list of tokens to compiled regular expressions. +// +// The map is filled on init() using the macros and productions defined in +// the CSS specification. +var matchers = map[tokenType]*regexp.Regexp{} + +// matchOrder is the order to test regexps when first-char shortcuts +// can't be used. +var matchOrder = []tokenType{ + TokenURI, + TokenFunction, + TokenUnicodeRange, + TokenIdent, + TokenDimension, + TokenPercentage, + TokenNumber, + TokenCDC, +} + +func init() { + // replace macros and compile regexps for productions. + replaceMacro := func(s string) string { + return "(?:" + macros[s[1:len(s)-1]] + ")" + } + for t, s := range productions { + for macroRegexp.MatchString(s) { + s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro) + } + matchers[t] = regexp.MustCompile("^(?:" + s + ")") + } +} + +// Scanner -------------------------------------------------------------------- + +// New returns a new CSS scanner for the given input. +func New(input string) *Scanner { + // Normalize newlines. + // https://www.w3.org/TR/css-syntax-3/#input-preprocessing + input = strings.Replace(input, "\r\n", "\n", -1) + input = strings.Replace(input, "\r", "\n", -1) + input = strings.Replace(input, "\f", "\n", -1) + input = strings.Replace(input, "\u0000", "\ufffd", -1) + return &Scanner{ + input: input, + row: 1, + col: 1, + } +} + +// Scanner scans an input and emits tokens following the CSS3 specification. +type Scanner struct { + input string + pos int + row int + col int + err *Token +} + +// Next returns the next token from the input. +// +// At the end of the input the token type is TokenEOF. +// +// If the input can't be tokenized the token type is TokenError. This occurs +// in case of unclosed quotation marks or comments. +func (s *Scanner) Next() *Token { + if s.err != nil { + return s.err + } + if s.pos >= len(s.input) { + s.err = &Token{TokenEOF, "", s.row, s.col} + return s.err + } + if s.pos == 0 { + // Test BOM only once, at the beginning of the file. + if strings.HasPrefix(s.input, "\uFEFF") { + return s.emitSimple(TokenBOM, "\uFEFF") + } + } + // There's a lot we can guess based on the first byte so we'll take a + // shortcut before testing multiple regexps. + input := s.input[s.pos:] + switch input[0] { + case '\t', '\n', ' ': + // Whitespace. + return s.emitToken(TokenS, matchers[TokenS].FindString(input)) + case '.': + // Dot is too common to not have a quick check. + // We'll test if this is a Char; if it is followed by a number it is a + // dimension/percentage/number, and this will be matched later. + if len(input) > 1 && !unicode.IsDigit(rune(input[1])) { + return s.emitSimple(TokenChar, ".") + } + case '#': + // Another common one: Hash or Char. 
+ if match := matchers[TokenHash].FindString(input); match != "" { + return s.emitToken(TokenHash, match) + } + return s.emitSimple(TokenChar, "#") + case '@': + // Another common one: AtKeyword or Char. + if match := matchers[TokenAtKeyword].FindString(input); match != "" { + return s.emitSimple(TokenAtKeyword, match) + } + return s.emitSimple(TokenChar, "@") + case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}': + // More common chars. + return s.emitSimple(TokenChar, string(input[0])) + case '"', '\'': + // String or error. + match := matchers[TokenString].FindString(input) + if match != "" { + return s.emitToken(TokenString, match) + } + + s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} + return s.err + case '/': + // Comment, error or Char. + if len(input) > 1 && input[1] == '*' { + match := matchers[TokenComment].FindString(input) + if match != "" { + return s.emitToken(TokenComment, match) + } else { + s.err = &Token{TokenError, "unclosed comment", s.row, s.col} + return s.err + } + } + return s.emitSimple(TokenChar, "/") + case '~': + // Includes or Char. + return s.emitPrefixOrChar(TokenIncludes, "~=") + case '|': + // DashMatch or Char. + return s.emitPrefixOrChar(TokenDashMatch, "|=") + case '^': + // PrefixMatch or Char. + return s.emitPrefixOrChar(TokenPrefixMatch, "^=") + case '$': + // SuffixMatch or Char. + return s.emitPrefixOrChar(TokenSuffixMatch, "$=") + case '*': + // SubstringMatch or Char. + return s.emitPrefixOrChar(TokenSubstringMatch, "*=") + case '<': + // CDO or Char. + return s.emitPrefixOrChar(TokenCDO, " which includes the use of that to permit +// conditionals as per https://docs.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/compatibility/ms537512(v=vs.85)?redirectedfrom=MSDN +// +// What is not permitted are CDATA XML comments, as the x/net/html package we depend +// on does not handle this fully and we are not choosing to take on that work: +// https://pkg.go.dev/golang.org/x/net/html#Tokenizer.AllowCDATA . If the x/net/html +// package changes this then these will be considered, otherwise if you AllowComments +// but provide a CDATA comment, then as per the documentation in x/net/html this will +// be treated as a plain HTML comment. +func (p *Policy) AllowComments() { + p.allowComments = true +} + +// AllowNoAttrs says that attributes on element are optional. +// +// The attribute policy is only added to the core policy when OnElements(...) +// are called. +func (p *Policy) AllowNoAttrs() *attrPolicyBuilder { + + p.init() + + abp := attrPolicyBuilder{ + p: p, + allowEmpty: true, + } + return &abp +} + +// AllowNoAttrs says that attributes on element are optional. +// +// The attribute policy is only added to the core policy when OnElements(...) +// are called. +func (abp *attrPolicyBuilder) AllowNoAttrs() *attrPolicyBuilder { + + abp.allowEmpty = true + + return abp +} + +// Matching allows a regular expression to be applied to a nascent attribute +// policy, and returns the attribute policy. 
+func (abp *attrPolicyBuilder) Matching(regex *regexp.Regexp) *attrPolicyBuilder { + + abp.regexp = regex + + return abp +} + +// OnElements will bind an attribute policy to a given range of HTML elements +// and return the updated policy +func (abp *attrPolicyBuilder) OnElements(elements ...string) *Policy { + + for _, element := range elements { + element = strings.ToLower(element) + + for _, attr := range abp.attrNames { + + if _, ok := abp.p.elsAndAttrs[element]; !ok { + abp.p.elsAndAttrs[element] = make(map[string][]attrPolicy) + } + + ap := attrPolicy{} + if abp.regexp != nil { + ap.regexp = abp.regexp + } + + abp.p.elsAndAttrs[element][attr] = append(abp.p.elsAndAttrs[element][attr], ap) + } + + if abp.allowEmpty { + abp.p.setOfElementsAllowedWithoutAttrs[element] = struct{}{} + + if _, ok := abp.p.elsAndAttrs[element]; !ok { + abp.p.elsAndAttrs[element] = make(map[string][]attrPolicy) + } + } + } + + return abp.p +} + +// OnElementsMatching will bind an attribute policy to all elements matching a given regex +// and return the updated policy +func (abp *attrPolicyBuilder) OnElementsMatching(regex *regexp.Regexp) *Policy { + for _, attr := range abp.attrNames { + if _, ok := abp.p.elsMatchingAndAttrs[regex]; !ok { + abp.p.elsMatchingAndAttrs[regex] = make(map[string][]attrPolicy) + } + ap := attrPolicy{} + if abp.regexp != nil { + ap.regexp = abp.regexp + } + abp.p.elsMatchingAndAttrs[regex][attr] = append(abp.p.elsMatchingAndAttrs[regex][attr], ap) + } + + if abp.allowEmpty { + abp.p.setOfElementsMatchingAllowedWithoutAttrs = append(abp.p.setOfElementsMatchingAllowedWithoutAttrs, regex) + if _, ok := abp.p.elsMatchingAndAttrs[regex]; !ok { + abp.p.elsMatchingAndAttrs[regex] = make(map[string][]attrPolicy) + } + } + + return abp.p +} + +// Globally will bind an attribute policy to all HTML elements and return the +// updated policy +func (abp *attrPolicyBuilder) Globally() *Policy { + + for _, attr := range abp.attrNames { + if _, ok := abp.p.globalAttrs[attr]; !ok { + abp.p.globalAttrs[attr] = []attrPolicy{} + } + + ap := attrPolicy{} + if abp.regexp != nil { + ap.regexp = abp.regexp + } + + abp.p.globalAttrs[attr] = append(abp.p.globalAttrs[attr], ap) + } + + return abp.p +} + +// AllowStyles takes a range of CSS property names and returns a +// style policy builder that allows you to specify the pattern and scope of +// the allowed property. +// +// The style policy is only added to the core policy when either Globally() +// or OnElements(...) are called. +func (p *Policy) AllowStyles(propertyNames ...string) *stylePolicyBuilder { + + p.init() + + abp := stylePolicyBuilder{ + p: p, + } + + for _, propertyName := range propertyNames { + abp.propertyNames = append(abp.propertyNames, strings.ToLower(propertyName)) + } + + return &abp +} + +// Matching allows a regular expression to be applied to a nascent style +// policy, and returns the style policy. +func (spb *stylePolicyBuilder) Matching(regex *regexp.Regexp) *stylePolicyBuilder { + + spb.regexp = regex + + return spb +} + +// MatchingEnum allows a list of allowed values to be applied to a nascent style +// policy, and returns the style policy. +func (spb *stylePolicyBuilder) MatchingEnum(enum ...string) *stylePolicyBuilder { + + spb.enum = enum + + return spb +} + +// MatchingHandler allows a handler to be applied to a nascent style +// policy, and returns the style policy. 
+func (spb *stylePolicyBuilder) MatchingHandler(handler func(string) bool) *stylePolicyBuilder { + + spb.handler = handler + + return spb +} + +// OnElements will bind a style policy to a given range of HTML elements +// and return the updated policy +func (spb *stylePolicyBuilder) OnElements(elements ...string) *Policy { + + for _, element := range elements { + element = strings.ToLower(element) + + for _, attr := range spb.propertyNames { + + if _, ok := spb.p.elsAndStyles[element]; !ok { + spb.p.elsAndStyles[element] = make(map[string][]stylePolicy) + } + + sp := stylePolicy{} + if spb.handler != nil { + sp.handler = spb.handler + } else if len(spb.enum) > 0 { + sp.enum = spb.enum + } else if spb.regexp != nil { + sp.regexp = spb.regexp + } else { + sp.handler = css.GetDefaultHandler(attr) + } + spb.p.elsAndStyles[element][attr] = append(spb.p.elsAndStyles[element][attr], sp) + } + } + + return spb.p +} + +// OnElementsMatching will bind a style policy to any HTML elements matching the pattern +// and return the updated policy +func (spb *stylePolicyBuilder) OnElementsMatching(regex *regexp.Regexp) *Policy { + + for _, attr := range spb.propertyNames { + + if _, ok := spb.p.elsMatchingAndStyles[regex]; !ok { + spb.p.elsMatchingAndStyles[regex] = make(map[string][]stylePolicy) + } + + sp := stylePolicy{} + if spb.handler != nil { + sp.handler = spb.handler + } else if len(spb.enum) > 0 { + sp.enum = spb.enum + } else if spb.regexp != nil { + sp.regexp = spb.regexp + } else { + sp.handler = css.GetDefaultHandler(attr) + } + spb.p.elsMatchingAndStyles[regex][attr] = append(spb.p.elsMatchingAndStyles[regex][attr], sp) + } + + return spb.p +} + +// Globally will bind a style policy to all HTML elements and return the +// updated policy +func (spb *stylePolicyBuilder) Globally() *Policy { + + for _, attr := range spb.propertyNames { + if _, ok := spb.p.globalStyles[attr]; !ok { + spb.p.globalStyles[attr] = []stylePolicy{} + } + + // Use only one strategy for validating styles, fallback to default + sp := stylePolicy{} + if spb.handler != nil { + sp.handler = spb.handler + } else if len(spb.enum) > 0 { + sp.enum = spb.enum + } else if spb.regexp != nil { + sp.regexp = spb.regexp + } else { + sp.handler = css.GetDefaultHandler(attr) + } + spb.p.globalStyles[attr] = append(spb.p.globalStyles[attr], sp) + } + + return spb.p +} + +// AllowElements will append HTML elements to the allowlist without applying an +// attribute policy to those elements (the elements are permitted +// sans-attributes) +func (p *Policy) AllowElements(names ...string) *Policy { + p.init() + + for _, element := range names { + element = strings.ToLower(element) + + if _, ok := p.elsAndAttrs[element]; !ok { + p.elsAndAttrs[element] = make(map[string][]attrPolicy) + } + } + + return p +} + +// AllowElementsMatching will append HTML elements to the allowlist if they +// match a regexp. +func (p *Policy) AllowElementsMatching(regex *regexp.Regexp) *Policy { + p.init() + if _, ok := p.elsMatchingAndAttrs[regex]; !ok { + p.elsMatchingAndAttrs[regex] = make(map[string][]attrPolicy) + } + return p +} + +// AllowURLSchemesMatching will append URL schemes to the allowlist if they +// match a regexp. +func (p *Policy) AllowURLSchemesMatching(r *regexp.Regexp) *Policy { + p.allowURLSchemeRegexps = append(p.allowURLSchemeRegexps, r) + return p +} + +// RewriteSrc will rewrite the src attribute of a resource downloading tag +// (e.g. , tag. 
+func (p *Policy) addDefaultSkipElementContent() { + p.init() + + p.setOfElementsToSkipContent["frame"] = struct{}{} + p.setOfElementsToSkipContent["frameset"] = struct{}{} + p.setOfElementsToSkipContent["iframe"] = struct{}{} + p.setOfElementsToSkipContent["noembed"] = struct{}{} + p.setOfElementsToSkipContent["noframes"] = struct{}{} + p.setOfElementsToSkipContent["noscript"] = struct{}{} + p.setOfElementsToSkipContent["nostyle"] = struct{}{} + p.setOfElementsToSkipContent["object"] = struct{}{} + p.setOfElementsToSkipContent["script"] = struct{}{} + p.setOfElementsToSkipContent["style"] = struct{}{} + p.setOfElementsToSkipContent["title"] = struct{}{} +} diff --git a/vendor/github.com/microcosm-cc/bluemonday/sanitize.go b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go new file mode 100644 index 000000000..47c31f7da --- /dev/null +++ b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go @@ -0,0 +1,1096 @@ +// Copyright (c) 2014, David Kitchen +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the organisation (Microcosm) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package bluemonday + +import ( + "bytes" + "fmt" + "io" + "net/url" + "regexp" + "strconv" + "strings" + + "golang.org/x/net/html" + + "github.com/aymerick/douceur/parser" +) + +var ( + dataAttribute = regexp.MustCompile("^data-.+") + dataAttributeXMLPrefix = regexp.MustCompile("^xml.+") + dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+") + cssUnicodeChar = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`) + dataURIbase64Prefix = regexp.MustCompile(`^data:[^,]*;base64,`) +) + +// Sanitize takes a string that contains a HTML fragment or document and applies +// the given policy allowlist. 
+// +// It returns a HTML string that has been sanitized by the policy or an empty +// string if an error has occurred (most likely as a consequence of extremely +// malformed input) +func (p *Policy) Sanitize(s string) string { + if strings.TrimSpace(s) == "" { + return s + } + + return p.sanitizeWithBuff(strings.NewReader(s)).String() +} + +// SanitizeBytes takes a []byte that contains a HTML fragment or document and applies +// the given policy allowlist. +// +// It returns a []byte containing the HTML that has been sanitized by the policy +// or an empty []byte if an error has occurred (most likely as a consequence of +// extremely malformed input) +func (p *Policy) SanitizeBytes(b []byte) []byte { + if len(bytes.TrimSpace(b)) == 0 { + return b + } + + return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes() +} + +// SanitizeReader takes an io.Reader that contains a HTML fragment or document +// and applies the given policy allowlist. +// +// It returns a bytes.Buffer containing the HTML that has been sanitized by the +// policy. Errors during sanitization will merely return an empty result. +func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer { + return p.sanitizeWithBuff(r) +} + +// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document +// and applies the given policy allowlist and writes to the provided writer returning +// an error if there is one. +func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error { + return p.sanitize(r, w) +} + +// Query represents a single part of the query string, a query param +type Query struct { + Key string + Value string + HasValue bool +} + +func parseQuery(query string) (values []Query, err error) { + // This is essentially a copy of parseQuery from + // https://golang.org/src/net/url/url.go but adjusted to build our values + // based on our type, which we need to preserve the ordering of the query + // string + for query != "" { + key := query + if i := strings.IndexAny(key, "&;"); i >= 0 { + key, query = key[:i], key[i+1:] + } else { + query = "" + } + if key == "" { + continue + } + value := "" + hasValue := false + if i := strings.Index(key, "="); i >= 0 { + key, value = key[:i], key[i+1:] + hasValue = true + } + key, err1 := url.QueryUnescape(key) + if err1 != nil { + if err == nil { + err = err1 + } + continue + } + value, err1 = url.QueryUnescape(value) + if err1 != nil { + if err == nil { + err = err1 + } + continue + } + values = append(values, Query{ + Key: key, + Value: value, + HasValue: hasValue, + }) + } + return values, err +} + +func encodeQueries(queries []Query) string { + var buff bytes.Buffer + for i, query := range queries { + buff.WriteString(url.QueryEscape(query.Key)) + if query.HasValue { + buff.WriteString("=") + buff.WriteString(url.QueryEscape(query.Value)) + } + if i < len(queries)-1 { + buff.WriteString("&") + } + } + return buff.String() +} + +func sanitizedURL(val string) (string, error) { + u, err := url.Parse(val) + if err != nil { + return "", err + } + + // we use parseQuery but not u.Query to keep the order not change because + // url.Values is a map which has a random order. + queryValues, err := parseQuery(u.RawQuery) + if err != nil { + return "", err + } + // sanitize the url query params + for i, query := range queryValues { + queryValues[i].Key = html.EscapeString(query.Key) + } + u.RawQuery = encodeQueries(queryValues) + // u.String() will also sanitize host/scheme/user/pass + return u.String(), nil +} + +// Performs the actual sanitization process. 
+
+// Performs the actual sanitization process.
+func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
+	var buff bytes.Buffer
+	if err := p.sanitize(r, &buff); err != nil {
+		return &bytes.Buffer{}
+	}
+	return &buff
+}
+
+type asStringWriter struct {
+	io.Writer
+}
+
+func (a *asStringWriter) WriteString(s string) (int, error) {
+	return a.Write([]byte(s))
+}
+
+func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
+	// It is possible that the developer has created the policy via:
+	//   p := bluemonday.Policy{}
+	// rather than:
+	//   p := bluemonday.NewPolicy()
+	// If this is the case, and if they haven't yet triggered an action that
+	// would initialize the maps, then we need to do that.
+	p.init()
+
+	buff, ok := w.(stringWriterWriter)
+	if !ok {
+		buff = &asStringWriter{w}
+	}
+
+	var (
+		skipElementContent       bool
+		skippingElementsCount    int64
+		skipClosingTag           bool
+		closingTagToSkipStack    []string
+		mostRecentlyStartedToken string
+	)
+
+	tokenizer := html.NewTokenizer(r)
+	for {
+		if tokenizer.Next() == html.ErrorToken {
+			err := tokenizer.Err()
+			if err == io.EOF {
+				// End of input means end of processing
+				return nil
+			}
+
+			// Raw tokenizer error
+			return err
+		}
+
+		token := tokenizer.Token()
+		switch token.Type {
+		case html.DoctypeToken:
+
+			// DocType is not handled as there is no safe parsing mechanism
+			// provided by golang.org/x/net/html for the content, and this can
+			// be misused to insert HTML tags that are not then sanitized
+			//
+			// One might wish to recursively sanitize here using the same policy
+			// but I will need to do some further testing before considering
+			// this.
+
+		case html.CommentToken:
+
+			// Comments are ignored by default
+			if p.allowComments {
+				// But if allowed then write the comment out as-is
+				buff.WriteString(token.String())
+			}
+
+		case html.StartTagToken:
+
+			mostRecentlyStartedToken = normaliseElementName(token.Data)
+
+			switch normaliseElementName(token.Data) {
+			case `script`:
+				if !p.allowUnsafe {
+					continue
+				}
+			case `style`:
+				if !p.allowUnsafe {
+					continue
+				}
+			}
+
+			aps, ok := p.elsAndAttrs[token.Data]
+			if !ok {
+				aa, matched := p.matchRegex(token.Data)
+				if !matched {
+					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
+						skipElementContent = true
+						skippingElementsCount++
+					}
+					if p.addSpaces {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+				aps = aa
+			}
+			if len(token.Attr) != 0 {
+				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
+			}
+
+			if len(token.Attr) == 0 {
+				if !p.allowNoAttrs(token.Data) {
+					skipClosingTag = true
+					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
+					if p.addSpaces {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+			}
+
+			if !skipElementContent {
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
+			}
+
+		case html.EndTagToken:
+
+			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
+				mostRecentlyStartedToken = ""
+			}
+
+			switch normaliseElementName(token.Data) {
+			case `script`:
+				if !p.allowUnsafe {
+					continue
+				}
+			case `style`:
+				if !p.allowUnsafe {
+					continue
+				}
+			}
+
+			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
+				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
+				if len(closingTagToSkipStack) == 0 {
+					skipClosingTag = false
+				}
+				if p.addSpaces {
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
+				}
+				break
+			}
+			if _, ok := p.elsAndAttrs[token.Data]; !ok {
+				match := false
+				for regex := range p.elsMatchingAndAttrs {
+					if regex.MatchString(token.Data) {
+						skipElementContent = false
+						match = true
+						break
+					}
+				}
+				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
+					skippingElementsCount--
+					if skippingElementsCount == 0 {
+						skipElementContent = false
+					}
+				}
+				if !match {
+					if p.addSpaces {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+			}
+
+			if !skipElementContent {
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
+			}
+
+		case html.SelfClosingTagToken:
+
+			switch normaliseElementName(token.Data) {
+			case `script`:
+				if !p.allowUnsafe {
+					continue
+				}
+			case `style`:
+				if !p.allowUnsafe {
+					continue
+				}
+			}
+
+			aps, ok := p.elsAndAttrs[token.Data]
+			if !ok {
+				aa, matched := p.matchRegex(token.Data)
+				if !matched {
+					if p.addSpaces && !matched {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+				aps = aa
+			}
+
+			if len(token.Attr) != 0 {
+				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
+			}
+
+			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
+				if p.addSpaces {
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
+				}
+				break
+			}
+			if !skipElementContent {
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
+			}
+
+		case html.TextToken:
+
+			if !skipElementContent {
+				switch mostRecentlyStartedToken {
+				case `script`:
+					// not encouraged, but if a policy allows JavaScript we
+					// should not HTML escape it as that would break the output
+					//
+					// requires p.AllowUnsafe()
+					if p.allowUnsafe {
+						if _, err := buff.WriteString(token.Data); err != nil {
+							return err
+						}
+					}
+				case "style":
+					// not encouraged, but if a policy allows CSS styles we
+					// should not HTML escape it as that would break the output
+					//
+					// requires p.AllowUnsafe()
+					if p.allowUnsafe {
+						if _, err := buff.WriteString(token.Data); err != nil {
+							return err
+						}
+					}
+				default:
+					// HTML escape the text
+					if _, err := buff.WriteString(token.String()); err != nil {
+						return err
+					}
+				}
+			}
+
+		default:
+			// A token that didn't exist in the html package when we wrote this
+			return fmt.Errorf("unknown token: %v", token)
+		}
+	}
+}
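sanitize() above is a single streaming pass over golang.org/x/net/html tokens rather than a DOM parse. A stripped-down sketch of that tokenizer pattern (illustration only, not the vendored logic):

```go
package main

import (
	"fmt"
	"io"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader(`<p onclick="x()">hi<br/></p>`))
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() == io.EOF {
				return // end of input ends the pass, as in sanitize() above
			}
			panic(z.Err())
		}
		// Each token is inspected (and possibly rewritten) before output.
		t := z.Token()
		fmt.Printf("%-18v %q\n", t.Type, t.Data)
	}
}
```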
+
+// sanitizeAttrs takes a set of element attribute policies and the global
+// attribute policies and applies them to the []html.Attribute returning a set
+// of html.Attributes that match the policies
+func (p *Policy) sanitizeAttrs(
+	elementName string,
+	attrs []html.Attribute,
+	aps map[string][]attrPolicy,
+) []html.Attribute {
+
+	if len(attrs) == 0 {
+		return attrs
+	}
+
+	hasStylePolicies := false
+	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
+	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
+		hasStylePolicies = true
+	}
+	// no specific element policy found, look for a pattern match
+	if !hasStylePolicies {
+		for k, v := range p.elsMatchingAndStyles {
+			if k.MatchString(elementName) {
+				if len(v) > 0 {
+					hasStylePolicies = true
+					break
+				}
+			}
+		}
+	}
+
+	// Builds a new attribute slice based on whether the attribute has been
+	// allowed explicitly or globally.
+	cleanAttrs := []html.Attribute{}
+attrsLoop:
+	for _, htmlAttr := range attrs {
+		if p.allowDataAttributes {
+			// If we see a data attribute, let it through.
+			if isDataAttribute(htmlAttr.Key) {
+				cleanAttrs = append(cleanAttrs, htmlAttr)
+				continue
+			}
+		}
+		// Is this a "style" attribute, and if so, do we need to sanitize it?
+		if htmlAttr.Key == "style" && hasStylePolicies {
+			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
+			if htmlAttr.Val == "" {
+				// We've sanitized away any and all styles; don't bother to
+				// output the style attribute (even if it's allowed)
+				continue
+			} else {
+				cleanAttrs = append(cleanAttrs, htmlAttr)
+				continue
+			}
+		}
+
+		// Is there an element specific attribute policy that applies?
+		if apl, ok := aps[htmlAttr.Key]; ok {
+			for _, ap := range apl {
+				if ap.regexp != nil {
+					if ap.regexp.MatchString(htmlAttr.Val) {
+						cleanAttrs = append(cleanAttrs, htmlAttr)
+						continue attrsLoop
+					}
+				} else {
+					cleanAttrs = append(cleanAttrs, htmlAttr)
+					continue attrsLoop
+				}
+			}
+		}
+
+		// Is there a global attribute policy that applies?
+		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
+			for _, ap := range apl {
+				if ap.regexp != nil {
+					if ap.regexp.MatchString(htmlAttr.Val) {
+						cleanAttrs = append(cleanAttrs, htmlAttr)
+						continue attrsLoop
+					}
+				} else {
+					cleanAttrs = append(cleanAttrs, htmlAttr)
+					continue attrsLoop
+				}
+			}
+		}
+	}
+
+	if len(cleanAttrs) == 0 {
+		// If nothing was allowed, let's get out of here
+		return cleanAttrs
+	}
+	// cleanAttrs now contains the attributes that are permitted
+
+	if linkable(elementName) {
+		if p.requireParseableURLs {
+			// Ensure URLs are parseable:
+			// - a.href
+			// - area.href
+			// - link.href
+			// - blockquote.cite
+			// - q.cite
+			// - img.src
+			// - script.src
+			tmpAttrs := []html.Attribute{}
+			for _, htmlAttr := range cleanAttrs {
+				switch elementName {
+				case "a", "area", "base", "link":
+					if htmlAttr.Key == "href" {
+						if u, ok := p.validURL(htmlAttr.Val); ok {
+							htmlAttr.Val = u
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+						break
+					}
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				case "blockquote", "del", "ins", "q":
+					if htmlAttr.Key == "cite" {
+						if u, ok := p.validURL(htmlAttr.Val); ok {
+							htmlAttr.Val = u
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+						break
+					}
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
+					if htmlAttr.Key == "src" {
+						if u, ok := p.validURL(htmlAttr.Val); ok {
+							if p.srcRewriter != nil {
+								parsedURL, err := url.Parse(u)
+								if err != nil {
+									fmt.Println(err)
+								}
+								p.srcRewriter(parsedURL)
+								u = parsedURL.String()
+							}
+							htmlAttr.Val = u
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+						break
+					}
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				default:
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				}
+			}
+			cleanAttrs = tmpAttrs
+		}
+
+		if (p.requireNoFollow ||
+			p.requireNoFollowFullyQualifiedLinks ||
+			p.requireNoReferrer ||
+			p.requireNoReferrerFullyQualifiedLinks ||
+			p.addTargetBlankToFullyQualifiedLinks) &&
+			len(cleanAttrs) > 0 {
+
+			// Add rel="nofollow" if a "href" exists
+			switch elementName {
+			case "a", "area", "base", "link":
+				var hrefFound bool
+				var externalLink bool
+				for _, htmlAttr := range cleanAttrs {
+					if htmlAttr.Key == "href" {
+						hrefFound = true
+
+						u, err := url.Parse(htmlAttr.Val)
+						if err != nil {
+							continue
+						}
+						if u.Host != "" {
+							externalLink = true
+						}
+
+						continue
+					}
+				}
+
+				if hrefFound {
+					var (
+						noFollowFound    bool
+						noReferrerFound  bool
+						targetBlankFound bool
+					)
+
+					addNoFollow := (p.requireNoFollow ||
+						externalLink && p.requireNoFollowFullyQualifiedLinks)
+
+					addNoReferrer := (p.requireNoReferrer ||
+						externalLink && p.requireNoReferrerFullyQualifiedLinks)
+
+					addTargetBlank := (externalLink &&
+						p.addTargetBlankToFullyQualifiedLinks)
+
+					tmpAttrs := []html.Attribute{}
+					for _, htmlAttr := range cleanAttrs {
+
+						var appended bool
+						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {
+
+							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
+								htmlAttr.Val += " nofollow"
+							}
+							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
+								htmlAttr.Val += " noreferrer"
+							}
+							noFollowFound = addNoFollow
+							noReferrerFound = addNoReferrer
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+							appended = true
+						}
+
+						if elementName == "a" && htmlAttr.Key == "target" {
+							if htmlAttr.Val == "_blank" {
+								targetBlankFound = true
+							}
+							if addTargetBlank && !targetBlankFound {
+								htmlAttr.Val = "_blank"
+								targetBlankFound = true
+								tmpAttrs = append(tmpAttrs, htmlAttr)
+								appended = true
+							}
+						}
+
+						if !appended {
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+					}
+					if noFollowFound || noReferrerFound || targetBlankFound {
+						cleanAttrs = tmpAttrs
+					}
+
+					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
+						rel := html.Attribute{}
+						rel.Key = "rel"
+						if addNoFollow {
+							rel.Val = "nofollow"
+						}
+						if addNoReferrer {
+							if rel.Val != "" {
+								rel.Val += " "
+							}
+							rel.Val += "noreferrer"
+						}
+						cleanAttrs = append(cleanAttrs, rel)
+					}
+
+					if elementName == "a" && addTargetBlank && !targetBlankFound {
+						rel := html.Attribute{}
+						rel.Key = "target"
+						rel.Val = "_blank"
+						targetBlankFound = true
+						cleanAttrs = append(cleanAttrs, rel)
+					}
+
+					if targetBlankFound {
+						// target="_blank" has a security risk that allows the
+						// opened window/tab to issue JavaScript calls against
+						// window.opener, which in effect allow the destination
+						// of the link to control the source:
+						// https://dev.to/ben/the-targetblank-vulnerability-by-example
+						//
+						// To mitigate this risk, we need to add a specific rel
+						// attribute if it is not already present.
+						// rel="noopener"
+						//
+						// Unfortunately this is processing the rel twice (we
+						// already looked at it earlier ^^) as we cannot be sure
+						// of the ordering of the href and rel, and whether we
+						// have fully satisfied that we need to do this. This
+						// double processing only happens *if* target="_blank"
+						// is true.
+						var noOpenerAdded bool
+						tmpAttrs := []html.Attribute{}
+						for _, htmlAttr := range cleanAttrs {
+							var appended bool
+							if htmlAttr.Key == "rel" {
+								if strings.Contains(htmlAttr.Val, "noopener") {
+									noOpenerAdded = true
+									tmpAttrs = append(tmpAttrs, htmlAttr)
+								} else {
+									htmlAttr.Val += " noopener"
+									noOpenerAdded = true
+									tmpAttrs = append(tmpAttrs, htmlAttr)
+								}
+
+								appended = true
+							}
+							if !appended {
+								tmpAttrs = append(tmpAttrs, htmlAttr)
+							}
+						}
+						if noOpenerAdded {
+							cleanAttrs = tmpAttrs
+						} else {
+							// rel attr was not found, or else noopener would
+							// have been added already
+							rel := html.Attribute{}
+							rel.Key = "rel"
+							rel.Val = "noopener"
+							cleanAttrs = append(cleanAttrs, rel)
+						}
+
+					}
+				}
+			default:
+			}
+		}
+	}
+
+	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
+		switch elementName {
+		case "audio", "img", "link", "script", "video":
+			var crossOriginFound bool
+			for i, htmlAttr := range cleanAttrs {
+				if htmlAttr.Key == "crossorigin" {
+					crossOriginFound = true
+					cleanAttrs[i].Val = "anonymous"
+				}
+			}
+
+			if !crossOriginFound {
+				crossOrigin := html.Attribute{}
+				crossOrigin.Key = "crossorigin"
+				crossOrigin.Val = "anonymous"
+				cleanAttrs = append(cleanAttrs, crossOrigin)
+			}
+		}
+	}
+
+	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
+		var sandboxFound bool
+		for i, htmlAttr := range cleanAttrs {
+			if htmlAttr.Key == "sandbox" {
+				sandboxFound = true
+				var cleanVals []string
+				cleanValsSet := make(map[string]bool)
+				for _, val := range strings.Fields(htmlAttr.Val) {
+					if p.requireSandboxOnIFrame[val] {
+						if !cleanValsSet[val] {
+							cleanVals = append(cleanVals, val)
+							cleanValsSet[val] = true
+						}
+					}
+				}
+				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
+			}
+		}
+
+		if !sandboxFound {
+			sandbox := html.Attribute{}
+			sandbox.Key = "sandbox"
+			sandbox.Val = ""
+			cleanAttrs = append(cleanAttrs, sandbox)
+		}
+	}
+
+	return cleanAttrs
+}
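The attribute policies consulted in sanitizeAttrs are built through the upstream builder API. A small sketch of how element-specific policies, regex-gated global policies and the rel="nofollow" switch interact (the regex and input are invented for illustration):

```go
package main

import (
	"fmt"
	"regexp"

	"github.com/microcosm-cc/bluemonday"
)

func main() {
	p := bluemonday.NewPolicy()
	p.AllowElements("a")
	p.AllowAttrs("href").OnElements("a")                                      // element-specific policy
	p.AllowAttrs("lang").Matching(regexp.MustCompile(`^[a-z]{2}$`)).Globally() // global, regex-gated
	p.RequireNoFollowOnLinks(true)

	// onclick has no policy and is dropped; rel="nofollow" is appended.
	fmt.Println(p.Sanitize(`<a href="/home" lang="en" onclick="evil()">home</a>`))
}
```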
+
+func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
+	sps := p.elsAndStyles[elementName]
+	if len(sps) == 0 {
+		sps = map[string][]stylePolicy{}
+		// check for any matching elements, if we don't already have a policy found
+		// if multiple matches are found they will be overwritten, it's best
+		// to not have overlapping matchers
+		for regex, policies := range p.elsMatchingAndStyles {
+			if regex.MatchString(elementName) {
+				for k, v := range policies {
+					sps[k] = append(sps[k], v...)
+				}
+			}
+		}
+	}
+
+	// Add semi-colon to end to fix parsing issue
+	attr.Val = strings.TrimRight(attr.Val, " ")
+	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
+		attr.Val = attr.Val + ";"
+	}
+	decs, err := parser.ParseDeclarations(attr.Val)
+	if err != nil {
+		attr.Val = ""
+		return attr
+	}
+	clean := []string{}
+	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
+
+decLoop:
+	for _, dec := range decs {
+		tempProperty := strings.ToLower(dec.Property)
+		tempValue := removeUnicode(strings.ToLower(dec.Value))
+		for _, i := range prefixes {
+			tempProperty = strings.TrimPrefix(tempProperty, i)
+		}
+		if spl, ok := sps[tempProperty]; ok {
+			for _, sp := range spl {
+				if sp.handler != nil {
+					if sp.handler(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if len(sp.enum) > 0 {
+					if stringInSlice(tempValue, sp.enum) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if sp.regexp != nil {
+					if sp.regexp.MatchString(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				}
+			}
+		}
+		if spl, ok := p.globalStyles[tempProperty]; ok {
+			for _, sp := range spl {
+				if sp.handler != nil {
+					if sp.handler(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if len(sp.enum) > 0 {
+					if stringInSlice(tempValue, sp.enum) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if sp.regexp != nil {
+					if sp.regexp.MatchString(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				}
+			}
+		}
+	}
+	if len(clean) > 0 {
+		attr.Val = strings.Join(clean, "; ")
+	} else {
+		attr.Val = ""
+	}
+	return attr
+}
+
+func (p *Policy) allowNoAttrs(elementName string) bool {
+	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
+	if !ok {
+		for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
+			if r.MatchString(elementName) {
+				ok = true
+				break
+			}
+		}
+	}
+	return ok
+}
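sanitizeStyles only keeps declarations that a style policy explicitly admits (handler, enum, or regexp). A sketch using the upstream AllowStyles builder; the expected output is my reading of the code above, not something asserted by this PR:

```go
package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

func main() {
	p := bluemonday.NewPolicy()
	p.AllowElements("span")
	p.AllowStyles("text-align").MatchingEnum("left", "center", "right").OnElements("span")

	// "color" has no policy, so only the text-align declaration should survive.
	fmt.Println(p.Sanitize(`<span style="text-align: center; color: red">x</span>`))
}
```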
+
+func (p *Policy) validURL(rawurl string) (string, bool) {
+	if p.requireParseableURLs {
+		// URLs are valid if when space is trimmed the URL is valid
+		rawurl = strings.TrimSpace(rawurl)
+
+		// URLs cannot contain whitespace, unless it is a data-uri
+		if strings.Contains(rawurl, " ") ||
+			strings.Contains(rawurl, "\t") ||
+			strings.Contains(rawurl, "\n") {
+			if !strings.HasPrefix(rawurl, `data:`) {
+				return "", false
+			}
+
+			// Remove \r and \n from base64 encoded data to pass url.Parse.
+			matched := dataURIbase64Prefix.FindString(rawurl)
+			if matched != "" {
+				rawurl = matched + strings.Replace(
+					strings.Replace(
+						rawurl[len(matched):],
+						"\r",
+						"",
+						-1,
+					),
+					"\n",
+					"",
+					-1,
+				)
+			}
+		}
+
+		// URLs are valid if they parse
+		u, err := url.Parse(rawurl)
+		if err != nil {
+			return "", false
+		}
+
+		if u.Scheme != "" {
+			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
+			if !ok {
+				for _, r := range p.allowURLSchemeRegexps {
+					if r.MatchString(u.Scheme) {
+						return u.String(), true
+					}
+				}
+
+				return "", false
+			}
+
+			if len(urlPolicies) == 0 {
+				return u.String(), true
+			}
+
+			for _, urlPolicy := range urlPolicies {
+				if urlPolicy(u) {
+					return u.String(), true
+				}
+			}
+
+			return "", false
+		}
+
+		if p.allowRelativeURLs {
+			if u.String() != "" {
+				return u.String(), true
+			}
+		}
+
+		return "", false
+	}
+
+	return rawurl, true
+}
+
+func linkable(elementName string) bool {
+	switch elementName {
+	case "a", "area", "base", "link":
+		// elements that allow .href
+		return true
+	case "blockquote", "del", "ins", "q":
+		// elements that allow .cite
+		return true
+	case "audio", "embed", "iframe", "img", "input", "script", "track", "video":
+		// elements that allow .src
+		return true
+	default:
+		return false
+	}
+}
+
+// stringInSlice returns true if needle exists in haystack
+func stringInSlice(needle string, haystack []string) bool {
+	for _, straw := range haystack {
+		if strings.EqualFold(straw, needle) {
+			return true
+		}
+	}
+	return false
+}
+
+func isDataAttribute(val string) bool {
+	if !dataAttribute.MatchString(val) {
+		return false
+	}
+	rest := strings.Split(val, "data-")
+	if len(rest) == 1 {
+		return false
+	}
+	// data-xml* is invalid.
+	if dataAttributeXMLPrefix.MatchString(rest[1]) {
+		return false
+	}
+	// no uppercase or semi-colons allowed.
+	if dataAttributeInvalidChars.MatchString(rest[1]) {
+		return false
+	}
+	return true
+}
+
+func removeUnicode(value string) string {
+	substitutedValue := value
+	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
+	for currentLoc != nil {
+
+		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
+		character = strings.TrimSpace(character)
+		if len(character) < 4 {
+			character = strings.Repeat("0", 4-len(character)) + character
+		} else {
+			for len(character) > 4 {
+				if character[0] != '0' {
+					character = ""
+					break
+				} else {
+					character = character[1:]
+				}
+			}
+		}
+		character = "\\u" + character
+		translatedChar, err := strconv.Unquote(`"` + character + `"`)
+		translatedChar = strings.TrimSpace(translatedChar)
+		if err != nil {
+			return ""
+		}
+		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
+		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
+	}
+	return substitutedValue
+}
+
+func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
+	aps := make(map[string][]attrPolicy, 0)
+	matched := false
+	for regex, attrs := range p.elsMatchingAndAttrs {
+		if regex.MatchString(elementName) {
+			matched = true
+			for k, v := range attrs {
+				aps[k] = append(aps[k], v...)
+			}
+		}
+	}
+	return aps, matched
+}
+
+// normaliseElementName takes a HTML element like