From 9d09fd6af31896d2f015fc57622635b59a7e6369 Mon Sep 17 00:00:00 2001 From: Simon Frei Date: Wed, 7 Oct 2020 10:05:13 +0200 Subject: [PATCH] all: Add failure reporting (fixes #7028) (#6925) --- cmd/stcrashreceiver/main.go | 114 +++++++++++++++ cmd/stcrashreceiver/sentry.go | 63 +++++---- cmd/stcrashreceiver/sentry_test.go | 2 +- cmd/stcrashreceiver/stcrashreceiver.go | 57 +------- lib/events/events.go | 7 +- lib/model/folder.go | 6 +- lib/model/folder_sendonly.go | 1 + lib/model/model.go | 1 + lib/syncthing/syncthing.go | 2 + lib/ur/failurereporting.go | 188 +++++++++++++++++++++++++ 10 files changed, 356 insertions(+), 85 deletions(-) create mode 100644 cmd/stcrashreceiver/main.go create mode 100644 lib/ur/failurereporting.go diff --git a/cmd/stcrashreceiver/main.go b/cmd/stcrashreceiver/main.go new file mode 100644 index 000000000..4cf3edfd8 --- /dev/null +++ b/cmd/stcrashreceiver/main.go @@ -0,0 +1,114 @@ +// Copyright (C) 2019 The Syncthing Authors. +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this file, +// You can obtain one at https://mozilla.org/MPL/2.0/. + +// Command stcrashreceiver is a trivial HTTP server that allows two things: +// +// - uploading files (crash reports) named like a SHA256 hash using a PUT request +// - checking whether such file exists using a HEAD request +// +// Typically this should be deployed behind something that manages HTTPS. +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io" + "io/ioutil" + "log" + "net" + "net/http" + "os" + "time" + + "github.com/syncthing/syncthing/lib/sha256" + "github.com/syncthing/syncthing/lib/ur" + + raven "github.com/getsentry/raven-go" +) + +const maxRequestSize = 1 << 20 // 1 MiB + +func main() { + dir := flag.String("dir", ".", "Directory to store reports in") + dsn := flag.String("dsn", "", "Sentry DSN") + listen := flag.String("listen", ":22039", "HTTP listen address") + flag.Parse() + + mux := http.NewServeMux() + + cr := &crashReceiver{ + dir: *dir, + dsn: *dsn, + } + mux.Handle("/", cr) + + if *dsn != "" { + mux.HandleFunc("/failure", handleFailureFn(*dsn)) + } + + log.SetOutput(os.Stdout) + if err := http.ListenAndServe(*listen, mux); err != nil { + log.Fatalln("HTTP serve:", err) + } +} + +func handleFailureFn(dsn string) func(w http.ResponseWriter, req *http.Request) { + return func(w http.ResponseWriter, req *http.Request) { + lr := io.LimitReader(req.Body, maxRequestSize) + bs, err := ioutil.ReadAll(lr) + req.Body.Close() + if err != nil { + http.Error(w, err.Error(), 500) + return + } + + var reports []ur.FailureReport + err = json.Unmarshal(bs, &reports) + if err != nil { + http.Error(w, err.Error(), 400) + return + } + if len(reports) == 0 { + // Shouldn't happen + return + } + + version, err := parseVersion(reports[0].Version) + if err != nil { + http.Error(w, err.Error(), 400) + return + } + for _, r := range reports { + pkt := packet(version) + pkt.Message = r.Description + pkt.Extra = raven.Extra{ + "count": r.Count, + } + + if err := sendReport(dsn, pkt, userIDFor(req)); err != nil { + log.Println("Failed to send crash report:", err) + } + } + } +} + +// userIDFor returns a string we can use as the user ID for the purpose of +// counting affected users. It's the truncated hash of a salt, the user +// remote IP, and the current month. +func userIDFor(req *http.Request) string { + addr := req.RemoteAddr + if fwd := req.Header.Get("x-forwarded-for"); fwd != "" { + addr = fwd + } + if host, _, err := net.SplitHostPort(addr); err == nil { + addr = host + } + now := time.Now().Format("200601") + salt := "stcrashreporter" + hash := sha256.Sum256([]byte(salt + addr + now)) + return fmt.Sprintf("%x", hash[:8]) +} diff --git a/cmd/stcrashreceiver/sentry.go b/cmd/stcrashreceiver/sentry.go index 9ff9f68d8..295c55f32 100644 --- a/cmd/stcrashreceiver/sentry.go +++ b/cmd/stcrashreceiver/sentry.go @@ -31,12 +31,7 @@ var ( clientsMut sync.Mutex ) -func sendReport(dsn, path string, report []byte, userID string) error { - pkt, err := parseReport(path, report) - if err != nil { - return err - } - +func sendReport(dsn string, pkt *raven.Packet, userID string) error { pkt.Interfaces = append(pkt.Interfaces, &raven.User{ID: userID}) clientsMut.Lock() @@ -44,6 +39,7 @@ func sendReport(dsn, path string, report []byte, userID string) error { cli, ok := clients[dsn] if !ok { + var err error cli, err = raven.New(dsn) if err != nil { return err @@ -62,7 +58,7 @@ func sendReport(dsn, path string, report []byte, userID string) error { return <-errC } -func parseReport(path string, report []byte) (*raven.Packet, error) { +func parseCrashReport(path string, report []byte) (*raven.Packet, error) { parts := bytes.SplitN(report, []byte("\n"), 2) if len(parts) != 2 { return nil, errors.New("no first line") @@ -126,31 +122,12 @@ func parseReport(path string, report []byte) (*raven.Packet, error) { } } - pkt := &raven.Packet{ - Message: string(subjectLine), - Platform: "go", - Release: version.tag, - Environment: version.environment(), - Tags: raven.Tags{ - raven.Tag{Key: "version", Value: version.version}, - raven.Tag{Key: "tag", Value: version.tag}, - raven.Tag{Key: "codename", Value: version.codename}, - raven.Tag{Key: "runtime", Value: version.runtime}, - raven.Tag{Key: "goos", Value: version.goos}, - raven.Tag{Key: "goarch", Value: version.goarch}, - raven.Tag{Key: "builder", Value: version.builder}, - }, - Extra: raven.Extra{ - "url": reportServer + path, - }, - Interfaces: []raven.Interface{&trace}, - } - if version.commit != "" { - pkt.Tags = append(pkt.Tags, raven.Tag{Key: "commit", Value: version.commit}) - } - for _, tag := range version.extra { - pkt.Tags = append(pkt.Tags, raven.Tag{Key: tag, Value: "1"}) + pkt := packet(version) + pkt.Message = string(subjectLine) + pkt.Extra = raven.Extra{ + "url": reportServer + path, } + pkt.Interfaces = []raven.Interface{&trace} return pkt, nil } @@ -217,3 +194,27 @@ func parseVersion(line string) (version, error) { return v, nil } + +func packet(version version) *raven.Packet { + pkt := &raven.Packet{ + Platform: "go", + Release: version.tag, + Environment: version.environment(), + Tags: raven.Tags{ + raven.Tag{Key: "version", Value: version.version}, + raven.Tag{Key: "tag", Value: version.tag}, + raven.Tag{Key: "codename", Value: version.codename}, + raven.Tag{Key: "runtime", Value: version.runtime}, + raven.Tag{Key: "goos", Value: version.goos}, + raven.Tag{Key: "goarch", Value: version.goarch}, + raven.Tag{Key: "builder", Value: version.builder}, + }, + } + if version.commit != "" { + pkt.Tags = append(pkt.Tags, raven.Tag{Key: "commit", Value: version.commit}) + } + for _, tag := range version.extra { + pkt.Tags = append(pkt.Tags, raven.Tag{Key: tag, Value: "1"}) + } + return pkt +} diff --git a/cmd/stcrashreceiver/sentry_test.go b/cmd/stcrashreceiver/sentry_test.go index dfefa2580..4b7230e17 100644 --- a/cmd/stcrashreceiver/sentry_test.go +++ b/cmd/stcrashreceiver/sentry_test.go @@ -64,7 +64,7 @@ func TestParseReport(t *testing.T) { t.Fatal(err) } - pkt, err := parseReport("1/2/345", bs) + pkt, err := parseCrashReport("1/2/345", bs) if err != nil { t.Fatal(err) } diff --git a/cmd/stcrashreceiver/stcrashreceiver.go b/cmd/stcrashreceiver/stcrashreceiver.go index 9088cbe2e..dc7202274 100644 --- a/cmd/stcrashreceiver/stcrashreceiver.go +++ b/cmd/stcrashreceiver/stcrashreceiver.go @@ -4,52 +4,21 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this file, // You can obtain one at https://mozilla.org/MPL/2.0/. -// Command stcrashreceiver is a trivial HTTP server that allows two things: -// -// - uploading files (crash reports) named like a SHA256 hash using a PUT request -// - checking whether such file exists using a HEAD request -// -// Typically this should be deployed behind something that manages HTTPS. package main import ( "bytes" "compress/gzip" - "flag" - "fmt" "io" "io/ioutil" "log" - "net" "net/http" "os" "path" "path/filepath" "strings" - "time" - - "github.com/syncthing/syncthing/lib/sha256" ) -const maxRequestSize = 1 << 20 // 1 MiB - -func main() { - dir := flag.String("dir", ".", "Directory to store reports in") - dsn := flag.String("dsn", "", "Sentry DSN") - listen := flag.String("listen", ":22039", "HTTP listen address") - flag.Parse() - - cr := &crashReceiver{ - dir: *dir, - dsn: *dsn, - } - - log.SetOutput(os.Stdout) - if err := http.ListenAndServe(*listen, cr); err != nil { - log.Fatalln("HTTP serve:", err) - } -} - type crashReceiver struct { dir string dsn string @@ -155,8 +124,13 @@ func (r *crashReceiver) servePut(reportID, fullPath string, w http.ResponseWrite go func() { // There's no need for the client to have to wait for this part. - if err := sendReport(r.dsn, reportID, bs, user); err != nil { - log.Println("Failed to send report:", err) + pkt, err := parseCrashReport(reportID, bs) + if err != nil { + log.Println("Failed to parse crash report:", err) + return + } + if err := sendReport(r.dsn, pkt, user); err != nil { + log.Println("Failed to send crash report:", err) } }() } @@ -166,20 +140,3 @@ func (r *crashReceiver) servePut(reportID, fullPath string, w http.ResponseWrite func (r *crashReceiver) dirFor(base string) string { return filepath.Join(base[0:2], base[2:4]) } - -// userIDFor returns a string we can use as the user ID for the purpose of -// counting affected users. It's the truncated hash of a salt, the user -// remote IP, and the current month. -func userIDFor(req *http.Request) string { - addr := req.RemoteAddr - if fwd := req.Header.Get("x-forwarded-for"); fwd != "" { - addr = fwd - } - if host, _, err := net.SplitHostPort(addr); err == nil { - addr = host - } - now := time.Now().Format("200601") - salt := "stcrashreporter" - hash := sha256.Sum256([]byte(salt + addr + now)) - return fmt.Sprintf("%x", hash[:8]) -} diff --git a/lib/events/events.go b/lib/events/events.go index 687adb3ff..d672701e2 100644 --- a/lib/events/events.go +++ b/lib/events/events.go @@ -21,7 +21,7 @@ import ( "github.com/syncthing/syncthing/lib/util" ) -type EventType int +type EventType int64 const ( Starting EventType = 1 << iota @@ -52,6 +52,7 @@ const ( FolderWatchStateChanged ListenAddressesChanged LoginAttempt + Failure AllEvents = (1 << iota) - 1 ) @@ -121,6 +122,8 @@ func (t EventType) String() string { return "LoginAttempt" case FolderWatchStateChanged: return "FolderWatchStateChanged" + case Failure: + return "Failure" default: return "Unknown" } @@ -200,6 +203,8 @@ func UnmarshalEventType(s string) EventType { return LoginAttempt case "FolderWatchStateChanged": return FolderWatchStateChanged + case "Failure": + return Failure default: return 0 } diff --git a/lib/model/folder.go b/lib/model/folder.go index e77726c3c..9d80e1c20 100644 --- a/lib/model/folder.go +++ b/lib/model/folder.go @@ -865,11 +865,13 @@ func (f *folder) monitorWatch(ctx context.Context) { f.setWatchError(err, next) // This error was previously a panic and should never occur, so generate // a warning, but don't do it repetitively. - if !warnedOutside { - if _, ok := err.(*fs.ErrWatchEventOutsideRoot); ok { + var errOutside *fs.ErrWatchEventOutsideRoot + if errors.As(err, &errOutside) { + if !warnedOutside { l.Warnln(err) warnedOutside = true } + f.evLogger.Log(events.Failure, "watching for changes encountered an event outside of the filesystem root") } aggrCancel() errChan = nil diff --git a/lib/model/folder_sendonly.go b/lib/model/folder_sendonly.go index 3b03f1773..b9233ee4a 100644 --- a/lib/model/folder_sendonly.go +++ b/lib/model/folder_sendonly.go @@ -65,6 +65,7 @@ func (f *sendOnlyFolder) pull() bool { if !ok { if intf.IsDeleted() { l.Debugln("Should never get a deleted file as needed when we don't have it") + f.evLogger.Log(events.Failure, "got deleted file that doesn't exist locally as needed when pulling on send-only") } return true } diff --git a/lib/model/model.go b/lib/model/model.go index cf6884ec8..06c06ee89 100644 --- a/lib/model/model.go +++ b/lib/model/model.go @@ -1973,6 +1973,7 @@ func (s *indexSender) sendIndexTo(ctx context.Context) error { l.Warnln("Failed repairing sequence entries:", dbErr) panic("Failed repairing sequence entries") } else { + s.evLogger.Log(events.Failure, "detected and repaired non-increasing sequence") l.Infof("Repaired %v sequence entries in database", fixed) } }() diff --git a/lib/syncthing/syncthing.go b/lib/syncthing/syncthing.go index e77daf6f7..c855b4f47 100644 --- a/lib/syncthing/syncthing.go +++ b/lib/syncthing/syncthing.go @@ -133,6 +133,8 @@ func (a *App) Start() error { } func (a *App) startup() error { + a.mainService.Add(ur.NewFailureHandler(a.cfg, a.evLogger)) + a.mainService.Add(a.ll) if a.opts.AuditWriter != nil { diff --git a/lib/ur/failurereporting.go b/lib/ur/failurereporting.go new file mode 100644 index 000000000..e5eab99d8 --- /dev/null +++ b/lib/ur/failurereporting.go @@ -0,0 +1,188 @@ +// Copyright (C) 2020 The Syncthing Authors. +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this file, +// You can obtain one at https://mozilla.org/MPL/2.0/. + +package ur + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "time" + + "github.com/syncthing/syncthing/lib/build" + "github.com/syncthing/syncthing/lib/config" + "github.com/syncthing/syncthing/lib/dialer" + "github.com/syncthing/syncthing/lib/events" + "github.com/syncthing/syncthing/lib/util" + + "github.com/thejerf/suture" +) + +var ( + // When a specific failure first occurs, it is delayed by minDelay. If + // more of the same failures occurs those are further delayed and + // aggregated for maxDelay. + minDelay = 10 * time.Second + maxDelay = time.Minute + sendTimeout = time.Minute +) + +type FailureReport struct { + Description string + Count int + Version string +} + +type FailureHandler interface { + suture.Service + config.Committer +} + +func NewFailureHandler(cfg config.Wrapper, evLogger events.Logger) FailureHandler { + h := &failureHandler{ + cfg: cfg, + evLogger: evLogger, + optsChan: make(chan config.OptionsConfiguration), + } + h.Service = util.AsServiceWithError(h.serve, h.String()) + return h +} + +type failureHandler struct { + suture.Service + cfg config.Wrapper + evLogger events.Logger + optsChan chan config.OptionsConfiguration + evChan <-chan events.Event + buf map[string]*failureStat +} + +type failureStat struct { + first, last time.Time + count int +} + +func (h *failureHandler) serve(ctx context.Context) error { + go func() { + h.optsChan <- h.cfg.Options() + }() + h.cfg.Subscribe(h) + defer h.cfg.Unsubscribe(h) + + var url string + var err error + var sub events.Subscription + timer := time.NewTimer(minDelay) + resetTimer := make(chan struct{}) +outer: + for { + select { + case opts := <-h.optsChan: + // Sub nil checks just for safety - config updates can be racy. + if opts.URAccepted > 0 { + if sub == nil { + sub = h.evLogger.Subscribe(events.Failure) + h.evChan = sub.C() + } + } else if sub != nil { + sub.Unsubscribe() + sub = nil + } + url = opts.CRURL + "/failure" + case e := <-h.evChan: + descr := e.Data.(string) + if stat, ok := h.buf[descr]; ok { + stat.last = e.Time + stat.count++ + } else { + h.buf[descr] = &failureStat{ + first: e.Time, + last: e.Time, + count: 1, + } + } + case <-timer.C: + reports := make([]FailureReport, 0, len(h.buf)) + now := time.Now() + for descr, stat := range h.buf { + if now.Sub(stat.last) > minDelay || now.Sub(stat.first) > maxDelay { + reports = append(reports, FailureReport{ + Description: descr, + Count: stat.count, + Version: build.LongVersion, + }) + delete(h.buf, descr) + } + } + if len(reports) > 0 { + // Lets keep process events/configs while it might be timing out for a while + go func() { + sendFailureReports(ctx, reports, url) + select { + case resetTimer <- struct{}{}: + case <-ctx.Done(): + } + }() + } + case <-resetTimer: + timer.Reset(minDelay) + case <-ctx.Done(): + break outer + } + } + + if sub != nil { + sub.Unsubscribe() + } + return err +} + +func (h *failureHandler) VerifyConfiguration(_, _ config.Configuration) error { + return nil +} + +func (h *failureHandler) CommitConfiguration(from, to config.Configuration) bool { + if from.Options.CREnabled != to.Options.CREnabled || from.Options.CRURL != to.Options.CRURL { + h.optsChan <- to.Options + } + return true +} + +func (h *failureHandler) String() string { + return "FailureHandler" +} + +func sendFailureReports(ctx context.Context, reports []FailureReport, url string) { + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(reports); err != nil { + panic(err) + } + + client := &http.Client{ + Transport: &http.Transport{ + DialContext: dialer.DialContext, + Proxy: http.ProxyFromEnvironment, + }, + } + + reqCtx, reqCancel := context.WithTimeout(ctx, sendTimeout) + defer reqCancel() + req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, url, &b) + if err != nil { + l.Infoln("Failed to send failure report:", err) + return + } + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + if err != nil { + l.Infoln("Failed to send failure report:", err) + return + } + resp.Body.Close() + return +}