From dd5250ca55f1e5a5cc407ee768fb7f3f76b1df41 Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Mon, 13 Apr 2026 11:48:32 +0100 Subject: [PATCH] azureblob: add --azureblob-decompress flag to download gzip-encoded files Before this change, if an object compressed with "Content-Encoding: gzip" was downloaded, a length and hash mismatch would occur since the Go HTTP transport automatically decompressed the object on download. If --azureblob-decompress is set, this change erases the length and hash on compressed objects so they can be downloaded successfully, at the cost of not being able to check the length or the hash of the downloaded object. If --azureblob-decompress is not set, the compressed files will be downloaded as-is, providing compressed objects with intact size and hash information. Fixes #9337 --- backend/azureblob/azureblob.go | 85 +++++++++++++++++--- backend/azureblob/azureblob_internal_test.go | 59 ++++++++++++++ 2 files changed, 135 insertions(+), 9 deletions(-) diff --git a/backend/azureblob/azureblob.go b/backend/azureblob/azureblob.go index 0740b8339..11b68521c 100644 --- a/backend/azureblob/azureblob.go +++ b/backend/azureblob/azureblob.go @@ -47,6 +47,7 @@ import ( "github.com/rclone/rclone/lib/multipart" "github.com/rclone/rclone/lib/pacer" "github.com/rclone/rclone/lib/pool" + "github.com/rclone/rclone/lib/readers" "github.com/rclone/rclone/lib/transferaccounter" "golang.org/x/sync/errgroup" ) @@ -67,6 +68,16 @@ const ( sasCopyValidity = time.Hour // how long SAS should last when doing server side copy ) +// setAcceptEncodingGzip is a per-call policy that sets Accept-Encoding: gzip +// on every request. This prevents the Go HTTP transport from automatically +// decompressing gzip-encoded blobs on download. 
+type setAcceptEncodingGzip struct{} + +func (p setAcceptEncodingGzip) Do(req *policy.Request) (*http.Response, error) { + req.Raw().Header.Set("Accept-Encoding", "gzip") + return req.Next() +} + var ( errCantUpdateArchiveTierBlobs = fserrors.NoRetryError(errors.New("can't update archive tier blob without --azureblob-archive-tier-delete")) @@ -350,6 +361,19 @@ rclone does if you know the container exists already. Default: "", Exclusive: true, Advanced: true, + }, { + Name: "decompress", + Help: `If set this will decompress gzip encoded objects. + +It is possible to upload objects to Azure Blob Storage with "Content-Encoding: gzip" +set. Normally rclone will download these files as compressed objects. + +If this flag is set then rclone will decompress these files with +"Content-Encoding: gzip" as they are received. This means that rclone +can't check the size and hash but the file contents will be decompressed. +`, + Advanced: true, + Default: false, }}), }) } @@ -373,6 +397,7 @@ type Options struct { NoCheckContainer bool `config:"no_check_container"` NoHeadObject bool `config:"no_head_object"` DeleteSnapshots string `config:"delete_snapshots"` + Decompress bool `config:"decompress"` } // Fs represents a remote azure server @@ -397,6 +422,8 @@ type Fs struct { copyToken *pacer.TokenDispenser // global multipart copy concurrency limiter publicAccess container.PublicAccessType // Container Public Access Level + warnCompressed sync.Once // warn once about compressed files + // user delegation cache userDelegationMu sync.Mutex userDelegation *service.UserDelegationCredential @@ -405,15 +432,16 @@ type Fs struct { // Object describes an azure object type Object struct { - fs *Fs // what this object is part of - remote string // The remote path - modTime time.Time // The modified time of the object if known - md5 string // MD5 hash if known - size int64 // Size of the object - mimeType string // Content-Type of the object - accessTier blob.AccessTier // Blob Access Tier 
- meta map[string]string // blob metadata - take metadataMu when accessing - tags map[string]string // blob tags + fs *Fs // what this object is part of + remote string // The remote path + modTime time.Time // The modified time of the object if known + md5 string // MD5 hash if known + size int64 // Size of the object + mimeType string // Content-Type of the object + accessTier blob.AccessTier // Blob Access Tier + meta map[string]string // blob metadata - take metadataMu when accessing + tags map[string]string // blob tags + contentEncoding *string // Content-Encoding of the object } // ------------------------------------------------------------ @@ -634,6 +662,9 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e NewClientWithSharedKeyCredential: service.NewClientWithSharedKeyCredential, NewSharedKeyCredential: service.NewSharedKeyCredential, SetClientOptions: func(options *service.ClientOptions, policyClientOptions policy.ClientOptions) { + // Override the automatic decompression in the transport + // to download compressed files as-is + policyClientOptions.PerCallPolicies = append(policyClientOptions.PerCallPolicies, setAcceptEncodingGzip{}) options.ClientOptions = policyClientOptions }, } @@ -2001,6 +2032,10 @@ func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) { if t != hash.MD5 { return "", hash.ErrUnsupported } + // If decompressing, erase the hash + if o.size < 0 { + return "", nil + } // Convert base64 encoded md5 into lower case hex if o.md5 == "" { return "", nil @@ -2126,6 +2161,13 @@ func (o *Object) decodeMetaDataFromPropertiesResponse(info *blob.GetPropertiesRe o.accessTier = blob.AccessTier(*info.AccessTier) } o.setMetadata(metadata) + o.contentEncoding = info.ContentEncoding + + // If decompressing then size and md5sum are unknown + if o.fs.opt.Decompress && o.contentEncoding != nil && *o.contentEncoding == "gzip" { + o.size = -1 + o.md5 = "" + } return nil } @@ -2178,6 +2220,13 @@ func (o 
*Object) decodeMetaDataFromDownloadResponse(info *blob.DownloadStreamRes fs.Debugf(o, "Failed to find length in %q", contentRange) } } + o.contentEncoding = info.ContentEncoding + + // If decompressing then size and md5sum are unknown + if o.fs.opt.Decompress && o.contentEncoding != nil && *o.contentEncoding == "gzip" { + o.size = -1 + o.md5 = "" + } return nil } @@ -2216,6 +2265,13 @@ func (o *Object) decodeMetaDataFromBlob(info *container.BlobItem) (err error) { o.accessTier = *info.Properties.AccessTier } o.setMetadata(metadata) + o.contentEncoding = info.Properties.ContentEncoding + + // If decompressing then size and md5sum are unknown + if o.fs.opt.Decompress && o.contentEncoding != nil && *o.contentEncoding == "gzip" { + o.size = -1 + o.md5 = "" + } return nil } @@ -2391,6 +2447,17 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read if err != nil { return nil, fmt.Errorf("failed to decode metadata for download: %w", err) } + + // Decompress body if necessary + if downloadResponse.ContentEncoding != nil && *downloadResponse.ContentEncoding == "gzip" { + if o.fs.opt.Decompress { + return readers.NewGzipReader(downloadResponse.Body) + } + o.fs.warnCompressed.Do(func() { + fs.Logf(o, "Not decompressing 'Content-Encoding: gzip' compressed file. 
Use --azureblob-decompress to override") + }) + } + return downloadResponse.Body, nil } diff --git a/backend/azureblob/azureblob_internal_test.go b/backend/azureblob/azureblob_internal_test.go index b2fc0f7d5..4d52a2fa9 100644 --- a/backend/azureblob/azureblob_internal_test.go +++ b/backend/azureblob/azureblob_internal_test.go @@ -3,7 +3,10 @@ package azureblob import ( + "bytes" + "compress/gzip" "context" + "crypto/md5" "encoding/base64" "fmt" "net/http" @@ -14,6 +17,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blockblob" "github.com/rclone/rclone/fs" + "github.com/rclone/rclone/fs/hash" "github.com/rclone/rclone/fs/object" "github.com/rclone/rclone/fstest" "github.com/rclone/rclone/fstest/fstests" @@ -150,10 +154,65 @@ func (f *Fs) testWriteUncommittedBlocks(t *testing.T) { require.NoError(t, dst.Remove(ctx)) } +func gz(t *testing.T, s string) string { + var buf bytes.Buffer + zw := gzip.NewWriter(&buf) + _, err := zw.Write([]byte(s)) + require.NoError(t, err) + err = zw.Close() + require.NoError(t, err) + return buf.String() +} + +func md5sum(t *testing.T, s string) string { + hash := md5.Sum([]byte(s)) + return fmt.Sprintf("%x", hash) +} + +func (f *Fs) testGzipEncoding(t *testing.T) { + ctx := context.Background() + original := random.String(1000) + contents := gz(t, original) + + item := fstest.NewItem("test-gzip", contents, fstest.Time("2001-05-06T04:05:06.499999999Z")) + metadata := fs.Metadata{ + "content-encoding": "gzip", + "content-type": "text/plain", + } + obj := fstests.PutTestContentsMetadata(ctx, t, f, &item, true, contents, true, "text/html", metadata) + defer func() { + assert.NoError(t, obj.Remove(ctx)) + }() + o := obj.(*Object) + + // Test that the gzipped file we uploaded can be + // downloaded with and without decompression + checkDownload := func(wantContents string, wantSize int64, wantHash string) { + gotContents := fstests.ReadObject(ctx, t, o, -1) + 
assert.Equal(t, wantContents, gotContents) + assert.Equal(t, wantSize, o.Size()) + gotHash, err := o.Hash(ctx, hash.MD5) + require.NoError(t, err) + assert.Equal(t, wantHash, gotHash) + } + + t.Run("NoDecompress", func(t *testing.T) { + checkDownload(contents, int64(len(contents)), md5sum(t, contents)) + }) + t.Run("Decompress", func(t *testing.T) { + f.opt.Decompress = true + defer func() { + f.opt.Decompress = false + }() + checkDownload(original, -1, "") + }) +} + func (f *Fs) InternalTest(t *testing.T) { t.Run("Features", f.testFeatures) t.Run("WriteUncommittedBlocks", f.testWriteUncommittedBlocks) t.Run("Metadata", f.testMetadataPaths) + t.Run("GzipEncoding", f.testGzipEncoding) } // helper to read blob properties for an object