azureblob: add --azureblob-decompress flag to download gzip-encoded files

Before this change, if an object compressed with "Content-Encoding:
gzip" was downloaded, a length and hash mismatch would occur since the
Go HTTP transport automatically decompressed the object on download.

If --azureblob-decompress is set, this change erases the length and hash on
compressed objects so they can be downloaded successfully, at the cost
of not being able to check the length or the hash of the downloaded
object.

If --azureblob-decompress is not set the compressed files will be downloaded
as-is providing compressed objects with intact size and hash
information.

Fixes #9337
This commit is contained in:
Nick Craig-Wood
2026-04-13 11:48:32 +01:00
parent cf11c8bbd9
commit dd5250ca55
2 changed files with 135 additions and 9 deletions

View File

@@ -47,6 +47,7 @@ import (
"github.com/rclone/rclone/lib/multipart"
"github.com/rclone/rclone/lib/pacer"
"github.com/rclone/rclone/lib/pool"
"github.com/rclone/rclone/lib/readers"
"github.com/rclone/rclone/lib/transferaccounter"
"golang.org/x/sync/errgroup"
)
@@ -67,6 +68,16 @@ const (
sasCopyValidity = time.Hour // how long SAS should last when doing server side copy
)
// setAcceptEncodingGzip is a per-call policy that sets Accept-Encoding: gzip
// on every request. This prevents the Go HTTP transport from automatically
// decompressing gzip-encoded blobs on download.
type setAcceptEncodingGzip struct{}

// Do implements the azcore policy interface for setAcceptEncodingGzip.
//
// Setting "Accept-Encoding: gzip" explicitly stops the Go HTTP transport
// from adding the header itself, which in turn stops it from transparently
// decompressing the response body — so compressed blobs arrive as-is and
// their size/hash match the stored object.
func (p setAcceptEncodingGzip) Do(req *policy.Request) (*http.Response, error) {
	req.Raw().Header.Set("Accept-Encoding", "gzip")
	return req.Next()
}
var (
errCantUpdateArchiveTierBlobs = fserrors.NoRetryError(errors.New("can't update archive tier blob without --azureblob-archive-tier-delete"))
@@ -350,6 +361,19 @@ rclone does if you know the container exists already.
Default: "",
Exclusive: true,
Advanced: true,
}, {
Name: "decompress",
Help: `If set this will decompress gzip encoded objects.
It is possible to upload objects to Azure Blob Storage with "Content-Encoding: gzip"
set. Normally rclone will download these files as compressed objects.
If this flag is set then rclone will decompress these files with
"Content-Encoding: gzip" as they are received. This means that rclone
can't check the size and hash but the file contents will be decompressed.
`,
Advanced: true,
Default: false,
}}),
})
}
@@ -373,6 +397,7 @@ type Options struct {
NoCheckContainer bool `config:"no_check_container"`
NoHeadObject bool `config:"no_head_object"`
DeleteSnapshots string `config:"delete_snapshots"`
Decompress bool `config:"decompress"`
}
// Fs represents a remote azure server
@@ -397,6 +422,8 @@ type Fs struct {
copyToken *pacer.TokenDispenser // global multipart copy concurrency limiter
publicAccess container.PublicAccessType // Container Public Access Level
warnCompressed sync.Once // warn once about compressed files
// user delegation cache
userDelegationMu sync.Mutex
userDelegation *service.UserDelegationCredential
@@ -405,15 +432,16 @@ type Fs struct {
// Object describes an azure object
type Object struct {
	fs              *Fs               // what this object is part of
	remote          string            // The remote path
	modTime         time.Time         // The modified time of the object if known
	md5             string            // MD5 hash if known
	size            int64             // Size of the object (-1 when unknown, e.g. when decompressing)
	mimeType        string            // Content-Type of the object
	accessTier      blob.AccessTier   // Blob Access Tier
	meta            map[string]string // blob metadata - take metadataMu when accessing
	tags            map[string]string // blob tags
	contentEncoding *string           // Content-Encoding of the object (nil if unset)
}
// ------------------------------------------------------------
@@ -634,6 +662,9 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e
NewClientWithSharedKeyCredential: service.NewClientWithSharedKeyCredential,
NewSharedKeyCredential: service.NewSharedKeyCredential,
SetClientOptions: func(options *service.ClientOptions, policyClientOptions policy.ClientOptions) {
// Override the automatic decompression in the transport
// to download compressed files as-is
policyClientOptions.PerCallPolicies = append(policyClientOptions.PerCallPolicies, setAcceptEncodingGzip{})
options.ClientOptions = policyClientOptions
},
}
@@ -2001,6 +2032,10 @@ func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) {
if t != hash.MD5 {
return "", hash.ErrUnsupported
}
// If decompressing, erase the hash
if o.size < 0 {
return "", nil
}
// Convert base64 encoded md5 into lower case hex
if o.md5 == "" {
return "", nil
@@ -2126,6 +2161,13 @@ func (o *Object) decodeMetaDataFromPropertiesResponse(info *blob.GetPropertiesRe
o.accessTier = blob.AccessTier(*info.AccessTier)
}
o.setMetadata(metadata)
o.contentEncoding = info.ContentEncoding
// If decompressing then size and md5sum are unknown
if o.fs.opt.Decompress && o.contentEncoding != nil && *o.contentEncoding == "gzip" {
o.size = -1
o.md5 = ""
}
return nil
}
@@ -2178,6 +2220,13 @@ func (o *Object) decodeMetaDataFromDownloadResponse(info *blob.DownloadStreamRes
fs.Debugf(o, "Failed to find length in %q", contentRange)
}
}
o.contentEncoding = info.ContentEncoding
// If decompressing then size and md5sum are unknown
if o.fs.opt.Decompress && o.contentEncoding != nil && *o.contentEncoding == "gzip" {
o.size = -1
o.md5 = ""
}
return nil
}
@@ -2216,6 +2265,13 @@ func (o *Object) decodeMetaDataFromBlob(info *container.BlobItem) (err error) {
o.accessTier = *info.Properties.AccessTier
}
o.setMetadata(metadata)
o.contentEncoding = info.Properties.ContentEncoding
// If decompressing then size and md5sum are unknown
if o.fs.opt.Decompress && o.contentEncoding != nil && *o.contentEncoding == "gzip" {
o.size = -1
o.md5 = ""
}
return nil
}
@@ -2391,6 +2447,17 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
if err != nil {
return nil, fmt.Errorf("failed to decode metadata for download: %w", err)
}
// Decompress body if necessary
if downloadResponse.ContentEncoding != nil && *downloadResponse.ContentEncoding == "gzip" {
if o.fs.opt.Decompress {
return readers.NewGzipReader(downloadResponse.Body)
}
o.fs.warnCompressed.Do(func() {
fs.Logf(o, "Not decompressing 'Content-Encoding: gzip' compressed file. Use --azureblob-decompress to override")
})
}
return downloadResponse.Body, nil
}

View File

@@ -3,7 +3,10 @@
package azureblob
import (
"bytes"
"compress/gzip"
"context"
"crypto/md5"
"encoding/base64"
"fmt"
"net/http"
@@ -14,6 +17,7 @@ import (
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blockblob"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/fs/object"
"github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/fstest/fstests"
@@ -150,10 +154,65 @@ func (f *Fs) testWriteUncommittedBlocks(t *testing.T) {
require.NoError(t, dst.Remove(ctx))
}
// gz returns s compressed with gzip, failing the test on any error.
func gz(t *testing.T, s string) string {
	var out bytes.Buffer
	w := gzip.NewWriter(&out)
	_, err := w.Write([]byte(s))
	require.NoError(t, err)
	require.NoError(t, w.Close())
	return out.String()
}
// md5sum returns the hex encoded MD5 checksum of s.
//
// t is unused but kept for signature consistency with the other test
// helpers in this file (e.g. gz).
func md5sum(t *testing.T, s string) string {
	// Name the local "sum" rather than "hash" so it doesn't shadow the
	// imported fs/hash package used elsewhere in this file.
	sum := md5.Sum([]byte(s))
	return fmt.Sprintf("%x", sum)
}
// testGzipEncoding uploads a gzip-compressed blob with
// "Content-Encoding: gzip" set and checks it can be downloaded both
// as-is (compressed, with intact size and hash) and decompressed
// (size -1 and empty hash, since they can't be known after decompression).
func (f *Fs) testGzipEncoding(t *testing.T) {
	ctx := context.Background()
	// original is the plaintext; contents is its gzipped form, which is
	// what actually gets stored in the blob.
	original := random.String(1000)
	contents := gz(t, original)
	item := fstest.NewItem("test-gzip", contents, fstest.Time("2001-05-06T04:05:06.499999999Z"))
	// Set Content-Encoding via metadata so the blob is served as gzip.
	// NOTE(review): mimeType "text/html" is passed to Put below while the
	// metadata says "text/plain" — presumably the metadata value wins;
	// confirm against PutTestContentsMetadata's handling.
	metadata := fs.Metadata{
		"content-encoding": "gzip",
		"content-type":     "text/plain",
	}
	obj := fstests.PutTestContentsMetadata(ctx, t, f, &item, true, contents, true, "text/html", metadata)
	defer func() {
		assert.NoError(t, obj.Remove(ctx))
	}()
	o := obj.(*Object)
	// Test that the gzipped file we uploaded can be
	// downloaded with and without decompression
	checkDownload := func(wantContents string, wantSize int64, wantHash string) {
		gotContents := fstests.ReadObject(ctx, t, o, -1)
		assert.Equal(t, wantContents, gotContents)
		assert.Equal(t, wantSize, o.Size())
		gotHash, err := o.Hash(ctx, hash.MD5)
		require.NoError(t, err)
		assert.Equal(t, wantHash, gotHash)
	}
	// Default behaviour: the compressed bytes come back unchanged with a
	// valid size and MD5.
	t.Run("NoDecompress", func(t *testing.T) {
		checkDownload(contents, int64(len(contents)), md5sum(t, contents))
	})
	// With Decompress set the plaintext comes back, but size (-1) and
	// hash ("") are erased. The option is flipped directly on the shared
	// Fs and restored afterwards so later tests see the default.
	t.Run("Decompress", func(t *testing.T) {
		f.opt.Decompress = true
		defer func() {
			f.opt.Decompress = false
		}()
		checkDownload(original, -1, "")
	})
}
// InternalTest runs the backend-specific internal tests in order.
func (f *Fs) InternalTest(t *testing.T) {
	subtests := []struct {
		name string
		fn   func(*testing.T)
	}{
		{"Features", f.testFeatures},
		{"WriteUncommittedBlocks", f.testWriteUncommittedBlocks},
		{"Metadata", f.testMetadataPaths},
		{"GzipEncoding", f.testGzipEncoding},
	}
	for _, st := range subtests {
		t.Run(st.name, st.fn)
	}
}
// helper to read blob properties for an object