Fix --preallocate --sparse to actually produce sparse files

rsync.1 says combining --preallocate with --sparse yields sparse blocks
wherever the filesystem can punch holes, but since 2019 (commit c2da3809,
"keep file-size 0 when possible") it has silently left the file fully
allocated. Two problems, both rooted in that commit switching --preallocate /
--inplace to fallocate(FALLOC_FL_KEEP_SIZE):

  * do_fallocate() then returned 0 instead of the reserved length, so the
    receiver's preallocated_len was 0 and write_sparse() always lseek'd over
    null runs instead of punching them (and the over-preallocation trim in
    receiver.c never fired either).

  * more fundamentally, KEEP_SIZE leaves the file size at 0 while data is
    written incrementally, so the FALLOC_FL_PUNCH_HOLE call lands on blocks
    beyond EOF and is a silent no-op -- the reserved blocks are never freed.

Fix both: don't request KEEP_SIZE when --sparse is also active, so the file is
preallocated at full size and the punch lands within it; and return the
reserved length from do_fallocate() so preallocated_len drives the punch
decision and the over-allocation trim. --preallocate without --sparse keeps
the KEEP_SIZE (file-size-0) behaviour. t_stub.c gains a sparse_files stub since
do_fallocate now references it and the test helpers link syscall.o.

preallocate_test.py now asserts via st_blocks (where the filesystem can punch
holes) that --preallocate --sparse ends up sparse, guarding the regression.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Andrew Tridgell
2026-05-25 08:51:34 +10:00
parent bc63ea82f2
commit 4f5a5857ce
3 changed files with 70 additions and 6 deletions

View File

@@ -47,6 +47,7 @@ extern int read_only;
extern int list_only;
extern int inplace;
extern int preallocate_files;
extern int sparse_files;
extern int preserve_perms;
extern int preserve_executability;
extern int open_noatime;
@@ -1525,7 +1526,13 @@ int do_utime(const char *path, STRUCT_STAT *stp)
OFF_T do_fallocate(int fd, OFF_T offset, OFF_T length)
{
int opts = inplace || preallocate_files ? DO_FALLOC_OPTIONS : 0;
/* FALLOC_FL_KEEP_SIZE lets --preallocate/--inplace keep the file size at 0
* until data is written, but a later hole-punch (for --sparse) can only
* deallocate blocks that lie within the file's size -- with KEEP_SIZE the
* reserved blocks sit beyond EOF and the punch silently does nothing,
* leaving the file fully allocated. So when holes will also be punched,
* preallocate at full size instead (write_sparse then punches the nulls). */
int opts = (inplace || preallocate_files) && sparse_files <= 0 ? DO_FALLOC_OPTIONS : 0;
int ret;
RETURN_ERROR_IF(dry_run, 0);
RETURN_ERROR_IF_RO_OR_LO;
@@ -1550,7 +1557,14 @@ OFF_T do_fallocate(int fd, OFF_T offset, OFF_T length)
return length;
return st.st_blocks * S_BLKSIZE;
}
return 0;
/* With FALLOC_FL_KEEP_SIZE the blocks for [0, length) are reserved even
* though the file size stays put. Return that reserved length (not 0) so
* the caller's preallocated_len is meaningful: write_sparse() needs it to
* choose do_punch_hole() over a plain lseek() when turning a null run into
* a hole, and the receiver uses it to trim any over-preallocation. (A
* stray 0 here, from 2019's switch to KEEP_SIZE, is why --preallocate
* --sparse stopped producing sparse files.) */
return length;
}
#endif

View File

@@ -27,6 +27,7 @@ int am_daemon = 0;
int am_chrooted = 0;
int modify_window = 0;
int preallocate_files = 0;
int sparse_files = 0;
int protect_args = 0;
int module_id = -1;
int relative_paths = 0;

View File

@@ -3,21 +3,28 @@
do_fallocate (--preallocate) and do_punch_hole (sparse writes).
These are receiver-side file operations the resolver restructure also touches.
Where the filesystem lacks fallocate/punch-hole the calls warn and the transfer
still completes, so the content assertions hold regardless; the coverage is
gained wherever the kernel supports them.
Content must survive everywhere; in addition, where the filesystem stores holes,
--preallocate --sparse must end up sparse (st_blocks below the apparent size).
That is a regression guard: do_fallocate() must report the preallocated length
so write_sparse() punches holes in the reserved extent instead of lseek'ing over
it -- a stray 0 there silently left the file fully allocated.
"""
import os
from rsyncfns import (
FROMDIR, TODIR,
assert_same, make_data_file, makepath, rmtree, run_rsync, test_skipped,
assert_same, make_data_file, makepath, rmtree, run_rsync, test_fail,
test_skipped,
)
src = FROMDIR
deep = os.path.join('d1', 'd2', 'd3', 'f')
def allocated(path):
return os.stat(path).st_blocks * 512
# --preallocate needs fallocate/posix_fallocate, and do_punch_hole needs
# FALLOC_FL_PUNCH_HOLE -- both Linux (and Cygwin) features. macOS, the *BSDs and
# Solaris build without preallocation and reject the option outright ("prealloc-
@@ -31,6 +38,40 @@ if run_rsync('-a', '--preallocate', f'{src}/', f'{TODIR}/',
check=False, capture_output=True).returncode != 0:
test_skipped("--preallocate not supported on this platform")
def fs_can_punch_holes():
"""True only where the kernel can deallocate blocks via FALLOC_FL_PUNCH_HOLE
-- the mechanism do_punch_hole uses for --sparse. A filesystem may report
seek-based sparseness yet still keep every block on a punch (e.g. where
rsync's punch falls back to writing zeros), so probe the real capability and
assert the hole only where it actually frees blocks."""
import ctypes
import ctypes.util
KEEP_SIZE, PUNCH_HOLE = 0x01, 0x02
p = src / 'punch-probe'
fd = -1
try:
libc = ctypes.CDLL(ctypes.util.find_library('c') or 'libc.so.6',
use_errno=True)
libc.fallocate.argtypes = [ctypes.c_int, ctypes.c_int,
ctypes.c_longlong, ctypes.c_longlong]
fd = os.open(p, os.O_CREAT | os.O_RDWR | os.O_TRUNC, 0o644)
os.write(fd, b'\xff' * 65536)
before = os.fstat(fd).st_blocks
ret = libc.fallocate(fd, PUNCH_HOLE | KEEP_SIZE, 0, 65536)
return ret == 0 and os.fstat(fd).st_blocks < before
except (OSError, AttributeError, ValueError):
return False
finally:
if fd >= 0:
os.close(fd)
try:
os.unlink(p)
except OSError:
pass
can_punch = fs_can_punch_holes()
def seed_plain(size=1_000_000):
rmtree(src)
@@ -55,9 +96,17 @@ run_rsync('-a', '--preallocate', f'{src}/', f'{TODIR}/')
assert_same(TODIR / deep, src / deep, label='--preallocate content')
# --- --preallocate --sparse on a holey file: do_fallocate + do_punch_hole ---
# rsync.1 promises sparse blocks for this combination where the FS supports
# holes. Assert it: do_fallocate reserves the whole extent, then the zero run
# must be punched back out (st_blocks well below the apparent size).
seed_holey()
run_rsync('-a', '--preallocate', '--sparse', f'{src}/', f'{TODIR}/')
assert_same(TODIR / deep, src / deep, label='--preallocate --sparse content')
if can_punch and allocated(TODIR / deep) >= os.path.getsize(TODIR / deep):
test_fail(f"--preallocate --sparse left the file fully allocated "
f"(allocated {allocated(TODIR / deep)} for a "
f"{os.path.getsize(TODIR / deep)}-byte file); the preallocated "
"extent's zero run was not punched into a hole")
# --- --inplace --sparse update that introduces a zero run: do_punch_hole ----
# (sparse_end's updating_basis_or_equiv branch punches the hole in place.)