From 1a20d82edccc8cfdd2ef800b8ba951151dcce609 Mon Sep 17 00:00:00 2001 From: Danish Prakash Date: Thu, 9 Oct 2025 13:20:13 +0530 Subject: [PATCH] libpod: replace listen with socket+bind for dual-stack port reservation This commit addresses two concerns. Bind dual stack when hostIP unless it is explicitly specified. Since we use listen(), this change resulted in blocked connections on stacks without matching DNAT rules (e.g. connecting to [::1] on an IPv4-only network) because the TCP handshake on the reservation socket would complete instead of returning ECONNREFUSED allowing the client to fallback to IPv4. Replacing listen() with raw socket() and bind() syscalls fixes this by allowing us to reserve this port without accepting connections; clients get ECONNREFUSED and fall back to IPv4 automatically, as is desired. Fixes: https://github.com/containers/netavark/issues/1338 Co-authored-by: Paul Holzinger Signed-off-by: Danish Prakash --- libpod/oci_util.go | 137 +++++++++++++++++-------------- test/system/252-quadlet.bats | 2 +- test/system/500-networking.bats | 24 +++++- test/system/helpers.network.bash | 84 ++----------------- test/system/helpers.t | 17 ---- 5 files changed, 104 insertions(+), 160 deletions(-) diff --git a/libpod/oci_util.go b/libpod/oci_util.go index d19addfa9f..bc316b5a74 100644 --- a/libpod/oci_util.go +++ b/libpod/oci_util.go @@ -3,16 +3,19 @@ package libpod import ( + "errors" "fmt" "net" "os" "regexp" + "strconv" "strings" "time" "github.com/sirupsen/logrus" "go.podman.io/common/libnetwork/types" "go.podman.io/podman/v6/libpod/define" + "golang.org/x/sys/unix" ) // Timeout before declaring that runtime has failed to kill a given @@ -32,9 +35,9 @@ func bindPorts(ports []types.PortMapping) ([]*os.File, error) { var files []*os.File sctpWarning := true for _, port := range ports { - isV6 := net.ParseIP(port.HostIP).To4() == nil - if port.HostIP == "" { - isV6 = false + isV6 := false + if port.HostIP != "" { + isV6 = net.ParseIP(port.HostIP).To4() == nil } protocols := strings.SplitSeq(port.Protocol, ",") for protocol := range protocols { @@ -42,7 +45,7 @@ func bindPorts(ports []types.PortMapping) ([]*os.File, error) { f, err := bindPort(protocol, port.HostIP, port.HostPort+i, isV6, &sctpWarning) if err != nil { // close all open ports in case of early error so we do not - // rely garbage collector to close them + // rely on the garbage collector to close them for _, f := range files { f.Close() } @@ -57,84 +60,96 @@ func bindPorts(ports []types.PortMapping) ([]*os.File, error) { return files, nil } +// bindPort reserves a port on the host using socket+bind without listen. +// Dual-stack bind by default unless hostIP is specified. func bindPort(protocol, hostIP string, port uint16, isV6 bool, sctpWarning *bool) (*os.File, error) { - var file *os.File switch protocol { - case "udp": - var ( - addr *net.UDPAddr - err error - ) - if isV6 { - addr, err = net.ResolveUDPAddr("udp6", fmt.Sprintf("[%s]:%d", hostIP, port)) - } else { - addr, err = net.ResolveUDPAddr("udp4", fmt.Sprintf("%s:%d", hostIP, port)) - } - if err != nil { - return nil, fmt.Errorf("cannot resolve the UDP address: %w", err) + case "tcp", "udp": + sockType := unix.SOCK_STREAM + if protocol == "udp" { + sockType = unix.SOCK_DGRAM } - proto := "udp4" - if isV6 { - proto = "udp6" - } - server, err := net.ListenUDP(proto, addr) + domain, sa, err := buildSockAddr(hostIP, port, isV6) if err != nil { - return nil, fmt.Errorf("cannot listen on the UDP port: %w", err) - } - file, err = server.File() - if err != nil { - return nil, fmt.Errorf("cannot get file for UDP socket: %w", err) - } - // close the listener - // note that this does not affect the fd, see the godoc for server.File() - err = server.Close() - if err != nil { - logrus.Warnf("Failed to close connection: %v", err) + return nil, err } - case "tcp": - var ( - addr *net.TCPAddr - err error - ) - if isV6 { - addr, err = net.ResolveTCPAddr("tcp6", fmt.Sprintf("[%s]:%d", hostIP, port)) - } else { - addr, err = net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", hostIP, port)) - } + fd, err := unix.Socket(domain, sockType|unix.SOCK_CLOEXEC, 0) if err != nil { - return nil, fmt.Errorf("cannot resolve the TCP address: %w", err) + // If hostIP == "" and IPv6 is not supported, fall back to IPv4 + if hostIP == "" && errors.Is(err, unix.EAFNOSUPPORT) { + return bindPortV4Fallback(protocol, sockType, port) + } + return nil, fmt.Errorf("cannot create socket for %s port %d: %w", protocol, port, err) } - proto := "tcp4" - if isV6 { - proto = "tcp6" + if err := setupSocketOpts(fd, domain, hostIP); err != nil { + unix.Close(fd) + return nil, err } - server, err := net.ListenTCP(proto, addr) - if err != nil { - return nil, fmt.Errorf("cannot listen on the TCP port: %w", err) - } - file, err = server.File() - if err != nil { - return nil, fmt.Errorf("cannot get file for TCP socket: %w", err) - } - // close the listener - // note that this does not affect the fd, see the godoc for server.File() - err = server.Close() - if err != nil { - logrus.Warnf("Failed to close connection: %v", err) + + if err := unix.Bind(fd, sa); err != nil { + unix.Close(fd) + return nil, fmt.Errorf("cannot bind %s port %s: %w", protocol, net.JoinHostPort(hostIP, strconv.FormatUint(uint64(port), 10)), err) } + return os.NewFile(uintptr(fd), fmt.Sprintf("reservation-%s-%d", protocol, port)), nil + case "sctp": if *sctpWarning { logrus.Info("Port reservation for SCTP is not supported") *sctpWarning = false } + return nil, nil default: return nil, fmt.Errorf("unknown protocol %s", protocol) } - return file, nil +} + +func buildSockAddr(hostIP string, port uint16, isV6 bool) (int, unix.Sockaddr, error) { + // default behaviour when hostIP == "" is to bind dual-stack + // if hostIP != ""; determine the stack using the address specified + if hostIP == "" { + return unix.AF_INET6, &unix.SockaddrInet6{Port: int(port)}, nil + } + ip := net.ParseIP(hostIP) + if ip == nil { + return 0, nil, fmt.Errorf("invalid IP address: %s", hostIP) + } + if isV6 { + sa := &unix.SockaddrInet6{Port: int(port)} + copy(sa.Addr[:], ip.To16()) + return unix.AF_INET6, sa, nil + } + sa := &unix.SockaddrInet4{Port: int(port)} + copy(sa.Addr[:], ip.To4()) + return unix.AF_INET, sa, nil +} + +func setupSocketOpts(fd, domain int, hostIP string) error { + if domain == unix.AF_INET6 { + v6only := 1 + if hostIP == "" { + v6only = 0 + } + if err := unix.SetsockoptInt(fd, unix.IPPROTO_IPV6, unix.IPV6_V6ONLY, v6only); err != nil { + return fmt.Errorf("cannot set IPV6_V6ONLY: %w", err) + } + } + return nil +} + +func bindPortV4Fallback(protocol string, sockType int, port uint16) (*os.File, error) { + fd, err := unix.Socket(unix.AF_INET, sockType|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("cannot create socket for %s port %d: %w", protocol, port, err) + } + if err := unix.Bind(fd, &unix.SockaddrInet4{Port: int(port)}); err != nil { + unix.Close(fd) + return nil, fmt.Errorf("cannot bind %s port %s: %w", protocol, net.JoinHostPort("", strconv.FormatUint(uint64(port), 10)), err) + } + return os.NewFile(uintptr(fd), fmt.Sprintf("reservation-%s-%d", protocol, port)), nil } func getOCIRuntimeError(name, runtimeMsg string) error { diff --git a/test/system/252-quadlet.bats b/test/system/252-quadlet.bats index a0fb0598ee..fb08979924 100644 --- a/test/system/252-quadlet.bats +++ b/test/system/252-quadlet.bats @@ -1343,7 +1343,7 @@ EOF echo "$_LOG_PROMPT journalctl -u $QUADLET_SERVICE_NAME" run -0 journalctl -eu $QUADLET_SERVICE_NAME - assert "$output" =~ "$port: bind: address already in use" "journal contains the real podman start error" + assert "$output" =~ "$port.*ddress already in use" "journal contains the real podman start error" kill "$socat_pid" } diff --git a/test/system/500-networking.bats b/test/system/500-networking.bats index e3cbd23682..f0755862e1 100644 --- a/test/system/500-networking.bats +++ b/test/system/500-networking.bats @@ -341,9 +341,11 @@ function run_pod_etc_hosts_test(){ # flush the firewall rule here to break port forwarding nft delete table inet netavark || true - # check that we cannot curl (timeout after 1 sec) + # check that we cannot curl: expect either connection refused (7) + # or timeout (28) depending on whether the port reservation socket + # is in listen state run curl --max-time 1 -s $SERVER/index.txt - assert $status -eq 28 "curl did not time out" + assert "$status" -ne 0 "curl should fail after flushing nftables" fi # reload the network to recreate the nftables rules @@ -1138,4 +1140,22 @@ EOF run_podman network rm $netname2 } +# Refer https://github.com/containers/netavark/issues/1338 +# bats test_tags=ci:parallel +@test "podman run - dual-stack conflicts with explicit wildcards" { + myport=$(random_free_port) + cname1="c1-$(safename)" + + run_podman run -d --name $cname1 -p $myport:8080 $IMAGE sleep inf + cid1="$output" + + run_podman 126 run --rm -p 0.0.0.0:$myport:8080 $IMAGE true + assert "$output" =~ "ddress already in use" "explicit IPv4 wildcard should conflict with dual-stack" + + run_podman 126 run --rm -p [::]:$myport:8080 $IMAGE true + assert "$output" =~ "ddress already in use" "explicit IPv6 wildcard should conflict with dual-stack" + + run_podman rm -f -t0 $cid1 +} + # vim: filetype=sh diff --git a/test/system/helpers.network.bash b/test/system/helpers.network.bash index 5eecb7c230..ebfffcaa64 100644 --- a/test/system/helpers.network.bash +++ b/test/system/helpers.network.bash @@ -30,58 +30,6 @@ function skip_if_no_ipv6() { fi } -### procfs access ############################################################## - -# ipv6_to_procfs() - RFC 5952 IPv6 address text representation to procfs format -# $1: Address in any notation described by RFC 5952 -function ipv6_to_procfs() { - local addr="${1}" - - # Add leading zero if missing - case ${addr} in - "::"*) addr=0"${addr}" ;; - esac - - # Double colon can mean any number of all-zero fields. Expand to fill - # as many colons as are missing. (This will not be a valid IPv6 form, - # but we don't need it for long). E.g., 0::1 -> 0:::::::1 - case ${addr} in - *"::"*) - # All the colons in the address - local colons - colons=$(tr -dc : <<<$addr) - # subtract those from a string of eight colons; this gives us - # a string of two to six colons... - local pad - pad=$(sed -e "s/$colons//" <<<":::::::") - # ...which we then inject in place of the double colon. - addr=$(sed -e "s/::/::$pad/" <<<$addr) - ;; - esac - - # Print as a contiguous string of zero-filled 16-bit words - # (The additional ":" below is needed because 'read -d x' actually - # means "x is a TERMINATOR, not a delimiter") - local group - while read -d : group; do - printf "%04X" "0x${group:-0}" - done <<<"${addr}:" -} - -# __ipv4_to_procfs() - Print bytes in hexadecimal notation reversing arguments -# $@: IPv4 address as separate bytes -function __ipv4_to_procfs() { - printf "%02X%02X%02X%02X" ${4} ${3} ${2} ${1} -} - -# ipv4_to_procfs() - IPv4 address representation to big-endian procfs format -# $1: Text representation of IPv4 address -function ipv4_to_procfs() { - IFS='.' read -r o1 o2 o3 o4 <<< $1 - __ipv4_to_procfs $o1 $o2 $o3 $o4 -} - - ### Addresses, Routes, Links ################################################### # ipv4_get_addr_global() - Print first global IPv4 address reported by netlink @@ -259,7 +207,7 @@ function unreserve_port() { local port=$1 local lockfile=$PORT_LOCK_DIR/$port - -e $lockfile || die "Cannot unreserve non-reserved port $port" + test -e $lockfile || die "Cannot unreserve non-reserved port $port" assert "$(< $lockfile)" = "$BATS_SUITE_TEST_NUMBER" \ "Port $port is not reserved by this test" rm -f $lockfile @@ -348,32 +296,10 @@ function port_is_bound() { local proto="tcp" fi - # /proc/net/tcp is insufficient: it does not show some rootless ports. - # ss does, so check it first. - run ss -${proto:0:1}nlH sport = $port - if [[ -n "$output" ]]; then - return - fi - - port=$(printf %04X ${port}) - case "${address}" in - *":"*) - grep -e "^[^:]*: $(ipv6_to_procfs "${address}"):${port} .*" \ - -e "^[^:]*: $(ipv6_to_procfs "::0"):${port} .*" \ - -q "/proc/net/${proto}6" - ;; - *"."*) - grep -e "^[^:]*: $(ipv4_to_procfs "${address}"):${port}" \ - -e "^[^:]*: $(ipv4_to_procfs "0.0.0.0"):${port}" \ - -e "^[^:]*: $(ipv4_to_procfs "127.0.0.1"):${port}" \ - -q "/proc/net/${proto}" - ;; - *) - # No address: check both IPv4 and IPv6, for any bound address - grep "^[^:]*: [^:]*:${port} .*" -q "/proc/net/${proto}6" || \ - grep "^[^:]*: [^:]*:${port} .*" -q "/proc/net/${proto}" - ;; - esac + # Use ss to check the local ports + run -0 ss -${proto:0:1}nlH state all sport = $port + # grep for exact address:port match or for "*:port" which means the port is bound to all addresses and hence bound for any address + grep -q "$address:$port" <<<"$output" || grep -q "*:$port" <<<"$output" } # port_is_free() - Check if TCP or UDP port is free to bind for a given address diff --git a/test/system/helpers.t b/test/system/helpers.t index 8c4d892369..74f24eef3b 100755 --- a/test/system/helpers.t +++ b/test/system/helpers.t @@ -232,23 +232,6 @@ check_result "$found" "16700" "random_free_port" # END random_free_port ############################################################################### -# BEGIN ipv6_to_procfs - -# Table of IPv6 short forms and their procfs equivalents. For readability, -# spaces separate each 16-bit word. Spaces are removed when testing. -table=" -2b06::1 | 2B06 0000 0000 0000 0000 0000 0000 0001 -::1 | 0000 0000 0000 0000 0000 0000 0000 0001 -0::1 | 0000 0000 0000 0000 0000 0000 0000 0001 -" - -while read shortform expect; do - actual=$(ipv6_to_procfs $shortform) - check_result "$actual" "${expect// }" "ipv6_to_procfs $shortform" -done < <(parse_table "$table") - -# END ipv6_to_procfs -############################################################################### # BEGIN subnet_in_use ... because that's complicated # Override ip command