libpod: replace listen with socket+bind for dual-stack port reservation

This commit addresses two concerns.

Bind dual stack when hostIP unless it is explicitly specified. Since we
use listen(), this change resulted in blocked connections on stacks
without matching DNAT rules (e.g. connecting to [::1] on an IPv4-only
network) because the TCP handshake on the reservation socket would complete
instead of returning ECONNREFUSED allowing the client to fallback to IPv4.

Replacing listen() with raw socket() and bind() syscalls fixes this by
allowing us to reserve this port without accepting connections; clients
get ECONNREFUSED and fall back to IPv4 automatically, as is desired.

Fixes: https://github.com/containers/netavark/issues/1338

Co-authored-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Danish Prakash <contact@danishpraka.sh>
This commit is contained in:
Danish Prakash
2025-10-09 13:20:13 +05:30
parent ef2117a158
commit 1a20d82edc
5 changed files with 104 additions and 160 deletions

View File

@@ -3,16 +3,19 @@
package libpod
import (
"errors"
"fmt"
"net"
"os"
"regexp"
"strconv"
"strings"
"time"
"github.com/sirupsen/logrus"
"go.podman.io/common/libnetwork/types"
"go.podman.io/podman/v6/libpod/define"
"golang.org/x/sys/unix"
)
// Timeout before declaring that runtime has failed to kill a given
@@ -32,9 +35,9 @@ func bindPorts(ports []types.PortMapping) ([]*os.File, error) {
var files []*os.File
sctpWarning := true
for _, port := range ports {
isV6 := net.ParseIP(port.HostIP).To4() == nil
if port.HostIP == "" {
isV6 = false
isV6 := false
if port.HostIP != "" {
isV6 = net.ParseIP(port.HostIP).To4() == nil
}
protocols := strings.SplitSeq(port.Protocol, ",")
for protocol := range protocols {
@@ -42,7 +45,7 @@ func bindPorts(ports []types.PortMapping) ([]*os.File, error) {
f, err := bindPort(protocol, port.HostIP, port.HostPort+i, isV6, &sctpWarning)
if err != nil {
// close all open ports in case of early error so we do not
// rely garbage collector to close them
// rely on the garbage collector to close them
for _, f := range files {
f.Close()
}
@@ -57,84 +60,96 @@ func bindPorts(ports []types.PortMapping) ([]*os.File, error) {
return files, nil
}
// bindPort reserves a port on the host using socket+bind without listen.
// Dual-stack bind by default unless hostIP is specified.
func bindPort(protocol, hostIP string, port uint16, isV6 bool, sctpWarning *bool) (*os.File, error) {
var file *os.File
switch protocol {
case "udp":
var (
addr *net.UDPAddr
err error
)
if isV6 {
addr, err = net.ResolveUDPAddr("udp6", fmt.Sprintf("[%s]:%d", hostIP, port))
} else {
addr, err = net.ResolveUDPAddr("udp4", fmt.Sprintf("%s:%d", hostIP, port))
}
if err != nil {
return nil, fmt.Errorf("cannot resolve the UDP address: %w", err)
case "tcp", "udp":
sockType := unix.SOCK_STREAM
if protocol == "udp" {
sockType = unix.SOCK_DGRAM
}
proto := "udp4"
if isV6 {
proto = "udp6"
}
server, err := net.ListenUDP(proto, addr)
domain, sa, err := buildSockAddr(hostIP, port, isV6)
if err != nil {
return nil, fmt.Errorf("cannot listen on the UDP port: %w", err)
}
file, err = server.File()
if err != nil {
return nil, fmt.Errorf("cannot get file for UDP socket: %w", err)
}
// close the listener
// note that this does not affect the fd, see the godoc for server.File()
err = server.Close()
if err != nil {
logrus.Warnf("Failed to close connection: %v", err)
return nil, err
}
case "tcp":
var (
addr *net.TCPAddr
err error
)
if isV6 {
addr, err = net.ResolveTCPAddr("tcp6", fmt.Sprintf("[%s]:%d", hostIP, port))
} else {
addr, err = net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", hostIP, port))
}
fd, err := unix.Socket(domain, sockType|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, fmt.Errorf("cannot resolve the TCP address: %w", err)
// If hostIP == "" and IPv6 is not supported, fall back to IPv4
if hostIP == "" && errors.Is(err, unix.EAFNOSUPPORT) {
return bindPortV4Fallback(protocol, sockType, port)
}
return nil, fmt.Errorf("cannot create socket for %s port %d: %w", protocol, port, err)
}
proto := "tcp4"
if isV6 {
proto = "tcp6"
if err := setupSocketOpts(fd, domain, hostIP); err != nil {
unix.Close(fd)
return nil, err
}
server, err := net.ListenTCP(proto, addr)
if err != nil {
return nil, fmt.Errorf("cannot listen on the TCP port: %w", err)
}
file, err = server.File()
if err != nil {
return nil, fmt.Errorf("cannot get file for TCP socket: %w", err)
}
// close the listener
// note that this does not affect the fd, see the godoc for server.File()
err = server.Close()
if err != nil {
logrus.Warnf("Failed to close connection: %v", err)
if err := unix.Bind(fd, sa); err != nil {
unix.Close(fd)
return nil, fmt.Errorf("cannot bind %s port %s: %w", protocol, net.JoinHostPort(hostIP, strconv.FormatUint(uint64(port), 10)), err)
}
return os.NewFile(uintptr(fd), fmt.Sprintf("reservation-%s-%d", protocol, port)), nil
case "sctp":
if *sctpWarning {
logrus.Info("Port reservation for SCTP is not supported")
*sctpWarning = false
}
return nil, nil
default:
return nil, fmt.Errorf("unknown protocol %s", protocol)
}
return file, nil
}
func buildSockAddr(hostIP string, port uint16, isV6 bool) (int, unix.Sockaddr, error) {
// default behaviour when hostIP == "" is to bind dual-stack
// if hostIP != ""; determine the stack using the address specified
if hostIP == "" {
return unix.AF_INET6, &unix.SockaddrInet6{Port: int(port)}, nil
}
ip := net.ParseIP(hostIP)
if ip == nil {
return 0, nil, fmt.Errorf("invalid IP address: %s", hostIP)
}
if isV6 {
sa := &unix.SockaddrInet6{Port: int(port)}
copy(sa.Addr[:], ip.To16())
return unix.AF_INET6, sa, nil
}
sa := &unix.SockaddrInet4{Port: int(port)}
copy(sa.Addr[:], ip.To4())
return unix.AF_INET, sa, nil
}
func setupSocketOpts(fd, domain int, hostIP string) error {
if domain == unix.AF_INET6 {
v6only := 1
if hostIP == "" {
v6only = 0
}
if err := unix.SetsockoptInt(fd, unix.IPPROTO_IPV6, unix.IPV6_V6ONLY, v6only); err != nil {
return fmt.Errorf("cannot set IPV6_V6ONLY: %w", err)
}
}
return nil
}
func bindPortV4Fallback(protocol string, sockType int, port uint16) (*os.File, error) {
fd, err := unix.Socket(unix.AF_INET, sockType|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, fmt.Errorf("cannot create socket for %s port %d: %w", protocol, port, err)
}
if err := unix.Bind(fd, &unix.SockaddrInet4{Port: int(port)}); err != nil {
unix.Close(fd)
return nil, fmt.Errorf("cannot bind %s port %s: %w", protocol, net.JoinHostPort("", strconv.FormatUint(uint64(port), 10)), err)
}
return os.NewFile(uintptr(fd), fmt.Sprintf("reservation-%s-%d", protocol, port)), nil
}
func getOCIRuntimeError(name, runtimeMsg string) error {

View File

@@ -1343,7 +1343,7 @@ EOF
echo "$_LOG_PROMPT journalctl -u $QUADLET_SERVICE_NAME"
run -0 journalctl -eu $QUADLET_SERVICE_NAME
assert "$output" =~ "$port: bind: address already in use" "journal contains the real podman start error"
assert "$output" =~ "$port.*ddress already in use" "journal contains the real podman start error"
kill "$socat_pid"
}

View File

@@ -341,9 +341,11 @@ function run_pod_etc_hosts_test(){
# flush the firewall rule here to break port forwarding
nft delete table inet netavark || true
# check that we cannot curl (timeout after 1 sec)
# check that we cannot curl: expect either connection refused (7)
# or timeout (28) depending on whether the port reservation socket
# is in listen state
run curl --max-time 1 -s $SERVER/index.txt
assert $status -eq 28 "curl did not time out"
assert "$status" -ne 0 "curl should fail after flushing nftables"
fi
# reload the network to recreate the nftables rules
@@ -1138,4 +1140,22 @@ EOF
run_podman network rm $netname2
}
# Refer https://github.com/containers/netavark/issues/1338
# bats test_tags=ci:parallel
@test "podman run - dual-stack conflicts with explicit wildcards" {
myport=$(random_free_port)
cname1="c1-$(safename)"
run_podman run -d --name $cname1 -p $myport:8080 $IMAGE sleep inf
cid1="$output"
run_podman 126 run --rm -p 0.0.0.0:$myport:8080 $IMAGE true
assert "$output" =~ "ddress already in use" "explicit IPv4 wildcard should conflict with dual-stack"
run_podman 126 run --rm -p [::]:$myport:8080 $IMAGE true
assert "$output" =~ "ddress already in use" "explicit IPv6 wildcard should conflict with dual-stack"
run_podman rm -f -t0 $cid1
}
# vim: filetype=sh

View File

@@ -30,58 +30,6 @@ function skip_if_no_ipv6() {
fi
}
### procfs access ##############################################################
# ipv6_to_procfs() - RFC 5952 IPv6 address text representation to procfs format
# $1: Address in any notation described by RFC 5952
function ipv6_to_procfs() {
local addr="${1}"
# Add leading zero if missing
case ${addr} in
"::"*) addr=0"${addr}" ;;
esac
# Double colon can mean any number of all-zero fields. Expand to fill
# as many colons as are missing. (This will not be a valid IPv6 form,
# but we don't need it for long). E.g., 0::1 -> 0:::::::1
case ${addr} in
*"::"*)
# All the colons in the address
local colons
colons=$(tr -dc : <<<$addr)
# subtract those from a string of eight colons; this gives us
# a string of two to six colons...
local pad
pad=$(sed -e "s/$colons//" <<<":::::::")
# ...which we then inject in place of the double colon.
addr=$(sed -e "s/::/::$pad/" <<<$addr)
;;
esac
# Print as a contiguous string of zero-filled 16-bit words
# (The additional ":" below is needed because 'read -d x' actually
# means "x is a TERMINATOR, not a delimiter")
local group
while read -d : group; do
printf "%04X" "0x${group:-0}"
done <<<"${addr}:"
}
# __ipv4_to_procfs() - Print bytes in hexadecimal notation reversing arguments
# $@: IPv4 address as separate bytes
function __ipv4_to_procfs() {
printf "%02X%02X%02X%02X" ${4} ${3} ${2} ${1}
}
# ipv4_to_procfs() - IPv4 address representation to big-endian procfs format
# $1: Text representation of IPv4 address
function ipv4_to_procfs() {
IFS='.' read -r o1 o2 o3 o4 <<< $1
__ipv4_to_procfs $o1 $o2 $o3 $o4
}
### Addresses, Routes, Links ###################################################
# ipv4_get_addr_global() - Print first global IPv4 address reported by netlink
@@ -259,7 +207,7 @@ function unreserve_port() {
local port=$1
local lockfile=$PORT_LOCK_DIR/$port
-e $lockfile || die "Cannot unreserve non-reserved port $port"
test -e $lockfile || die "Cannot unreserve non-reserved port $port"
assert "$(< $lockfile)" = "$BATS_SUITE_TEST_NUMBER" \
"Port $port is not reserved by this test"
rm -f $lockfile
@@ -348,32 +296,10 @@ function port_is_bound() {
local proto="tcp"
fi
# /proc/net/tcp is insufficient: it does not show some rootless ports.
# ss does, so check it first.
run ss -${proto:0:1}nlH sport = $port
if [[ -n "$output" ]]; then
return
fi
port=$(printf %04X ${port})
case "${address}" in
*":"*)
grep -e "^[^:]*: $(ipv6_to_procfs "${address}"):${port} .*" \
-e "^[^:]*: $(ipv6_to_procfs "::0"):${port} .*" \
-q "/proc/net/${proto}6"
;;
*"."*)
grep -e "^[^:]*: $(ipv4_to_procfs "${address}"):${port}" \
-e "^[^:]*: $(ipv4_to_procfs "0.0.0.0"):${port}" \
-e "^[^:]*: $(ipv4_to_procfs "127.0.0.1"):${port}" \
-q "/proc/net/${proto}"
;;
*)
# No address: check both IPv4 and IPv6, for any bound address
grep "^[^:]*: [^:]*:${port} .*" -q "/proc/net/${proto}6" || \
grep "^[^:]*: [^:]*:${port} .*" -q "/proc/net/${proto}"
;;
esac
# Use ss to check the local ports
run -0 ss -${proto:0:1}nlH state all sport = $port
# grep for exact address:port match or for "*:port" which means the port is bound to all addresses and hence bound for any address
grep -q "$address:$port" <<<"$output" || grep -q "*:$port" <<<"$output"
}
# port_is_free() - Check if TCP or UDP port is free to bind for a given address

View File

@@ -232,23 +232,6 @@ check_result "$found" "16700" "random_free_port"
# END random_free_port
###############################################################################
# BEGIN ipv6_to_procfs
# Table of IPv6 short forms and their procfs equivalents. For readability,
# spaces separate each 16-bit word. Spaces are removed when testing.
table="
2b06::1 | 2B06 0000 0000 0000 0000 0000 0000 0001
::1 | 0000 0000 0000 0000 0000 0000 0000 0001
0::1 | 0000 0000 0000 0000 0000 0000 0000 0001
"
while read shortform expect; do
actual=$(ipv6_to_procfs $shortform)
check_result "$actual" "${expect// }" "ipv6_to_procfs $shortform"
done < <(parse_table "$table")
# END ipv6_to_procfs
###############################################################################
# BEGIN subnet_in_use ... because that's complicated
# Override ip command