Linux内核网络的连接跟踪conntrack简单分析
连接跟踪conntrack的基本信息
Linux内核的conntrack模块是网络过滤子系统netfilter的重要组成部分,它是网络地址转换NAT
和防火墙等网络功能的基础。Linux内核中一个连接(可以为UDP
或TCP
,或其他)的建立是一个冗长耗时的过程,例如,该连接需要经过内核过滤规则(对应防火墙的规则)或端口转发等规则的确认,才能最终成功建立。当连接建立后,为了让后续数据量庞大、数量众多的网络包能够快速通过检测(从而降低Linux内核网络的负载),跟踪连接是十分必要的。为了跟踪一个已存在的网络连接,Linux内核(版本为6.6.67
)使用了以下结构体作为一个连接的指纹:
/* include/net/netfilter/nf_conntrack_tuple.h */
/* This contains the information to distinguish a connection. */
struct nf_conntrack_tuple {struct nf_conntrack_man src;/* These are the parts of the tuple which are fixed. */struct {union nf_inet_addr u3; union {/* Add other protocols here. */__be16 all;struct {__be16 port;} tcp;struct {__be16 port;} udp;
......
}
可以看到,它包含了一个连接的重要信息:源和目标IP地址、源和目标端口号等。对于NAT
,它还包含转换的IP地址和端口号等。该nf_conntrack_tuple
结构体在内核函数nf_ct_get_tuple
中被填充:
/* net/netfilter/nf_conntrack_core.c */
static bool
nf_ct_get_tuple(const struct sk_buff *skb,unsigned int nhoff,unsigned int dataoff,u_int16_t l3num,u_int8_t protonum,struct net *net,struct nf_conntrack_tuple *tuple)
{unsigned int size;const __be32 *ap; __be32 _addrs[8];memset(tuple, 0, sizeof(*tuple));
之后通过 __nf_conntrack_find_get
函数将该结构体映射到struct nf_conn
指针;可以把这一过程简化成把nf_conntrack_tuple
结构体作为一个哈希表的键值,查找得到struct nf_conn
指针:
/* net/netfilter/nf_conntrack_core.c */
/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,const struct nf_conntrack_tuple *tuple, u32 hash)
{struct nf_conntrack_tuple_hash *h;struct nf_conn *ct; h = ____nf_conntrack_find(net, zone, tuple, hash);......ct = nf_ct_tuplehash_to_ctrack(h);
最后,结构体struct nf_conn
包含了一个已建立的(严格地说,也包含待建立的)连接的状态信息:
/* include/net/netfilter/nf_conntrack.h */
struct nf_conn {struct nf_conntrack ct_general;spinlock_t lock;/* jiffies32 when this ct is considered dead */u32 timeout;#ifdef CONFIG_NF_CONNTRACK_ZONESstruct nf_conntrack_zone zone;
#endif/* XXX should I move this to the tail ? - Y.K *//* These are my tuples; original and reply */struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];/* Have we seen traffic both ways yet? (bitset) */unsigned long status;
这里我们重点关注timeout/status
;其中timeout
以jiffies
为单位,表示该连接跟踪失效的时间;status
则提供了该连接的状态比特标志位等信息。
UDP的连接状态跟踪
笔者为了加深对conntrack
的了解,修改了busybox的代码,在其中增加了绑定本地端口的功能:
diff --git a/libbb/xconnect.c b/libbb/xconnect.c
index 0e0b247..6456c65 100644
--- a/libbb/xconnect.c
+++ b/libbb/xconnect.c
@@ -369,6 +369,25 @@ int FAST_FUNC xsocket_type(len_and_sockaddr **lsap, int family, int sock_type)lsa = xzalloc(LSA_LEN_SIZE + len);lsa->len = len;lsa->u.sa.sa_family = family;
+
+ /* bind to local port number for IPv4/IPv6 */
+ if (family == AF_INET || family == AF_INET6) {
+ int pno = -1;
+ const char * lport = getenv("BB_PORTNO");
+ if (lport && lport[0])
+ pno = (int) strtol(lport, NULL, 0);
+ if (pno > 0 && pno < 65536) {
+ if (family == AF_INET) {
+ struct sockaddr_in * addr;
+ addr = (struct sockaddr_in *) &(lsa->u.sa);
+ addr->sin_port = htons((unsigned short) pno);
+ } else {
+ struct sockaddr_in6 * addr;
+ addr = (struct sockaddr_in6 *) &(lsa->u.sa);
+ addr->sin6_port = htons((unsigned short) pno);
+ }
+ }
+ }*lsap = lsa;return fd;}
这样,通过配置环境变量BB_PORTNO
固定为4321
,可以强制nslookup
命令行工具多次调用时,使用同一端口:
root@localhost:~# export BB_PORTNO=4321
root@localhost:~# nslookup www.baidu.com 223.5.5.5
Server: 223.5.5.5
Address: 223.5.5.5:53

Non-authoritative answer:
www.baidu.com canonical name = www.a.shifen.com
Name: www.a.shifen.com
Address: 223.109.82.16
Name: www.a.shifen.com
Address: 223.109.82.212
以上命令在PC侧执行;此时,在路由器设备上(笔者使用了树莓派做软路由),可以通过conntrack
命令行工具查看已建立的UDP
连接信息:
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 49 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=1 bytes=90 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=2 bytes=266 mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 46 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=1 bytes=90 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=2 bytes=266 mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 176 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=2 bytes=180 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=4 bytes=532 [ASSURED] mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 174 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=3 bytes=270 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=6 bytes=798 [ASSURED] mark=0 use=1
其中,17为网络协议编号,对应UDP
;之后的数值单位为秒,即该连接跟踪在多少秒后超时。超时后,DNS
服务器223.5.5.5
的回应不会被软路由NAT转发。注意到,一开始该UDP
连接的超时时间分别为49秒和46秒;但之后变成了176秒,这是笔者在PC上多次执行nslookup www.baidu.com 223.5.5.5
命令的结果;简单地说,当一个UDP
连接有了初次的回应后,它的超时时间会变成60秒;当有多次回应后,超时时间会变成180秒。这一变化过程下面有相关说明。
UDP连接的状态的内核调试
笔者编写了一个简单的bpftrace脚本,用于跟踪UDP
的连接状态信息:
#!/usr/bin/bpftrace#include <net/netfilter/nf_conntrack.h>kprobe:nf_conntrack_udp_packet {$c = (struct nf_conn *) arg0;printf("%8d.%06d: PID: %d, comm: %s, nf_conntrack_udp_packet(0x%lx, 0x%lx, 0x%lx), status: 0x%x, timeout: %u",elapsed / 1000000, elapsed % 1000000, pid, comm, arg0, arg1, arg2, $c->status, $c->timeout);print(kstack);
}kretprobe:__nf_conntrack_alloc {printf("%8d.%06d: PID: %d, comm: %s, __nf_conntrack_alloc has returned: 0x%lx",elapsed / 1000000, elapsed % 1000000, pid, comm, retval);print(kstack);
}kprobe:nf_conntrack_free {printf("%8d.%06d: PID: %d, comm: %s, nf_conntrack_free(0x%lx)",elapsed / 1000000, elapsed % 1000000, pid, comm, arg0);print(kstack);
}
使用该脚本对这一过程进行调试,得到的结果如下(调试结果有精简):
4675.050648: PID: 0, comm: swapper/0, __nf_conntrack_alloc has returned: 0xffffff8006469200init_conntrack.isra.0+976nf_conntrack_in+912ipv4_conntrack_in+24nf_hook_slow+72br_nf_pre_routing+444br_handle_frame+404__netif_receive_skb_core.constprop.0+500__netif_receive_skb_one_core+44process_backlog+168__napi_poll.constprop.0+56net_rx_action+344handle_softirqs+352__softirqentry_text_start+20____do_softirq+164675.180752: PID: 0, comm: swapper/0, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff80050e0e00, 0x14), status: 0x0, timeout: 0nf_conntrack_udp_packet+0ipv4_conntrack_in+24nf_hook_slow+72br_nf_pre_routing+444br_handle_frame+404__netif_receive_skb_core.constprop.0+500__netif_receive_skb_one_core+44process_backlog+168__napi_poll.constprop.0+56net_rx_action+344handle_softirqs+3524681.793946: PID: 2774, comm: bpftrace, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681b00, 0x14), status: 0x198, timeout: 372070nf_conntrack_udp_packet+0ipv4_conntrack_in+24nf_hook_slow+72ip_rcv+92__netif_receive_skb_one_core+72process_backlog+168__napi_poll.constprop.0+56net_rx_action+344handle_softirqs+3524682.189260: PID: 2774, comm: bpftrace, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681600, 0x14), status: 0x19a, timeout: 372071nf_conntrack_udp_packet+0ipv4_conntrack_in+24nf_hook_slow+72ip_rcv+92__netif_receive_skb_one_core+72process_backlog+168__napi_poll.constprop.0+56net_rx_action+344handle_softirqs+352
当一个连接生成时,会调用__nf_conntrack_alloc
函数分配连接跟踪结构体nf_conn
。函数nf_conntrack_udp_packet
用于检查并更新一个UDP
连接的跟踪信息;第一次调用时,可以看到nf_conn
中的status
和timeout
都为0,此时会默认使用以下代码更新跟踪信息:
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
注意,timeouts[UDP_CT_UNREPLIED]
默认值为 30*HZ
,但openwrt
系统将之配置为60秒:
root@OpenWrt:~# cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout
60
root@OpenWrt:~# cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream
180
针对笔者使用的树莓派设备,使能了内核选项CONFIG_HZ_100=y
,那么HZ
值为100;上面的调试结果:status: 0x198, timeout: 372070
,表明该UDP
连接跟踪会在系统启动的第3720.7
秒后超时失效。可以确定,该UDP
连接是树莓派设备启动的第3720.7 - 60
秒,即第3660.7秒时从PC机上收到的(此时树莓派启动了约一小时)。下面会有数据的变化与此印证。此时,该UDP连接的状态位为0x198,对应着:
-----------------------------------------------
Value [0x198] (0x198, 408):
  28   24   20   16   12    8    4    0
0000 0000 0000 0000 0000 0001 1001 1000
31   27   23   19   15   11    7    3

/* Connection is confirmed: originating packet has left box */
IPS_CONFIRMED_BIT = 3,
IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT),

/* Connection needs src nat in orig dir. This bit never changed. */
IPS_SRC_NAT_BIT = 4,
IPS_SRC_NAT = (1 << IPS_SRC_NAT_BIT)
上面调用了nf_conntrack_udp_packet
函数两次,分别对应对DNS 223.5.5.5
服务器的一收一发,连接已确认,第3位比特会置1。
下面笔者再次(即第二次)在PC机上执行了nslookup www.baidu.com 223.5.5.5
,但连接跟踪信息的超时时间没有变化,仍是系统启动的第3720.7
秒。此时,状态位由之前的0x198
变为0x19a
,即第1位置1(其实是第二次调用nf_conntrack_udp_packet
函数返回后的状态值):
/* We've seen packets both ways: bit 1 set. Can be set, not unset. */IPS_SEEN_REPLY_BIT = 1,IPS_SEEN_REPLY = (1 << IPS_SEEN_REPLY_BIT),
第二次执行nslookup
的内核调试结果如下:
21181.259131: PID: 0, comm: swapper/0, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681e00, 0x14), status: 0x19a, timeout: 372071nf_conntrack_udp_packet+0ipv4_conntrack_in+24nf_hook_slow+72br_nf_pre_routing+444br_handle_frame+404__netif_receive_skb_core.constprop.0+500__netif_receive_skb_one_core+44process_backlog+168__napi_poll.constprop.0+56net_rx_action+344handle_softirqs+35221187.751231: PID: 308, comm: kworker/u13:1, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff80065e6700, 0x14), status: 0x19e, timeout: 385721nf_conntrack_udp_packet+0ipv4_conntrack_in+24nf_hook_slow+72ip_rcv+92__netif_receive_skb_one_core+72process_backlog+168__napi_poll.constprop.0+56net_rx_action+344handle_softirqs+352
除了status
中的比特位2置1外:
/* Conntrack should never be early-expired. */IPS_ASSURED_BIT = 2,IPS_ASSURED = (1 << IPS_ASSURED_BIT),
该UDP跟踪信息的超时时间由原来的372071
变成了385721
,二者相差了13650 jiffies
,对应着136.5秒;也就是说,内核把这个UDP连接跟踪失效的时间在原来的基础上又推迟了136.5秒,这个超时时间接近nf_conntrack_udp_timeout_stream
中指定的180超时时间:
cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream
180
最后,UDP
连接跟踪的状态更新函数内容如下:
/* net/netfilter/nf_conntrack_proto_udp.c */
int nf_conntrack_udp_packet(struct nf_conn *ct,struct sk_buff *skb,unsigned int dataoff,enum ip_conntrack_info ctinfo,const struct nf_hook_state *state)
{unsigned int *timeouts;unsigned long status;if (udp_error(skb, dataoff, state))return -NF_ACCEPT;timeouts = nf_ct_timeout_lookup(ct);if (!timeouts)timeouts = udp_get_timeouts(nf_ct_net(ct));status = READ_ONCE(ct->status);if ((status & IPS_CONFIRMED) == 0)ct->proto.udp.stream_ts = 2 * HZ + jiffies;/* If we've seen traffic both ways, this is some kind of UDP* stream. Set Assured.*/if (status & IPS_SEEN_REPLY) {unsigned long extra = timeouts[UDP_CT_UNREPLIED];bool stream = false;/* Still active after two seconds? Extend timeout. */if (time_after(jiffies, ct->proto.udp.stream_ts)) {extra = timeouts[UDP_CT_REPLIED];stream = (status & IPS_ASSURED) == 0;} nf_ct_refresh_acct(ct, ctinfo, skb, extra);/* never set ASSURED for IPS_NAT_CLASH, they time out soon */if (unlikely((status & IPS_NAT_CLASH)))return NF_ACCEPT;/* Also, more likely to be important, and not a probe */if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))nf_conntrack_event_cache(IPCT_ASSURED, ct);} else {nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);}return NF_ACCEPT;
}
以上代码中的UDP_CT_REPLIED
即对应内核配置/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream
的值(UDP_CT_UNREPLIED
对应nf_conntrack_udp_timeout
)。注意到,把超时时间更新到timeouts[UDP_CT_REPLIED]
是有条件的,其条件就是间隔两秒之后仍有数据活动(从而该连接被视为持续活动的连接,即udp_stream
)。
上面提到,连接跟踪结构体struct nf_conn
包含了一些NAT
的信息,这个信息是网络地址转换需要的;例如上面的status
字段中第4位比特位置1(对应IPS_SRC_NAT
),则以下代码会执行:
/* net/netfilter/nf_conntrack_core.c */
static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, ...) {if (ct->status & IPS_SRC_NAT) {memcpy(tuple.src.u3.all,ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,sizeof(tuple.src.u3.all));tuple.src.u.all =ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;}......if (status & IPS_SRC_NAT &&nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,IP_CT_DIR_ORIGINAL) == NF_DROP)return -1;
}
上面根据标志位IPS_SRC_NAT
更新了tuple
中的UDP源地址。之后调用了manip_pkt
来进一步处理。下面笔者找到了函数__udp_manip_pkt
对应的汇编代码,编写了另一个bpftrace
脚本,查看对网络数据包的UDP端口的修改:
#!/usr/bin/bpftrace/*
net/netfilter/nf_nat_proto.c
static void __udp_manip_pkt(struct sk_buff *skb,unsigned int iphdroff, struct udphdr *hdr,const struct nf_conntrack_tuple *tuple,enum nf_nat_manip_type maniptype, bool do_csum)
{__be16 *portptr, newport;......*portptr = newport; // => 0xffffffc080810410 <l4proto_manip_pkt+428>: strh w25, [x24]
}Dump of assembler code from 0xffffffc080810400 to 0xffffffc080810420:0xffffffc080810400 <l4proto_manip_pkt+412>: cbnz w21, 0xffffffc08081063c <l4proto_manip_pkt+984>0xffffffc080810404 <l4proto_manip_pkt+416>: ldrh w25, [x22, #16]0xffffffc080810408 <l4proto_manip_pkt+420>: mov x24, x190xffffffc08081040c <l4proto_manip_pkt+424>: cbnz w0, 0xffffffc080810398 <l4proto_manip_pkt+308>0xffffffc080810410 <l4proto_manip_pkt+428>: strh w25, [x24]*/
kprobe:l4proto_manip_pkt+0x1ac {$n = reg("r25");$r = (uint16 *) reg("r24");$o = *kptr($r);$o = ($o >> 8) | (($o << 8) & 0x00FF00);$n = ($n >> 8) | (($n << 8) & 0x00FF00);printf("PID: %d, comm: %s, UDP/NAT replacing port from %d to %d",pid, comm, $o, $n);print(kstack);
}
上面脚本的调试结果只有把路由端口修改为4321
端口的操作,却没有把4321
端口替代成路由端口的操作,需要进一步探究:
PID: 0, comm: swapper/1, UDP/NAT replacing port from 57616 to 4321l4proto_manip_pkt+428nf_nat_ipv4_manip_pkt+116nf_nat_manip_pkt+192nf_nat_inet_fn+460nf_nat_ipv4_pre_routing+84nf_hook_slow+72ip_rcv+92
跟踪连接的超时失效
结构体struct nf_conn
保存了已建立连接的基本信息;当一个nf_conn
失效时,Linux内核会丢弃该数据包(必要时返回TCP/RST
或icmp/unreachable
),因为不知道如何对该数据包进行NAT转发。连接跟踪超时的判断,目前的调试观察到有两种方式,分别是内核工作线程周期性检测,和应用层的netlink
访问(例如上面的conntrack
命令行工具)。当一个连接跟踪失效时,会调用nf_conntrack_free
释放内存:
146678.923239: PID: 36, comm: kworker/u12:0, nf_conntrack_free(0xffffff8006469900)nf_conntrack_free+0nf_ct_gc_expired.part.0+152nf_ct_gc_expired+96gc_worker+592process_one_work+408worker_thread+768kthread+220ret_from_fork+16235795.008082: PID: 2817, comm: conntrack, nf_conntrack_free(0xffffff8006469200)nf_conntrack_free+0ctnetlink_dump_table+1024netlink_dump+300__netlink_dump_start+364ctnetlink_get_conntrack+484nfnetlink_rcv_msg+560netlink_rcv_skb+96nfnetlink_rcv+108netlink_unicast+488netlink_sendmsg+412__sys_sendto+224__arm64_sys_sendto+40invoke_syscall.constprop.0+92do_el0_svc+64el0_svc+48el0t_64_sync_handler+288el0t_64_sync+376
至此,我们就对Linux内核的连接跟踪conntrack
有了初步的了解;这一块比较复杂,值得深入探究。