Linux 网络:邻居子系统
文章目录
- 1. 前言
- 2. 邻居子系统的建立过程
- 3. 小结
1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. 邻居子系统的建立过程
在发送数据的过程中,如果是第一次对某个邻居设备节点发送数据,先将数据缓存到邻居对象 neighbour 的 arp_queue,然后向邻居节点发送一个 ARP 请求,以获取邻居的 MAC 地址,并记录邻居的信息到哈希表 neigh_table,最后再向邻居发送缓存在 arp_queue 中的数据;
后续的发送则直接查找邻居哈希表 neigh_table(如 arp_tbl) 获取邻居设备的信息,然后直接发送数据。
下面以以太网卡的 TCP 数据包的发送过程为例,来说明过程的代码流程细节:
sys_send()...tcp_sendmsg_locked()...__tcp_transmit_skb()ip_queue_xmit()...ip_output()...ip_finish_output2()/* net/ipv4/ip_output.c */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{...rcu_read_lock_bh();nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); /* 对于以太网 IP 包,@nexthop 是 邻居 的 IP 地址 */neigh = __ipv4_neigh_lookup_noref(dev, nexthop); /* 查找邻居的硬件地址信息 *//** 如果还没有邻居的地址信息,则说明是第 1 次向邻居发送数据, * 则创建 neightbour 对象并插入到 neigh_table 表 arp_tbl 。*/if (unlikely(!neigh))neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);if (!IS_ERR(neigh)) {int res;sock_confirm_neigh(skb, neigh);/** 向邻居发送数据 @skb: * neigh_direct_output(), neigh_resolve_output(), ... */res = neigh_output(neigh, skb); rcu_read_unlock_bh();return res;}rcu_read_unlock_bh();...
}
先看第一次向邻居发送信息时,创建新 neighbour 对象记录邻居信息,并最终插入 neigh_table arp_tbl 的过程:
/* net/core/neighbour.c */struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,struct net_device *dev, bool want_ref)
{...struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); /* 建立一个邻居对象 */......n->dev = dev; /* 设定用于向邻居发送数据的网络接口设备 */.../* Protocol specific setup. *//* 各协议特定的 邻居对象 初始化 */if (tbl->constructor && (error = tbl->constructor(n)/*arp_constructor(),...*/) < 0) {...}/* 网络设备驱动特定的 邻居对象 初始化 */if (dev->netdev_ops->ndo_neigh_construct) {error = dev->netdev_ops->ndo_neigh_construct(dev, n);...}.../** 邻居表哈希函数生成 neighbour 对象哈希值,用于确定 * neighbour 对象插入邻居 @tbl 中哈希链表的位置:* arp_hash(), ... */hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);.../* 在哈希链中确认插入位置 */for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],lockdep_is_held(&tbl->lock));n1 != NULL;n1 = rcu_dereference_protected(n1->next,lockdep_is_held(&tbl->lock))) {if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {if (want_ref)neigh_hold(n1);rc = n1;goto out_tbl_unlock;}}.../* 将新建的 neighbour 插入 neigh_table @tbl 的哈希链表 */rcu_assign_pointer(n->next,rcu_dereference_protected(nht->hash_buckets[hash_val],lockdep_is_held(&tbl->lock)));rcu_assign_pointer(nht->hash_buckets[hash_val], n);...rc = n;
out:return rc;...
}
neighbour 对象针对以太网 IPv4 协议的特定初始化(由 arp_tbl 的 constructor 回调 arp_constructor() 完成):
/* net/ipv4/arp.c */static int arp_constructor(struct neighbour *neigh)
{__be32 addr; /* 邻居的 IP 地址 */......addr = *(__be32 *)neigh->primary_key;.../* 设定 neighbour 的路由表 ID: RT_TABLE_LOCAL, RT_TABLE_MAIN, ... */neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);...if (!dev->header_ops) {...} else {...if (dev->header_ops->cache) /* 以太网设备: eth_header_cache(), ... */neigh->ops = &arp_hh_ops;else...if (neigh->nud_state & NUD_VALID)...elseneigh->output = neigh->ops->output; /* neigh_resolve_output() */}return 0;
}...struct neigh_table arp_tbl = {.family = AF_INET,.key_len = 4,.protocol = cpu_to_be16(ETH_P_IP),.hash = arp_hash,.key_eq = arp_key_eq,.constructor = arp_constructor,.proxy_redo = parp_redo,.id = "arp_cache",.parms = {.tbl = &arp_tbl,.reachable_time = 30 * HZ,.data = {[NEIGH_VAR_MCAST_PROBES] = 3,[NEIGH_VAR_UCAST_PROBES] = 3,[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,[NEIGH_VAR_GC_STALETIME] = 60 * HZ,[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,[NEIGH_VAR_PROXY_QLEN] = 64,[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,[NEIGH_VAR_LOCKTIME] = 1 * HZ,},},.gc_interval = 30 * HZ,.gc_thresh1 = 128,.gc_thresh2 = 512,.gc_thresh3 = 1024,
};
EXPORT_SYMBOL(arp_tbl);
为邻居设备创建 neighbour 对象后,接下来就是数据的发送了。接前面的 neigh_output(),该函数负责向外发送数据:
ip_finish_output2()neigh_output()
/* include/net/neighbour.h */static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
{const struct hh_cache *hh = &n->hh;if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)...elsereturn n->output(n, skb); /* neigh_resolve_output() */
}
/* net/core/neighbour.c *//* Slow and careful. */int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{.../** . 第一次向邻居发送数据时,查询邻居 @neigh 的地址,如发送以太网的 ARP 请求,返回非 0 值;* . 后续返回 0 值,也不再向邻居发送请求 ARP 请求*/if (!neigh_event_send(neigh, skb)) {...}
}
/* include/net/neighbour.h *//* 查询邻居 @neigh 的地址,如发送以太网的 ARP 请求 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{...if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))return __neigh_event_send(neigh, skb);...
}
/* net/core/neighbour.c */int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{int rc;bool immediate_probe = false;...if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +NEIGH_VAR(neigh->parms, APP_PROBES)) {unsigned long next, now = jiffies;atomic_set(&neigh->probes,NEIGH_VAR(neigh->parms, UCAST_PROBES));neigh->nud_state = NUD_INCOMPLETE;neigh->updated = now;next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),HZ/2);neigh_add_timer(neigh, next);immediate_probe = true;} else {...}} else if (neigh->nud_state & NUD_STALE) {...}if (neigh->nud_state == NUD_INCOMPLETE) {if (skb) {...skb_dst_force(skb);__skb_queue_tail(&neigh->arp_queue, skb); /* 将要发送的数据添加到 neighbour 的 arp_queue */neigh->arp_queue_len_bytes += skb->truesize;}}out_unlock_bh:if (immediate_probe)neigh_probe(neigh);else......
}static void neigh_probe(struct neighbour *neigh)__releases(neigh->lock)
{...if (neigh->ops->solicit)neigh->ops->solicit(neigh, skb); /* arp_solicit(), ... */...
}
/* net/ipv4/arp.c */static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{.../* 构建 + 发送 ARP 包 */arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,dst_hw, dev->dev_addr, NULL, dst);
}/* Create and send an arp packet. */
static void arp_send_dst(int type, int ptype, __be32 dest_ip,struct net_device *dev, __be32 src_ip,const unsigned char *dest_hw,const unsigned char *src_hw,const unsigned char *target_hw,struct dst_entry *dst)
{struct sk_buff *skb;......skb = arp_create(type, ptype, dest_ip, dev, src_ip,dest_hw, src_hw, target_hw);...skb_dst_set(skb, dst_clone(dst));arp_xmit(skb); /* 发送 ARP 包 */
}
在向邻居发送 ARP 请求数据包后,等待邻居对 ARP 请求的回复,然后发送数据包。注意,这时候数据包还没有发送,还缓存在邻居对象的 arp_queue 队列中,等待邻居回复 ARP 请求后,才将缓存在 arp_queue 队列中的数据包发送出去。来看细节:
// 网卡接收数据时,会产生中断信号,网卡的中断接口成为接收数据的起点
xxx_nic_interrupt()...xxx_nic_rx() // 网卡的数据接收接口...__netif_receive_skb_core()
/* net/core/dev.c */static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{.../* 对于 ARP 协议包,调用回调 arp_rcv() */if (likely(!deliver_exact)) {deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,&ptype_base[ntohs(type) &PTYPE_HASH_MASK]);}...
}
/* 处理收到的 ARP 协议包(包括 ARP 请求包和 ARP 应答包) */
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,struct packet_type *pt, struct net_device *orig_dev)
{...return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,dev_net(dev), NULL, skb, dev, NULL,arp_process);
}static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{...
#if 0// 邻居设备收到 ARP 请求包时,做出回复if (arp->ar_op == htons(ARPOP_REQUEST)/*收到 ARP 广播请求*/ &&ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {...if (addr_type == RTN_LOCAL) {...if (!dont_send) {n = neigh_event_ns(&arp_tbl, sha, &sip, dev);if (n) {/*回复发出 ARP 广播请求的主机*/arp_send_dst(ARPOP_REPLY, ETH_P_ARP,sip, dev, tip, sha,dev->dev_addr, sha,reply_dst);neigh_release(n);}}...}}
#endif.../** 本机收到邻居对 ARP 请求的回复后的处理。*//* 从 ARP 解析表 查找 {@sip, @dev} 对应的 neighbour */n = __neigh_lookup(&arp_tbl, &sip, dev, 0);...if (lladdr != neigh->ha) {...memcpy(&neigh->ha, lladdr, dev->addr_len); /* 记录邻居的硬件地址 */...}...if (n) {int state = NUD_REACHABLE;.../** 收到 ARP 请求后更新邻居的状态,记录地址信息,* 然后发送挂起在 neighbour::arp_queue 中的数据包* (如第一次向邻居发送的数据,需先向邻居发送 ARP* 请求获取 MAC 地址后,才能继续发送)。*/neigh_update(n, sha, state,override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);...}...
}
/* net/core/neighbour.c */int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,u32 flags, u32 nlmsg_pid)
{...if (!(new & NUD_VALID)) {...neigh->nud_state = new; /* 更新 neighbour 状态 ( NUD_REACHABLE) */...}...if (!(old & NUD_VALID)) {struct sk_buff *skb;/* Again: avoid dead loop if something went wrong *//** 发送缓存在 arp_queue 中的数据。* 如第一次向邻居发送的数据,需先向邻居发送 ARP 请求获取 MAC 地址后,才能继续发送。*/while (neigh->nud_state & NUD_VALID &&(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {...n1->output(n1, skb); /* 向邻居发送数据包: neigh_resolve_output() */...}}...
}int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{.../** . 第一次向邻居发送数据时,查询邻居 @neigh 的地址,如发送以太网的 ARP 请求,返回非 0 值;* . 后续返回 0 值,也不再向邻居发送请求 ARP 请求*/if (!neigh_event_send(neigh, skb)) {...do {__skb_pull(skb, skb_network_offset(skb));seq = read_seqbegin(&neigh->ha_lock);err = dev_hard_header(skb, dev, ntohs(skb->protocol),neigh->ha, NULL, skb->len); /* 构建链路层帧头,如以太网帧头 */} while (read_seqretry(&neigh->ha_lock, seq));if (err >= 0)rc = dev_queue_xmit(skb); /* 通过网络设备向外发送数据 */elsegoto out_kfree_skb;}...
}
3. 小结
从前面的分析可以看到,邻居子系统是在发送数据包的过程中建立的,我们来简单地小结一下:
1. 第一次向邻居发送数据时的过程:IP 层发 skb -> 查找邻居表 arp_tbl,查找失败 -> 将 skb 缓存到 arp_queue -> 向邻居发送 ARP 请求 ->|
将 arp_queue 缓存 skb 发送给邻居 <- 记录邻居地址到邻居表 arp_tbl <- 邻居回复 ARP 请求(含 MAC) <-
2. 后续向邻居发送数据的过程:IP 层发 skb -> 查找邻居表 arp_tbl,查找成功 -> 直接向邻居发送数据(无需再发送 ARP 请求)
