UDP報文接收概述
UDP數據報的接收要分兩部分來看:
- 網絡層接收完數據包並遞交給UDP後,UDP的處理過程。在該過程中,UDP需要做的工作就是接收數據包並對其進行校驗,校驗成功後將其放入接收隊列 sk_receive_queue 中,等待用戶空間程序來讀取。
- 用戶空間程序調用read()等系統調用讀取已經放入接收隊列 sk_receive_queue 中的數據。
從IP層接收數據包 udp_rcv()
該函數是在AF_INET協議族初始化時,由UDP注冊給網絡層的回調函數,當網絡層代碼處理完一個輸入數據包后,如果該數據包是發往本機的,并且其上層協議就是UDP,那么會調用該回調函數。
/*
 * udp_rcv - receive entry point for plain UDP datagrams.
 * @skb: the incoming packet
 *
 * Delegates to __udp4_lib_rcv() with the global udp_table and
 * IPPROTO_UDP, and returns whatever that function returns.
 */
int udp_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
@skb: 輸入數據包
@udptable: 已綁定端口的UDP傳輸控制塊哈希表,將從該哈希表中查找該skb屬於哪個套接字
@proto:L4協議號,到這里可能是IPPROTO_UDP或者IPPROTO_UDPLITE
/*
 * __udp4_lib_rcv - handle one IPv4 UDP / UDP-Lite datagram handed up by IP.
 * @skb:      the incoming packet
 * @udptable: table of bound UDP sockets, searched for the owner of @skb
 * @proto:    L4 protocol number (IPPROTO_UDP or IPPROTO_UDPLITE)
 *
 * Validates the header length and checksum, then either hands the packet
 * to the multicast/broadcast delivery routine, queues it on the matching
 * socket, or (no socket found) answers with an ICMP port-unreachable and
 * frees it.
 *
 * Returns 0 when the packet was consumed or dropped here, the result of
 * __udp4_lib_mcast_deliver() for broadcast/multicast routes, or the
 * negated value of a positive udp_queue_rcv_skb() result (resubmission).
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	struct sock *sk;
	struct udphdr *uh;
	unsigned short ulen;
	struct rtable *rt = skb_rtable(skb);
	__be32 saddr, daddr;
	struct net *net = dev_net(skb->dev);

	/*
	 *  Validate the packet.
	 */

	/* Make sure the linear data area holds at least a full UDP header. */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

	uh   = udp_hdr(skb);
	ulen = ntohs(uh->len);

	/*
	 * Fetch the addresses before the first jump to short_packet or
	 * csum_error: both labels log saddr/daddr, so reading them later
	 * would print uninitialized stack values.
	 */
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	/* The skb must carry at least as many bytes as the UDP header claims. */
	if (ulen > skb->len)
		goto short_packet;

	if (proto == IPPROTO_UDP) {
		/*
		 * 1. The UDP length must cover at least the header itself.
		 * 2. pskb_trim_rcsum() cuts the skb down to ulen (dropping any
		 *    trailing padding) and fixes up the receive checksum.
		 */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		/* Re-read the header pointer after the trim. */
		uh = udp_hdr(skb);
	}

	/* Set up checksum verification. */
	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;

	/* Broadcast and multicast datagrams take a separate delivery path. */
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh,
				saddr, daddr, udptable);

	/* Look up the socket that should receive this datagram. */
	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);

	if (sk != NULL) {
		int ret = udp_queue_rcv_skb(sk, skb);

		/* Release the socket reference once the packet is queued. */
		sock_put(sk);

		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		return 0;
	}

	/* No socket wants this packet: account for it and discard it. */

	/* IPsec policy check. */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

	/* Count the miss and reply with ICMP port unreachable. */
	UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb(skb);
	return 0;

short_packet:
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       ulen,
		       skb->len,
		       &daddr,
		       ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       &daddr,
		       ntohs(uh->dest),
		       ulen);
drop:
	UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
	kfree_skb(skb);
	return 0;
}
疑惑:為何校驗和的計算和驗證要分udp4_csum_init()和udp_lib_checksum_complete()兩步完成???
查找數據包所屬套接字 __udp4_lib_lookup_skb()
如上,非常關鍵的一步就是根據數據包中目的地址信息尋找應該由誰來處理該數據包。
/*
 * __udp4_lib_lookup_skb - find the socket that should receive @skb.
 * @skb:      the incoming packet
 * @sport:    source port from the UDP header (network byte order)
 * @dport:    destination port from the UDP header (network byte order)
 * @udptable: table of bound UDP sockets to search
 *
 * Returns the socket taken from the skb by skb_steal_sock() when there
 * is one, otherwise the result of a full __udp4_lib_lookup().
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *cached;

	/*
	 * A socket may already have been attached to the skb earlier on
	 * the receive path; reuse it instead of searching again.
	 */
	cached = skb_steal_sock(skb);
	if (unlikely(cached))
		return cached;

	/* Nothing attached yet: search the table. */
	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
				 iph->daddr, dport, inet_iif(skb),
				 udptable);
}
@dif: 該數據包的輸入網絡設備接口
/*
 * __udp4_lib_lookup - search @udptable for the best-matching socket.
 * @net:      network namespace of the lookup
 * @saddr:    source address of the datagram
 * @sport:    source port (network byte order)
 * @daddr:    destination address of the datagram
 * @dport:    destination port (network byte order)
 * @dif:      index of the interface the datagram arrived on
 * @udptable: table of bound UDP sockets
 *
 * The destination port selects the hash chain. Every socket on the chain
 * is rated by compute_score() and the highest-rated one is returned with
 * its reference count raised, or NULL if none scored above -1.
 */
static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
		__be16 sport, __be32 daddr, __be16 dport,
		int dif, struct udp_table *udptable)
{
	struct sock *sk, *best;
	struct hlist_nulls_node *node;
	/* The host-order destination port is the hash key. */
	unsigned short port = ntohs(dport);
	unsigned int slot = udp_hashfn(net, port);
	struct udp_hslot *chain = &udptable->hash[slot];
	int score, best_score;

	rcu_read_lock();
rescan:
	/* Walk the chain and remember the socket with the highest score. */
	best = NULL;
	best_score = -1;
	sk_nulls_for_each_rcu(sk, node, &chain->head) {
		score = compute_score(sk, net, saddr, port, sport,
				      daddr, dport, dif);
		if (score > best_score) {
			best = sk;
			best_score = score;
		}
	}

	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto rescan;

	if (best) {
		if (unlikely(!atomic_inc_not_zero(&best->sk_refcnt))) {
			/* Reference count already hit zero: treat as no match. */
			best = NULL;
		} else if (unlikely(compute_score(best, net, saddr, port, sport,
						  daddr, dport, dif) < best_score)) {
			/*
			 * The socket no longer scores as well as it did during
			 * the walk; drop the reference and search again.
			 */
			sock_put(best);
			goto rescan;
		}
	}
	rcu_read_unlock();
	return best;
}
疑惑:查個表為什么這么復雜,這個分值什么鬼???
數據包進入隊列 udp_queue_rcv_skb()
找到數據包目的端口對應的傳輸控制塊后,會調用該函數接收該數據包。
/*
 * udp_queue_rcv_skb - deliver a datagram to the socket that owns it.
 * @sk:  the receiving socket
 * @skb: the datagram
 *
 * returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct udp_sock *up = udp_sk(sk);
	int rc;
	int is_udplite = IS_UDPLITE(sk);

	/* IPsec policy check. */
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* Encapsulation sockets get first look at the packet. */
	if (up->encap_type) {
		/*
		 * This is an encapsulation socket so pass the skb to
		 * the socket's udp_encap_rcv() hook. Otherwise, just
		 * fall through and pass this up the UDP socket.
		 * up->encap_rcv() returns the following value:
		 * =0 if skb was successfully passed to the encap
		 *    handler or was discarded by it.
		 * >0 if skb should be passed on to UDP.
		 * <0 if skb should be resubmitted as proto -N
		 */

		/* if we're overly short, let UDP handle it */
		if (skb->len > sizeof(struct udphdr) &&
		    up->encap_rcv != NULL) {
			int ret;

			ret = (*up->encap_rcv)(sk, skb);
			if (ret <= 0) {
				UDP_INC_STATS_BH(sock_net(sk),
						 UDP_MIB_INDATAGRAMS,
						 is_udplite);
				return -ret;
			}
		}

		/* FALLTHROUGH -- it's a UDP Packet */
	}

	/* UDP-Lite: enforce the receiver's checksum-coverage settings. */
	if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {

		/*
		 * MIB statistics other than incrementing the error count are
		 * disabled for the following two types of errors: these depend
		 * on the Application settings, not on the functioning of the
		 * protocol stack as such.
		 *
		 * RFC 3828 here recommends (sec 3.3): "There should also be a
		 * way ... to ... at least let the receiving application block
		 * delivery of packets with coverage values less than a value
		 * provided by the application."
		 */
		if (up->pcrlen == 0) {          /* full coverage was set  */
			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
				"%d while full coverage %d requested\n",
				UDP_SKB_CB(skb)->cscov, skb->len);
			goto drop;
		}
		/* The next case involves violating the min. coverage requested
		 * by the receiver. This is subtle: if receiver wants x and x is
		 * greater than the buffersize/MTU then receiver will complain
		 * that it wants x while sender emits packets of smaller size y.
		 * Therefore the above ...()->partial_cov statement is essential.
		 */
		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
			LIMIT_NETDEBUG(KERN_WARNING
				"UDPLITE: coverage %d too small, need min %d\n",
				UDP_SKB_CB(skb)->cscov, up->pcrlen);
			goto drop;
		}
	}

	/*
	 * With a socket filter attached, finish checksum verification up
	 * front so the filter only ever sees packets that passed it.
	 */
	if (sk->sk_filter) {
		if (udp_lib_checksum_complete(skb))
			goto drop;
	}

	rc = 0;

	/* Lock the socket against the process-context owner. */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		/* No process holds the socket: queue the packet directly. */
		rc = __udp_queue_rcv_skb(sk, skb);
	else
		/*
		 * A process currently owns the socket: park the packet on
		 * the backlog queue. Backlogged packets are moved to the
		 * receive queue from release_sock().
		 */
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	return rc;

drop:
	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return -1;
}
數據包進接收隊列 sk_receive_queue
/*
 * __udp_queue_rcv_skb - put a datagram on the socket's receive queue.
 * @sk:  the receiving socket
 * @skb: the datagram
 *
 * Returns 0 when sock_queue_rcv_skb() accepted the packet. On any
 * negative result the packet is counted as an input error, freed, and
 * -1 is returned.
 */
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int is_udplite = IS_UDPLITE(sk);
	int err;

	err = sock_queue_rcv_skb(sk, skb);
	if (err >= 0)
		return 0;

	/* Note that an ENOMEM error is charged twice */
	if (err == -ENOMEM) {
		/* Receive buffer exhausted: count it and record the drop. */
		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
				 is_udplite);
		atomic_inc(&sk->sk_drops);
	}

	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return -1;
}
/*
 * sock_queue_rcv_skb - charge a packet to a socket and append it to
 * the socket's receive queue.
 * @sk:  the receiving socket
 * @skb: the packet
 *
 * Returns 0 on success, -ENOMEM when the packet would push the socket's
 * receive memory to or past sk_rcvbuf, the sk_filter() result when that
 * is non-zero, or -ENOBUFS when sk_rmem_schedule() refuses the charge.
 * On failure the packet is left untouched for the caller to dispose of.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_rc;
	int queued_len;

	/* Refuse the packet if accepting it would exceed the receive buffer. */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		return -ENOMEM;

	/* Run the socket filter; a non-zero result rejects the packet. */
	filter_rc = sk_filter(sk, skb);
	if (filter_rc)
		return filter_rc;

	/* Memory accounting for the packet's true size. */
	if (!sk_rmem_schedule(sk, skb->truesize))
		return -ENOBUFS;

	skb->dev = NULL;
	/* The packet now belongs to this socket. */
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	queued_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);

	/* Notify anyone waiting for data, unless the socket is dead. */
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, queued_len);

	return 0;
}
喚醒阻塞進程 sock_def_readable(進接收隊列喚醒)
將數據放入接收隊列后,需要喚醒那些因為數據不足而阻塞的進程,這是通過上面的sk->sk_data_ready()回調實現的,對于UDP,該函數就是 sock_def_readable。
/*
 * sock_def_readable - data-ready callback: wake up readers of @sk.
 * @sk:  the socket that just received data
 * @len: length of the newly queued data (not used here)
 */
static void sock_def_readable(struct sock *sk, int len)
{
	/* All wake-ups happen under the callback read lock. */
	read_lock(&sk->sk_callback_lock);

	/* Wake processes sleeping on the socket, if there are any. */
	if (sk_has_sleeper(sk)) {
		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
						POLLRDNORM | POLLRDBAND);
	}

	/* Asynchronous notification is sent regardless of sleepers. */
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);

	read_unlock(&sk->sk_callback_lock);
}
/*
 * sk_has_sleeper - report whether any process waits on the socket.
 * @sk: the socket to inspect
 *
 * Returns 1 when sk->sk_sleep is set and its wait queue is active,
 * 0 otherwise.
 */
static inline int sk_has_sleeper(struct sock *sk)
{
	/*
	 * We need to be sure we are in sync with the
	 * add_wait_queue modifications to the wait queue.
	 *
	 * This memory barrier is paired in the sock_poll_wait.
	 */
	smp_mb__after_lock();

	/* Blocked processes sit on the sk->sk_sleep wait queue. */
	if (!sk->sk_sleep)
		return 0;

	return waitqueue_active(sk->sk_sleep) ? 1 : 0;
}
數據包進后備隊列 sk_backlog
在下半部接收時,如果傳輸控制塊已經被進程鎖定,那麼會先將數據放入到後備隊列中,等進程釋放傳輸控制塊時再進行處理,這種設計可以使得軟中斷能夠盡快地結束。
/*
 * sk_add_backlog - append a packet to the socket's backlog queue.
 * @sk:  the socket whose backlog receives the packet
 * @skb: the packet to append
 *
 * The per-socket spinlock must be held here.
 */
static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	/* Link after the current tail, or start the list if it is empty. */
	if (sk->sk_backlog.tail)
		sk->sk_backlog.tail->next = skb;
	else
		sk->sk_backlog.head = skb;

	/* Either way the new packet becomes the tail and ends the list. */
	sk->sk_backlog.tail = skb;
	skb->next = NULL;
}