发送及转发的流程为:dst_output ---> ip6_output ---> ip6_output2 ---> ip6_output_finish
或者 ip6_forward ---> ip6_forward_finish ---> dst_output ---> ip6_output ---> ip6_output2 ---> ip6_output_finish
==###########################################################################==
//如果需要转发数据包,则调用ip6_forward执行转发过程,最后通过ip6_forward_finish函数把数据包交给ipv6模块的ip6_output函数,进入发送流程。
==###########################################################################==
最终生成的IP数据报的路由称为目的入口(dst_entry),目的入口反映了相邻的外部主机在主机内部的一种“映像”,目的入口在内核中的定义如下:
struct dst_entry
{
structrcu_head rcu_head;
structdst_entry *child;
structnet_device *dev;
short error;
short obsolete;
int flags;
#define DST_HOST 1
#define DST_NOXFRM 2
#define DST_NOPOLICY 4
#define DST_NOHASH 8
unsignedlong expires;
unsignedshort header_len; /* more space at head required */
unsignedshort trailer_len; /* space to reserve at tail */
unsignedint rate_tokens;
unsignedlong rate_last; /* rate limiting for ICMP */
structdst_entry *path;
structneighbour *neighbour;
structhh_cache *hh;
#ifdef CONFIG_XFRM
structxfrm_state *xfrm;
#else
void *__pad1;
#endif
int (*input)(structsk_buff*);
int (*output)(structsk_buff*);
struct dst_ops *ops;
u32 metrics[RTAX_MAX];
#ifdef CONFIG_NET_CLS_ROUTE
__u32 tclassid;
#else
__u32 __pad2;
#endif
/*
* Align __refcnt to a 64 bytes alignment
* (L1_CACHE_SIZE would be too much)
*/
#ifdef CONFIG_64BIT
long __pad_to_align_refcnt[2];
#else
long __pad_to_align_refcnt[1];
#endif
/*
* __refcnt wants to be on a different cacheline from
* input/output/ops or performance tanks badly
*/
atomic_t __refcnt; /* client references */
int __use;
unsignedlong lastuse;
union{
structdst_entry *next;
structrtable *rt_next;
structrt6_info *rt6_next;
structdn_route *dn_next;
};
};
如果接收了转发给其他主机的数据包,则ip6_rcv_finish通过dst_input接口把数据包传递给函数ip6_forward。该函数执行一些检测:确定是否开启了转发、判断跳数限制是否已用尽等,最后通过NF_INET_FORWARD钩子调用ip6_forward_finish执行转发。
int ip6_forward(struct sk_buff *skb)
{
structdst_entry *dst = skb_dst(skb);
structipv6hdr *hdr = ipv6_hdr(skb);
structinet6_skb_parm *opt = IP6CB(skb);
structnet *net = dev_net(dst->dev);
//检测设备是否支持转发IPv6数据包
if(net->ipv6.devconf_all->forwarding == 0)
gotoerror;
if(skb_warn_if_lro(skb))
gotodrop;
//ipsec策略检测
if(!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
IP6_INC_STATS(net,ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
gotodrop;
}
/*
static inline void skb_forward_csum(structsk_buff *skb)
{
/*Unfortunately we don't support this one. Any brave souls? */
if(skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed= CHECKSUM_NONE;
}
*/
skb_forward_csum(skb);
/*
* WeDO NOT make any processing on
* RApackets, pushing them to user level AS IS
* withoutane WARRANTY that application will be able
* tointerpret them. The reason is that we
* cannotmake anything clever here.
*
* Weare not end-node, so that if packet contains
* AH/ESP,we cannot make anything.
* Defragmentationalso would be mistake, RA packets
* cannotbe fragmented, because there is no warranty
* thatdifferent fragments will go along one path. --ANK
*对RA数据包不做处理,提交给用户态。
/
/*
staticint ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
struct ip6_ra_chain *ra;
struct sock *last = NULL;
read_lock(&ip6_ra_lock);
for (ra = ip6_ra_chain; ra; ra =ra->next) {
struct sock *sk = ra->sk;
if (sk && ra->sel ==sel &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if ==skb->dev->ifindex)) {
if (last) {
struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
if (skb2)
rawv6_rcv(last,skb2);
}
last = sk;
}
}
if (last) {
rawv6_rcv(last, skb);
read_unlock(&ip6_ra_lock);
return 1;
}
read_unlock(&ip6_ra_lock);
return 0;
}
*/
if(opt->ra) {
u8*ptr = skb_network_header(skb) + opt->ra;
if(ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
return0;
}
//检查和递减TTL
/*
* checkand decrement ttl
*/
//如果跳数限制小于1,则发出icmpv6_time_exceed消息
if(hdr->hop_limit <= 1) {
/*Force OUTPUT device used as source address */
skb->dev= dst->dev;
icmpv6_send(skb,ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
0, skb->dev);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return-ETIMEDOUT;
}
/*XXX: idev->cnf.proxy_ndp? */
if(net->ipv6.devconf_all->proxy_ndp &&
pneigh_lookup(&nd_tbl, net,&hdr->daddr, skb->dev, 0)) {
intproxied = ip6_forward_proxy_check(skb);
if(proxied > 0)
returnip6_input(skb);
elseif (proxied < 0) {
IP6_INC_STATS(net,ip6_dst_idev(dst),
IPSTATS_MIB_INDISCARDS);
gotodrop;
}
}
//ipsec路由转发
if(!xfrm6_route_forward(skb)) {
IP6_INC_STATS(net,ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
gotodrop;
}
dst= skb_dst(skb);
/*IPv6 specs 规格say nothing about it, but it is clear that we cannot
send redirects to source routed frames.
We don't send redirects to framesdecapsulated拆分 from IPsec.
*/
if(skb->dev == dst->dev && dst->neighbour &&opt->srcrt == 0 &&
!skb_sec_path(skb)) {
structin6_addr *target = NULL;
structrt6_info *rt;
structneighbour *n = dst->neighbour;
/*
* incomingand outgoing devices are the same
* senda redirect.
*/
rt= (struct rt6_info *) dst;
if((rt->rt6i_flags & RTF_GATEWAY))
target= (struct in6_addr*)&n->primary_key;
else
target= &hdr->daddr;
/*Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
*/
if(xrlim_allow(dst, 1*HZ))
ndisc_send_redirect(skb,n, target);
}else {
intaddrtype = ipv6_addr_type(&hdr->saddr);
//丢弃源地址是多播、环回和本地链路类型的数据包
/*This check is security critical. */
if(addrtype == IPV6_ADDR_ANY ||
addrtype & (IPV6_ADDR_MULTICAST |IPV6_ADDR_LOOPBACK))
gotoerror;
if(addrtype & IPV6_ADDR_LINKLOCAL) {
icmpv6_send(skb,ICMPV6_DEST_UNREACH,
ICMPV6_NOT_NEIGHBOUR,0, skb->dev);
gotoerror;
}
}
//如果数据包长度大于MTU,发送ICMPV6_PKT_TOOBIG消息
if(skb->len > dst_mtu(dst)) {
/*Again, force OUTPUT device used as source address */
skb->dev= dst->dev;
icmpv6_send(skb,ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return-EMSGSIZE;
}
//一般而言,skb通过引用计数实现共享,前提是大家不能修改skb head 和data的内容。 如果需要修改的话,就有必要调用skb_cow重新申请一个啦
if(skb_cow(skb, dst->dev->hard_header_len)) {
IP6_INC_STATS(net,ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
gotodrop;
}
//获取ip头部
hdr= ipv6_hdr(skb);
/*Mangling hops number delayed to point after skb COW */
//跳数限制减一
hdr->hop_limit--;
IP6_INC_STATS_BH(net,ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
//调用ip6_forward_finish完成转发最后的操作
returnNF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
ip6_forward_finish);
error:
IP6_INC_STATS_BH(net,ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
kfree_skb(skb);
return-EINVAL;
}
/*
 * ip6_forward_finish - last step of forwarding.
 * @skb: packet that passed the NF_INET_FORWARD hook.
 *
 * Simply hands the packet to dst_output() and returns its result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
/*
 * Output packet to network from transport.
 *
 * Calls the output() function stored in the packet's dst entry and
 * returns its result.
 */
static inline int dst_output(struct sk_buff *skb)
{
	return skb_dst(skb)->output(skb);
}
==#################################################################====
数据包发送流程
dst_output 调用路由项(dst_entry)中注册的 output 函数指针;对于 IPv6 发送路径,该指针指向 ip6_output
/*
 * dst_output - dispatch a packet to the output function of its dst entry.
 * @skb: packet with a dst entry attached.
 *
 * Returns whatever skb_dst(skb)->output() returns.
 */
static inline int dst_output(struct sk_buff *skb)
{
	return skb_dst(skb)->output(skb);
}
intip6_output(struct sk_buff *skb)
{
structinet6_dev *idev = ip6_dst_idev(skb_dst(skb));
if(unlikely(idev->cnf.disable_ipv6)) {
IP6_INC_STATS(dev_net(skb_dst(skb)->dev),idev,
IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
return0;
}
//如果需要分片,调用ip6_fragment函数处理
if((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)))
returnip6_fragment(skb, ip6_output2);
else
returnip6_output2(skb);
}
staticint ip6_output2(struct sk_buff *skb)
{
structdst_entry *dst = skb_dst(skb);
structnet_device *dev = dst->dev;
// 把数据包的类型设置为IPv6类型
skb->protocol= htons(ETH_P_IPV6);
skb->dev= dev;
//检查是否为多播地址
if(ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
//sk_buff->sk这是一个指向拥有这个sk_buff的sock结构的指针。这个指针在网络包由本机发出或者由本机进程接收时有效,因为插口相关的信息被L4(TCP或 UDP)或者用户空间程序使用。如果sk_buff只在转发中使用(这意味着,源地址和目的地址都不是本机地址),这个指针是NULL
structipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
structinet6_dev *idev = ip6_dst_idev(skb_dst(skb));
if(!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop)&&
((mroute6_socket(dev_net(dev)) &&
!(IP6CB(skb)->flags &IP6SKB_FORWARDED)) ||
ipv6_chk_mcast_addr(dev,&ipv6_hdr(skb)->daddr,
&ipv6_hdr(skb)->saddr))) {
structsk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
/*Do not check for IFF_ALLMULTI; multicast routing
is not supported in any case.
*/
if(newskb)
//调用ip6_dev_loopback_xmit环回发送数据包
NF_HOOK(PF_INET6,NF_INET_POST_ROUTING, newskb,
NULL,newskb->dev,
ip6_dev_loopback_xmit);
if(ipv6_hdr(skb)->hop_limit == 0) {
IP6_INC_STATS(dev_net(dev),idev,
IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
return0;
}
}
IP6_UPD_PO_STATS(dev_net(dev),idev, IPSTATS_MIB_OUTMCAST,
skb->len);
}
//调用ip6_output_finish进一步处理数据包
returnNF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
ip6_output_finish);
}
staticint ip6_output_finish(struct sk_buff *skb)
{
/*dst_entry可以理解为路由表的缓冲区,每次主机发送数据时询问路由表后,都会将记录记在一个cache内.dst中有能指向其neighbour的指针,通过neighbour可以找到下一跳地址*/
structdst_entry *dst = skb_dst(skb);
//如果有缓存指针hh,则通过neigh_hh_output发送数据;否则通过dst->neighbour->output发送数据;hh_cache中存储的是链路头的一些相关信息,可以加快数据包的传输(因为有些情况下不用查看路由表,直接到此缓冲区查看).*/
if(dst->hh)
returnneigh_hh_output(dst->hh, skb);
elseif (dst->neighbour)
returndst->neighbour->output(skb);
IP6_INC_STATS_BH(dev_net(dst->dev),
ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return-EINVAL;
}
/*
 * neigh_hh_output - prepend the cached header and transmit.
 * @hh:  cache entry holding the header bytes (hh_data) and length (hh_len).
 * @skb: packet to send.
 *
 * Copies the cached header in front of skb->data under the hh_lock seqlock
 * (retrying if the cache changed during the copy), pushes hh_len bytes onto
 * the skb and calls hh->hh_output(). Returns the result of hh->hh_output().
 */
static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
{
	unsigned seq;
	int hh_len;

	do {
		int hh_alen;

		/*
		 * For reference, read_seqbegin() is:
		 *
		 *	static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
		 *	{
		 *		unsigned ret;
		 *
		 *	repeat:
		 *		ret = sl->sequence;
		 *		smp_rmb();
		 *		if (unlikely(ret & 1)) {
		 *			cpu_relax();
		 *			goto repeat;
		 *		}
		 *
		 *		return ret;
		 *	}
		 */
		seq = read_seqbegin(&hh->hh_lock);
		hh_len = hh->hh_len;
		hh_alen = HH_DATA_ALIGN(hh_len);
		/* Copy the cached header bytes into the skb headroom. */
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
	} while (read_seqretry(&hh->hh_lock, seq));

	skb_push(skb, hh_len);
	return hh->hh_output(skb);
}
/*
 * skb_push - add data to the start of a buffer.
 * @skb: buffer to use.
 * @len: number of bytes to prepend.
 *
 * Moves skb->data back by @len and grows skb->len accordingly. If the data
 * pointer would move in front of skb->head, skb_under_panic() is called.
 * Returns the new skb->data pointer.
 */
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len  += len;
	if (unlikely(skb->data < skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}
==###################################################################==
UDP发送到IP层的函数
intip6_push_pending_frames(struct sock *sk)
{
structsk_buff *skb, *tmp_skb;
structsk_buff **tail_skb;
structin6_addr final_dst_buf, *final_dst = &final_dst_buf;
structinet_sock *inet = inet_sk(sk);
structipv6_pinfo *np = inet6_sk(sk);
structnet *net = sock_net(sk);
structipv6hdr *hdr;
structipv6_txoptions *opt = np->cork.opt;
structrt6_info *rt = (struct rt6_info *)inet->cork.dst;
structflowi *fl = &inet->cork.fl;
unsignedchar proto = fl->proto;
interr = 0;
//检查发送队列是否为空,并返回队首的套接字缓冲区skb
if((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
gotoout;
tail_skb= &(skb_shinfo(skb)->frag_list);
//如果有扩展头部信息,则调整skb->data指向IP包头部
/*move skb->data to ip header from ext header */
if(skb->data < skb_network_header(skb))
__skb_pull(skb,skb_network_offset(skb));
//遍历套接字发送队列,调整数据长度
while((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
__skb_pull(tmp_skb,skb_network_header_len(skb));
*tail_skb= tmp_skb;
tail_skb= &(tmp_skb->next);
skb->len+= tmp_skb->len;
skb->data_len+= tmp_skb->len;
skb->truesize+= tmp_skb->truesize;
tmp_skb->destructor= NULL;
tmp_skb->sk= NULL;
}
/*Allow local fragmentation. */
if(np->pmtudisc < IPV6_PMTUDISC_DO)
skb->local_df= 1;
ipv6_addr_copy(final_dst,&fl->fl6_dst);
__skb_pull(skb,skb_network_header_len(skb));
//填充ipv6的扩展头部
if(opt && opt->opt_flen)
ipv6_push_frag_opts(skb,opt, &proto);
if(opt && opt->opt_nflen)
ipv6_push_nfrag_opts(skb,opt, &proto, &final_dst);
//记录IPv6头部起始位置
skb_push(skb,sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
hdr= ipv6_hdr(skb);
//设置流标签
*(__be32*)hdr= fl->fl6_flowlabel |
htonl(0x60000000 |((int)np->cork.tclass << 20));
//设置跳数限制
hdr->hop_limit= np->cork.hop_limit;
//设置下一个包头
hdr->nexthdr= proto;
//为ipv6设置地址
ipv6_addr_copy(&hdr->saddr,&fl->fl6_src);
ipv6_addr_copy(&hdr->daddr,final_dst);
//设置属性和子网掩码
skb->priority= sk->sk_priority;
skb->mark= sk->sk_mark;
//给套接字缓冲区skb指定路由表项信息;为数据包的进入ipv6发送流程设置具体的方法
skb_dst_set(skb,dst_clone(&rt->u.dst));
IP6_UPD_PO_STATS(net,rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
if(proto == IPPROTO_ICMPV6) {
structinet6_dev *idev = ip6_dst_idev(skb_dst(skb));
ICMP6MSGOUT_INC_STATS_BH(net,idev, icmp6_hdr(skb)->icmp6_type);
ICMP6_INC_STATS_BH(net,idev, ICMP6_MIB_OUTMSGS);
}
//程序执行到这里,已经为dst_output配置完skb的处理信息,内核将从这里跳转到dst_output函数,通过ip6_output函数进入ipv6流程
err= ip6_local_out(skb);
if(err) {
if(err > 0)
err= np->recverr ? net_xmit_errno(err) : 0;
if(err)
gotoerror;
}
out:
ip6_cork_release(inet,np);
returnerr;
error:
gotoout;
}
/*
 * ip6_local_out - send a locally generated IPv6 packet.
 * @skb: packet with its IPv6 header and dst entry set.
 *
 * Calls __ip6_local_out(); when that returns 1 the packet is passed on to
 * dst_output(). Returns the dst_output() result in that case, otherwise the
 * value returned by __ip6_local_out().
 */
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
/*
 * __ip6_local_out - fill in payload_len and run the NF_INET_LOCAL_OUT hook.
 * @skb: packet with its IPv6 header and dst entry set.
 *
 * The payload length is the skb length minus the fixed IPv6 header. The
 * header field is 16 bits wide, so a length above IPV6_MAXPLEN is stored
 * as 0. Returns the nf_hook() result.
 */
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	/* payload_len is a 16-bit field in network byte order. */
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}
==###################################################################==
TCP发送到IP层的函数
intip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
struct ipv6_txoptions *opt, int ipfragok)
{
structnet *net = sock_net(sk);
structipv6_pinfo *np = inet6_sk(sk);
structin6_addr *first_hop = &fl->fl6_dst;
structdst_entry *dst = skb_dst(skb);
structipv6hdr *hdr;
u8 proto = fl->proto;
intseg_len = skb->len;
inthlimit, tclass;
u32mtu;
//如果需要填充ipv6扩展头部,则调整存储头部空间
if(opt) {
unsignedint head_room;
/*First: exthdrs may take lots of space (~8K for now)
MAX_HEADER is not enough.
*/
head_room= opt->opt_nflen + opt->opt_flen;
seg_len+= head_room;
head_room+= sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
if(skb_headroom(skb) < head_room) {
structsk_buff *skb2 = skb_realloc_headroom(skb, head_room);
if(skb2 == NULL) {
IP6_INC_STATS(net,ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
return-ENOBUFS;
}
kfree_skb(skb);
skb= skb2;
if(sk)
skb_set_owner_w(skb,sk);
}
//填充IPv6的扩展头部信息
if(opt->opt_flen)
ipv6_push_frag_opts(skb,opt, &proto);
if(opt->opt_nflen)
ipv6_push_nfrag_opts(skb,opt, &proto, &first_hop);
}
//记录ipv6头部的起始位置
skb_push(skb,sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
hdr= ipv6_hdr(skb);
//设置分片运行标志
/*Allow local fragmentation. */
if(ipfragok)
skb->local_df= 1;
/*
* Fillin the IPv6 header
*/
//计算跳转限制
hlimit= -1;
if(np)
hlimit= np->hop_limit;
if(hlimit < 0)
hlimit= ip6_dst_hoplimit(dst);
tclass= -1;
if(np)
tclass= np->tclass;
if(tclass < 0)
tclass= 0;
//设置流标签
*(__be32*)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
//设置载荷长度,下一个扩展头协议,跳转限制
hdr->payload_len= htons(seg_len);
hdr->nexthdr= proto;
hdr->hop_limit= hlimit;
//设置ipv6头部得ip地址,属性,子网掩码
ipv6_addr_copy(&hdr->saddr,&fl->fl6_src);
ipv6_addr_copy(&hdr->daddr,first_hop);
skb->priority= sk->sk_priority;
skb->mark= sk->sk_mark;
//考虑MTU值,如果包太大,就要发送ICMPV6_PKT_TOOBIG消息
mtu= dst_mtu(dst);
if((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
IP6_UPD_PO_STATS(net,ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_OUT, skb->len);
returnNF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
dst_output);
}
if(net_ratelimit())
printk(KERN_DEBUG"IPv6: sending pkt_too_big to self\n");
skb->dev= dst->dev;
icmpv6_send(skb,ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
IP6_INC_STATS(net,ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return-EMSGSIZE;
}