基于linux-2.6.31的IPV6的数据包发送及转发流程分析

最新推荐文章于 2023-03-02 16:27:44 发布

gogly

最新推荐文章于 2023-03-02 16:27:44 发布

阅读量5.9k

点赞数

分类专栏： linux协议栈文章标签： dst struct output header network null

linux协议栈专栏收录该内容

13 篇文章 1 订阅

订阅专栏

发送流程为：dst_output ---> ip6_output ---> ip6_output2 ---> ip6_output_finish

转发流程为：ip6_forward ---> ip6_forward_finish ---> dst_output ---> ip6_output ---> ip6_output2 ---> ip6_output_finish

==###########################################################################==

//如果需要转发数据包，则调用ip6_forward执行转发过程，最后通过ip6_forward_finish函数把数据包交给ipv6模块的ip6_output函数，进入发送流程。

==###########################################################################==

最终生成的IP数据报的路由称为目的入口(dst_entry)，目的入口反映了相邻的外部主机在主机内部的一种“映象”，目的入口在内核中的定义如下

struct dst_entry

{

structrcu_head rcu_head;

structdst_entry *child;

structnet_device *dev;

short error;

short obsolete;

int flags;

#define DST_HOST 1

#define DST_NOXFRM 2

#define DST_NOPOLICY 4

#define DST_NOHASH 8

unsignedlong expires;

unsignedshort header_len; /* more space at head required */

unsignedshort trailer_len; /* space to reserve at tail */

unsignedint rate_tokens;

unsignedlong rate_last; /* rate limiting for ICMP */

structdst_entry *path;

structneighbour *neighbour;

structhh_cache *hh;

#ifdef CONFIG_XFRM

structxfrm_state *xfrm;

#else

void *__pad1;

#endif

int (*input)(structsk_buff*);

int (*output)(structsk_buff*);

struct dst_ops *ops;

u32 metrics[RTAX_MAX];

#ifdef CONFIG_NET_CLS_ROUTE

__u32 tclassid;

#else

__u32 __pad2;

#endif

* Align __refcnt to a 64 bytes alignment

* (L1_CACHE_SIZE would be too much)

#ifdef CONFIG_64BIT

long __pad_to_align_refcnt[2];

#else

long __pad_to_align_refcnt[1];

#endif

* __refcnt wants to be on a different cacheline from

* input/output/ops or performance tanks badly

atomic_t __refcnt; /* client references */

int __use;

unsignedlong lastuse;

union{

structdst_entry *next;

structrtable *rt_next;

structrt6_info *rt6_next;

structdn_route *dn_next;

};

如果接收到需要转发给其他主机的数据包，则ip6_rcv_finish通过dst_input接口把数据包传递给函数ip6_forward。该函数执行一些检测：确定设备是否支持转发、判断跳数限制是否已经耗尽等。最后调用ip6_forward_finish执行转发。

int ip6_forward(struct sk_buff *skb)

{

structdst_entry *dst = skb_dst(skb);

structipv6hdr *hdr = ipv6_hdr(skb);

structinet6_skb_parm *opt = IP6CB(skb);

structnet *net = dev_net(dst->dev);

//检测设备是否支持转发IPv6数据包

if(net->ipv6.devconf_all->forwarding == 0)

gotoerror;

if(skb_warn_if_lro(skb))

gotodrop;

//ipsec策略检测

if(!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {

IP6_INC_STATS(net,ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);

gotodrop;

}

static inline void skb_forward_csum(structsk_buff *skb)

{

/*Unfortunately we don't support this one. Any brave souls? */

if(skb->ip_summed == CHECKSUM_COMPLETE)

skb->ip_summed= CHECKSUM_NONE;

}

skb_forward_csum(skb);

* WeDO NOT make any processing on

* RApackets, pushing them to user level AS IS

* withoutane WARRANTY that application will be able

* tointerpret them. The reason is that we

* cannotmake anything clever here.

* Weare not end-node, so that if packet contains

* AH/ESP,we cannot make anything.

* Defragmentationalso would be mistake, RA packets

* cannotbe fragmented, because there is no warranty

* thatdifferent fragments will go along one path. --ANK

*对RA数据包不做处理，提交给用户态。

staticint ip6_call_ra_chain(struct sk_buff *skb, int sel)

{

struct ip6_ra_chain *ra;

struct sock *last = NULL;

read_lock(&ip6_ra_lock);

for (ra = ip6_ra_chain; ra; ra =ra->next) {

struct sock *sk = ra->sk;

if (sk && ra->sel ==sel &&

(!sk->sk_bound_dev_if ||

sk->sk_bound_dev_if ==skb->dev->ifindex)) {

if (last) {

struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);

if (skb2)

rawv6_rcv(last,skb2);

}

last = sk;

}

if (last) {

rawv6_rcv(last, skb);

read_unlock(&ip6_ra_lock);

return 1;

}

read_unlock(&ip6_ra_lock);

return 0;

}

if(opt->ra) {

u8*ptr = skb_network_header(skb) + opt->ra;

if(ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))

return0;

}

//检查和递减TTL

* checkand decrement ttl

//如果跳数限制小于1，则发出icmpv6_time_exceed消息

if(hdr->hop_limit <= 1) {

/*Force OUTPUT device used as source address */

skb->dev= dst->dev;

icmpv6_send(skb,ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,

0, skb->dev);

IP6_INC_STATS_BH(net,

ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

kfree_skb(skb);

return-ETIMEDOUT;

}

/*XXX: idev->cnf.proxy_ndp? */

if(net->ipv6.devconf_all->proxy_ndp &&

pneigh_lookup(&nd_tbl, net,&hdr->daddr, skb->dev, 0)) {

intproxied = ip6_forward_proxy_check(skb);

if(proxied > 0)

returnip6_input(skb);

elseif (proxied < 0) {

IP6_INC_STATS(net,ip6_dst_idev(dst),

IPSTATS_MIB_INDISCARDS);

gotodrop;

}

//ipsec路由转发

if(!xfrm6_route_forward(skb)) {

IP6_INC_STATS(net,ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);

gotodrop;

}

dst= skb_dst(skb);

/*IPv6 specs 规格say nothing about it, but it is clear that we cannot

send redirects to source routed frames.

We don't send redirects to framesdecapsulated拆分 from IPsec.

if(skb->dev == dst->dev && dst->neighbour &&opt->srcrt == 0 &&

!skb_sec_path(skb)) {

structin6_addr *target = NULL;

structrt6_info *rt;

structneighbour *n = dst->neighbour;

* incomingand outgoing devices are the same

* senda redirect.

rt= (struct rt6_info *) dst;

if((rt->rt6i_flags & RTF_GATEWAY))

target= (struct in6_addr*)&n->primary_key;

else

target= &hdr->daddr;

/*Limit redirects both by destination (here)

and by source (inside ndisc_send_redirect)

if(xrlim_allow(dst, 1*HZ))

ndisc_send_redirect(skb,n, target);

}else {

intaddrtype = ipv6_addr_type(&hdr->saddr);

//丢弃源地址是多播、环回和本地链路类型的数据包

/*This check is security critical. */

if(addrtype == IPV6_ADDR_ANY ||

addrtype & (IPV6_ADDR_MULTICAST |IPV6_ADDR_LOOPBACK))

gotoerror;

if(addrtype & IPV6_ADDR_LINKLOCAL) {

icmpv6_send(skb,ICMPV6_DEST_UNREACH,

ICMPV6_NOT_NEIGHBOUR,0, skb->dev);

gotoerror;

}

//如果数据包长度大于MTU，发送ICMPV6_PKT_TOOBIG消息

if(skb->len > dst_mtu(dst)) {

/*Again, force OUTPUT device used as source address */

skb->dev= dst->dev;

icmpv6_send(skb,ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);

IP6_INC_STATS_BH(net,

ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);

IP6_INC_STATS_BH(net,

ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);

kfree_skb(skb);

return-EMSGSIZE;

}

//一般而言，skb通过引用计数实现共享，前提是大家不能修改skb head 和data的内容。如果需要修改的话，就有必要调用skb_cow重新申请一个啦

if(skb_cow(skb, dst->dev->hard_header_len)) {

IP6_INC_STATS(net,ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);

gotodrop;

}

//获取ip头部

hdr= ipv6_hdr(skb);

/*Mangling hops number delayed to point after skb COW */

//跳数限制减一

hdr->hop_limit--;

IP6_INC_STATS_BH(net,ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

//调用ip6_forward_finish完成转发最后的操作

returnNF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,

ip6_forward_finish);

error:

IP6_INC_STATS_BH(net,ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);

drop:

kfree_skb(skb);

return-EINVAL;

}

/*
 * Final step of forwarding: hand the packet to dst_output() and return
 * its result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

/*
 * Output packet to network from transport.
 *
 * Calls the output function pointer of the dst_entry attached to @skb and
 * returns whatever that function returns.
 */
static inline int dst_output(struct sk_buff *skb)
{
	return skb_dst(skb)->output(skb);
}

==#################################################################====

数据包发送流程

dst_output 通过路由项（dst_entry）中注册的 output 函数指针发送数据包；对于 IPv6，该指针指向 ip6_output。

/*
 * Invoke the output function registered in the dst_entry attached to
 * @skb and return its result.
 */
static inline int dst_output(struct sk_buff *skb)
{
	return skb_dst(skb)->output(skb);
}

intip6_output(struct sk_buff *skb)

{

structinet6_dev *idev = ip6_dst_idev(skb_dst(skb));

if(unlikely(idev->cnf.disable_ipv6)) {

IP6_INC_STATS(dev_net(skb_dst(skb)->dev),idev,

IPSTATS_MIB_OUTDISCARDS);

kfree_skb(skb);

return0;

}

//如果需要分片，调用ip6_fragment函数处理

if((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||

dst_allfrag(skb_dst(skb)))

returnip6_fragment(skb, ip6_output2);

else

returnip6_output2(skb);

}

staticint ip6_output2(struct sk_buff *skb)

{

structdst_entry *dst = skb_dst(skb);

structnet_device *dev = dst->dev;

// 把数据包的类型设置为IPv6类型

skb->protocol= htons(ETH_P_IPV6);

skb->dev= dev;

//检查是否为多播地址

if(ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {

//sk_buff->sk这是一个指向拥有这个sk_buff的sock结构的指针。这个指针在网络包由本机发出或者由本机进程接收时有效，因为插口相关的信息被L4(TCP或 UDP)或者用户空间程序使用。如果sk_buff只在转发中使用(这意味着，源地址和目的地址都不是本机地址)，这个指针是NULL

structipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

structinet6_dev *idev = ip6_dst_idev(skb_dst(skb));

if(!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop)&&

((mroute6_socket(dev_net(dev)) &&

!(IP6CB(skb)->flags &IP6SKB_FORWARDED)) ||

ipv6_chk_mcast_addr(dev,&ipv6_hdr(skb)->daddr,

&ipv6_hdr(skb)->saddr))) {

structsk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

/*Do not check for IFF_ALLMULTI; multicast routing

is not supported in any case.

if(newskb)

//调用ip6_dev_loopback_xmit环回发送数据包

NF_HOOK(PF_INET6,NF_INET_POST_ROUTING, newskb,

NULL,newskb->dev,

ip6_dev_loopback_xmit);

if(ipv6_hdr(skb)->hop_limit == 0) {

IP6_INC_STATS(dev_net(dev),idev,

IPSTATS_MIB_OUTDISCARDS);

kfree_skb(skb);

return0;

}

IP6_UPD_PO_STATS(dev_net(dev),idev, IPSTATS_MIB_OUTMCAST,

skb->len);

}

//调用ip6_output_finish进一步处理数据包

returnNF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,

ip6_output_finish);

}

staticint ip6_output_finish(struct sk_buff *skb)

{

/*dst_entry可以理解为路由表的缓冲区,每次主机发送数据时询问路由表后,都会将记录记在一个cache内.dst中有能指向其neighbour的指针,通过neighbour可以找到下一跳地址*/

structdst_entry *dst = skb_dst(skb);

//如果有缓存指针hh，则通过neigh_hh_output发送数据；否则通过dst->neighbour->output发送数据；hh_cache中存储的是链路头的一些相关信息,可以加快数据包的传输(因为有些情况下不用查看路由表,直接到此缓冲区查看).*/

if(dst->hh)

returnneigh_hh_output(dst->hh, skb);

elseif (dst->neighbour)

returndst->neighbour->output(skb);

IP6_INC_STATS_BH(dev_net(dst->dev),

ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);

kfree_skb(skb);

return-EINVAL;

}

staticinline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)

{

unsignedseq;

inthh_len;

do{

inthh_alen;

static __always_inline unsigned read_seqbegin(constseqlock_t *sl)

{

unsignedret;

repeat:

ret =sl->sequence;

smp_rmb();

if(unlikely(ret & 1)) {

cpu_relax();

gotorepeat;

}

returnret;

}

seq= read_seqbegin(&hh->hh_lock);

hh_len= hh->hh_len;

hh_alen= HH_DATA_ALIGN(hh_len);

//将缓冲区数据拷贝到skb中

memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);

}while (read_seqretry(&hh->hh_lock, seq));

skb_push(skb,hh_len);

returnhh->hh_output(skb);

}

/*
 * Extend the used data area of @skb at its start by @len bytes.
 *
 * Moves skb->data back by @len and grows skb->len by the same amount.
 * Calls skb_under_panic() if the data pointer ends up below skb->head.
 * Returns the new skb->data.
 */
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len  += len;
	if (unlikely(skb->data < skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}

==###################################################################==

UDP发送到IP层的函数

intip6_push_pending_frames(struct sock *sk)

{

structsk_buff *skb, *tmp_skb;

structsk_buff **tail_skb;

structin6_addr final_dst_buf, *final_dst = &final_dst_buf;

structinet_sock *inet = inet_sk(sk);

structipv6_pinfo *np = inet6_sk(sk);

structnet *net = sock_net(sk);

structipv6hdr *hdr;

structipv6_txoptions *opt = np->cork.opt;

structrt6_info *rt = (struct rt6_info *)inet->cork.dst;

structflowi *fl = &inet->cork.fl;

unsignedchar proto = fl->proto;

interr = 0;

//检查发送队列是否为空，并返回队首的套接字缓冲区skb

if((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)

gotoout;

tail_skb= &(skb_shinfo(skb)->frag_list);

//如果有扩展头部信息，则调整skb->data指向IP包头部

/*move skb->data to ip header from ext header */

if(skb->data < skb_network_header(skb))

__skb_pull(skb,skb_network_offset(skb));

//遍历套接字发送队列，调整数据长度

while((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {

__skb_pull(tmp_skb,skb_network_header_len(skb));

*tail_skb= tmp_skb;

tail_skb= &(tmp_skb->next);

skb->len+= tmp_skb->len;

skb->data_len+= tmp_skb->len;

skb->truesize+= tmp_skb->truesize;

tmp_skb->destructor= NULL;

tmp_skb->sk= NULL;

}

/*Allow local fragmentation. */

if(np->pmtudisc < IPV6_PMTUDISC_DO)

skb->local_df= 1;

ipv6_addr_copy(final_dst,&fl->fl6_dst);

__skb_pull(skb,skb_network_header_len(skb));

//填充ipv6的扩展头部

if(opt && opt->opt_flen)

ipv6_push_frag_opts(skb,opt, &proto);

if(opt && opt->opt_nflen)

ipv6_push_nfrag_opts(skb,opt, &proto, &final_dst);

//记录IPv6头部起始位置

skb_push(skb,sizeof(struct ipv6hdr));

skb_reset_network_header(skb);

hdr= ipv6_hdr(skb);

//设置流标签

*(__be32*)hdr= fl->fl6_flowlabel |

htonl(0x60000000 |((int)np->cork.tclass << 20));

//设置跳数限制

hdr->hop_limit= np->cork.hop_limit;

//设置下一个包头

hdr->nexthdr= proto;

//为ipv6设置地址

ipv6_addr_copy(&hdr->saddr,&fl->fl6_src);

ipv6_addr_copy(&hdr->daddr,final_dst);

//设置属性和子网掩码

skb->priority= sk->sk_priority;

skb->mark= sk->sk_mark;

//给套接字缓冲区skb指定路由表项信息；为数据包的进入ipv6发送流程设置具体的方法

skb_dst_set(skb,dst_clone(&rt->u.dst));

IP6_UPD_PO_STATS(net,rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);

if(proto == IPPROTO_ICMPV6) {

structinet6_dev *idev = ip6_dst_idev(skb_dst(skb));

ICMP6MSGOUT_INC_STATS_BH(net,idev, icmp6_hdr(skb)->icmp6_type);

ICMP6_INC_STATS_BH(net,idev, ICMP6_MIB_OUTMSGS);

}

//程序执行到这里，已经为dst_output配置完skb的处理信息，内核将从这里跳转到dst_output函数，通过ip6_output函数进入ipv6流程

err= ip6_local_out(skb);

if(err) {

if(err > 0)

err= np->recverr ? net_xmit_errno(err) : 0;

if(err)

gotoerror;

}

out:

ip6_cork_release(inet,np);

returnerr;

error:

gotoout;

}

/*
 * Send a locally generated IPv6 packet.
 *
 * Runs __ip6_local_out(); when that returns 1 the packet is passed on to
 * dst_output() and its result is returned.  Any other value is returned
 * unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}

/*
 * Fill in the payload length of the IPv6 header and run the
 * NF_INET_LOCAL_OUT netfilter hook.  Returns the value of nf_hook().
 */
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	/*
	 * payload_len is a 16-bit field; a length above IPV6_MAXPLEN is
	 * written as 0.
	 */
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

==###################################################################==

TCP发送到IP层的函数

intip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,

struct ipv6_txoptions *opt, int ipfragok)

{

structnet *net = sock_net(sk);

structipv6_pinfo *np = inet6_sk(sk);

structin6_addr *first_hop = &fl->fl6_dst;

structdst_entry *dst = skb_dst(skb);

structipv6hdr *hdr;

u8 proto = fl->proto;

intseg_len = skb->len;

inthlimit, tclass;

u32mtu;

//如果需要填充ipv6扩展头部，则调整存储头部空间

if(opt) {

unsignedint head_room;

/*First: exthdrs may take lots of space (~8K for now)

MAX_HEADER is not enough.

head_room= opt->opt_nflen + opt->opt_flen;

seg_len+= head_room;

head_room+= sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

if(skb_headroom(skb) < head_room) {

structsk_buff *skb2 = skb_realloc_headroom(skb, head_room);

if(skb2 == NULL) {

IP6_INC_STATS(net,ip6_dst_idev(skb_dst(skb)),

IPSTATS_MIB_OUTDISCARDS);

kfree_skb(skb);

return-ENOBUFS;

}

kfree_skb(skb);

skb= skb2;

if(sk)

skb_set_owner_w(skb,sk);

}

//填充IPv6的扩展头部信息

if(opt->opt_flen)

ipv6_push_frag_opts(skb,opt, &proto);

if(opt->opt_nflen)

ipv6_push_nfrag_opts(skb,opt, &proto, &first_hop);

}

//记录ipv6头部的起始位置

skb_push(skb,sizeof(struct ipv6hdr));

skb_reset_network_header(skb);

hdr= ipv6_hdr(skb);

//设置分片运行标志

/*Allow local fragmentation. */

if(ipfragok)

skb->local_df= 1;

* Fillin the IPv6 header

//计算跳转限制

hlimit= -1;

if(np)

hlimit= np->hop_limit;

if(hlimit < 0)

hlimit= ip6_dst_hoplimit(dst);

tclass= -1;

if(np)

tclass= np->tclass;

if(tclass < 0)

tclass= 0;

//设置流标签

*(__be32*)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

//设置载荷长度，下一个扩展头协议，跳转限制

hdr->payload_len= htons(seg_len);

hdr->nexthdr= proto;

hdr->hop_limit= hlimit;

//设置ipv6头部得ip地址，属性，子网掩码

ipv6_addr_copy(&hdr->saddr,&fl->fl6_src);

ipv6_addr_copy(&hdr->daddr,first_hop);

skb->priority= sk->sk_priority;

skb->mark= sk->sk_mark;

//考虑MTU值，如果包太大，就要发送ICMPV6_PKT_TOOBIG消息

mtu= dst_mtu(dst);

if((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {

IP6_UPD_PO_STATS(net,ip6_dst_idev(skb_dst(skb)),

IPSTATS_MIB_OUT, skb->len);

returnNF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,

dst_output);

}

if(net_ratelimit())

printk(KERN_DEBUG"IPv6: sending pkt_too_big to self\n");

skb->dev= dst->dev;

icmpv6_send(skb,ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);

IP6_INC_STATS(net,ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);

kfree_skb(skb);

return-EMSGSIZE;

}

gogly

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录