staticunsignedintip_vs_in(unsignedint hooknum,struct sk_buff *skb,conststruct net_device *in,conststruct net_device *out,int(*okfn)(struct sk_buff *)){struct ip_vs_iphdr iph;struct ip_vs_protocol *pp;struct ip_vs_conn *cp;int ret, restart, af, pkts;int v = NF_DROP;int res_dir;
af =(skb->protocol ==htons(ETH_P_IP))? AF_INET : AF_INET6;ip_vs_fill_iphdr(af,skb_network_header(skb),&iph);/*
* Big tappo: only PACKET_HOST, including loopback for local client
* Don't handle local packets on IPv6 for now
*/if(unlikely(skb->pkt_type != PACKET_HOST)){IP_VS_DBG_BUF(12,"packet type=%d proto=%d daddr=%s ignored\n",
skb->pkt_type,
iph.protocol,IP_VS_DBG_ADDR(af,&iph.daddr));return NF_ACCEPT;}#ifdef CONFIG_IP_VS_IPV6if(af == AF_INET6){if(unlikely(iph.protocol == IPPROTO_ICMPV6)){int related, verdict =ip_vs_in_icmp_v6(skb,&related, hooknum);if(related)return verdict;ip_vs_fill_iphdr(af,skb_network_header(skb),&iph);}}else#endifif(unlikely(iph.protocol == IPPROTO_ICMP)){int related, verdict =ip_vs_in_icmp(skb,&related, hooknum);if(related)return verdict;ip_vs_fill_iphdr(af,skb_network_header(skb),&iph);}/* Protocol supported? */
pp =ip_vs_proto_get(iph.protocol);// TCP为ip_vs_protocol_tcpif(unlikely(!pp))return NF_ACCEPT;/*
* Check if the packet belongs to an existing connection entry
*/
cp = pp->conn_in_get(af, skb, pp,&iph, iph.len,0,&res_dir);// 查找连接,TCP为tcp_conn_in_get()if(likely(cp)){/* For full-nat/local-client packets, it could be a response */if(res_dir == IP_VS_CIDX_F_IN2OUT){returnhandle_response(af, skb, pp, cp, iph.len);}}else{/* create a new connection */int v;if(!pp->conn_schedule(af, skb, pp,&v,&cp))// 调度RS并创建连接,TCP为tcp_conn_schedule()return v;}if(unlikely(!cp)){/* sorry, all this trouble for a no-hit :) */IP_VS_DBG_PKT(12, pp, skb,0,"packet continues traversal as normal");return NF_ACCEPT;}IP_VS_DBG_PKT(11, pp, skb,0,"Incoming packet");/* Check the server status */if(cp->dest &&!(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)){/* the destination server is not available */if(sysctl_ip_vs_expire_nodest_conn){/* try to expire the connection immediately */ip_vs_conn_expire_now(cp);}/* don't restart its timer, and silently
drop the packet. */__ip_vs_conn_put(cp);return NF_DROP;}ip_vs_in_stats(cp, skb);/*
* Filter out-in ack packet when cp is at SYN_SENT state.
* DROP it if not a valid packet, STORE it if we have
* space left.
*/if((cp->flags & IP_VS_CONN_F_SYNPROXY)&&(0==ip_vs_synproxy_filter_ack(skb, cp, pp,&iph,&v))){ip_vs_conn_put(cp);return v;}/*
* "Reuse" syn-proxy sessions.
* "Reuse" means update syn_proxy_seq struct and clean ack_skb etc.
*/if((cp->flags & IP_VS_CONN_F_SYNPROXY)&&(0!= sysctl_ip_vs_synproxy_conn_reuse)){int v = NF_DROP;if(0==ip_vs_synproxy_reuse_conn(af, skb, cp, pp,&iph,&v)){ip_vs_conn_put(cp);return v;}}
restart =ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);if(cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp);// Full-NAT为ip_vs_fnat_xmit()/* do not touch skb anymore */else{IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;}/* Increase its packet counter and check if it is needed
* to be synchronized
*
* Sync connection if it is about to close to
* encorage the standby servers to update the connections timeout
*/
pkts =atomic_add_return(1,&cp->in_pkts);if(af == AF_INET &&(ip_vs_sync_state & IP_VS_STATE_MASTER)&&(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED)&&(pkts % sysctl_ip_vs_sync_threshold[1]== sysctl_ip_vs_sync_threshold[0]))||((cp->protocol == IPPROTO_TCP)&&(cp->old_state != cp->state)&&((cp->state == IP_VS_TCP_S_FIN_WAIT)||(cp->state == IP_VS_TCP_S_CLOSE_WAIT)||(cp->state == IP_VS_TCP_S_TIME_WAIT)))))ip_vs_sync_conn(cp);
cp->old_state = cp->state;ip_vs_conn_put(cp);return ret;}
staticinttcp_conn_schedule(int af,struct sk_buff *skb,struct ip_vs_protocol *pp,int*verdict,struct ip_vs_conn **cpp){struct ip_vs_service *svc;struct tcphdr _tcph,*th;struct ip_vs_iphdr iph;ip_vs_fill_iphdr(af,skb_network_header(skb),&iph);
th =skb_header_pointer(skb, iph.len,sizeof(_tcph),&_tcph);if(th ==NULL){*verdict = NF_DROP;return0;}/*
* Syn-proxy step 2 logic: receive client's
* 3-handshake Ack packet
*/if(ip_vs_synproxy_ack_rcv(af, skb, th, pp, cpp,&iph, verdict)==0){return0;}if(th->syn &&!th->ack &&!th->fin &&!th->rst &&(svc =ip_vs_service_get(af, skb->mark, iph.protocol,&iph.daddr,
th->dest))){// proto、vip、vport相同if(ip_vs_todrop()){/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/ip_vs_service_put(svc);*verdict = NF_DROP;return0;}/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/*cpp =ip_vs_schedule(svc, skb,0);if(!*cpp){*verdict =ip_vs_leave(svc, skb, pp);return0;}ip_vs_service_put(svc);return1;}/* drop tcp packet which send to vip and !vport */if(sysctl_ip_vs_tcp_drop_entry &&(svc =ip_vs_lookup_vip(af, iph.protocol,&iph.daddr))){IP_VS_INC_ESTATS(ip_vs_esmib, DEFENCE_TCP_DROP);*verdict = NF_DROP;return0;}return1;}
ip_vs_schedule()
struct ip_vs_conn *ip_vs_schedule(struct ip_vs_service *svc,struct sk_buff *skb,int is_synproxy_on){struct ip_vs_conn *cp =NULL;struct ip_vs_iphdr iph;struct ip_vs_dest *dest;
__be16 _ports[2],*pptr;ip_vs_fill_iphdr(svc->af,skb_network_header(skb),&iph);
pptr =skb_header_pointer(skb, iph.len,sizeof(_ports), _ports);if(pptr ==NULL)returnNULL;/*
* Persistent service
*/if(svc->flags & IP_VS_SVC_F_PERSISTENT)returnip_vs_sched_persist(svc, skb, pptr, is_synproxy_on);/*
* Non-persistent service
*/if(!svc->fwmark && pptr[1]!= svc->port){if(!svc->port)pr_err("Schedule: port zero only supported ""in persistent services, ""check your ipvs configuration\n");returnNULL;}
dest = svc->scheduler->schedule(svc, skb);// SH为ip_vs_sh_schedule()if(dest ==NULL){IP_VS_DBG(1,"Schedule: no dest found.\n");returnNULL;}/*
* Create a connection entry.
*/
cp =ip_vs_conn_new(svc->af, iph.protocol,&iph.saddr, pptr[0],&iph.daddr, pptr[1],&dest->addr, dest->port ? dest->port : pptr[1],ip_vs_onepacket_enabled(svc,&iph),
dest, skb, is_synproxy_on);if(cp ==NULL)returnNULL;IP_VS_DBG_BUF(6,"Schedule fwd:%c c:%s:%u v:%s:%u ""d:%s:%u conn->flags:%X conn->refcnt:%d\n",ip_vs_fwd_tag(cp),IP_VS_DBG_ADDR(svc->af,&cp->caddr),ntohs(cp->cport),IP_VS_DBG_ADDR(svc->af,&cp->vaddr),ntohs(cp->vport),IP_VS_DBG_ADDR(svc->af,&cp->daddr),ntohs(cp->dport),
cp->flags,atomic_read(&cp->refcnt));ip_vs_conn_stats(cp, svc);return cp;}
ip_vs_conn_new()
/*
 * ip_vs_conn_new - allocate, initialise and hash a new connection entry.
 *
 * @af:             address family.
 * @proto:          protocol number (IPPROTO_IP when daddr is a fwmark).
 * @caddr, @cport:  client address and port.
 * @vaddr, @vport:  virtual address and port.
 * @daddr, @dport:  destination (real server) address and port.
 * @flags:          initial connection flags.
 * @dest:           destination to bind the connection to.
 * @skb:            packet that triggered the creation; queued on the
 *                  connection when syn-proxy is on.
 * @is_synproxy_on: 1 to set up the syn-proxy members from @skb.
 *
 * Returns the connection with refcnt set to 1, or NULL on allocation
 * failure, when the TCP header cannot be read for syn-proxy, or when no
 * local address/port can be bound.
 */
struct ip_vs_conn *ip_vs_conn_new(int af, int proto,
				  const union nf_inet_addr *caddr, __be16 cport,
				  const union nf_inet_addr *vaddr, __be16 vport,
				  const union nf_inet_addr *daddr, __be16 dport,
				  unsigned flags, struct ip_vs_dest *dest,
				  struct sk_buff *skb, int is_synproxy_on)
{
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
	struct ip_vs_conn_idx *ci_idx, *co_idx;
	struct tcphdr _tcph, *th;

	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("%s(): no memory\n", __func__);
		return NULL;
	}

	/*
	 * The two connection indexes are placed directly behind the
	 * connection inside the same cache object.
	 * NOTE(review): assumes ip_vs_conn_cachep objects are sized for
	 * struct ip_vs_conn plus two struct ip_vs_conn_idx - confirm.
	 */

	/* init connection index of OUTside2INside */
	ci_idx = (struct ip_vs_conn_idx *)(((__u8 *)cp) +
					   sizeof(struct ip_vs_conn));
	INIT_LIST_HEAD(&ci_idx->c_list);
	ci_idx->af = af;
	ci_idx->protocol = proto;
	ip_vs_addr_copy(af, &ci_idx->s_addr, caddr);
	ci_idx->s_port = cport;
	ip_vs_addr_copy(af, &ci_idx->d_addr, vaddr);
	ci_idx->d_port = vport;
	ci_idx->flags |= IP_VS_CIDX_F_OUT2IN;
	ci_idx->cp = cp;

	/*
	 * init connection index of INside2OUTside; its d_addr/d_port are
	 * not set here.
	 */
	co_idx = (struct ip_vs_conn_idx *)(((__u8 *)cp) +
					   sizeof(struct ip_vs_conn) +
					   sizeof(struct ip_vs_conn_idx));
	INIT_LIST_HEAD(&co_idx->c_list);
	co_idx->af = af;
	co_idx->protocol = proto;
	ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
			&co_idx->s_addr, daddr);
	co_idx->s_port = dport;
	co_idx->flags |= IP_VS_CIDX_F_IN2OUT;
	co_idx->cp = cp;

	/* now init connection */
	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
	cp->af = af;
	cp->protocol = proto;				/* proto */
	ip_vs_addr_copy(af, &cp->caddr, caddr);		/* cip */
	cp->cport = cport;				/* cport */
	ip_vs_addr_copy(af, &cp->vaddr, vaddr);		/* vip */
	cp->vport = vport;				/* vport */
	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
	ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
			&cp->daddr, daddr);		/* rip */
	cp->dport = dport;				/* rport */
	cp->flags = flags;
	spin_lock_init(&cp->lock);
	cp->in_idx = ci_idx;
	cp->out_idx = co_idx;

	/*
	 * Set the entry is referenced by the current thread before hashing
	 * it in the table, so that other thread run ip_vs_random_dropentry
	 * but cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);

	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	atomic_inc(&ip_vs_conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	/* Bind the connection with a destination server (sets cp->dest) */
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	cp->state = 0;		/* IP_VS_TCP_S_NONE */
	cp->timeout = 3 * HZ;	/* 3 seconds */

	/*
	 * Bind its packet transmitter; for Full-NAT this gives
	 * cp->packet_xmit = ip_vs_fnat_xmit.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ip_vs_bind_xmit_v6(cp);
	else
#endif
		ip_vs_bind_xmit(cp);

	if (unlikely(pp && atomic_read(&pp->appcnt)))
		ip_vs_bind_app(cp, pp);

	/*
	 * Set syn-proxy members.
	 * Set cp->flags manually to avoid svc->flags change when
	 * ack_skb is on the way.
	 */
	skb_queue_head_init(&cp->ack_skb);
	atomic_set(&cp->syn_retry_max, 0);
	if (is_synproxy_on == 1 && skb != NULL) {
		/*
		 * NOTE(review): the TCP header offset is taken from the IPv4
		 * header length regardless of af - confirm syn-proxy is only
		 * reached for AF_INET here.
		 */
		th = skb_header_pointer(skb, ip_hdr(skb)->ihl * 4,
					sizeof(_tcph), &_tcph);
		if (th == NULL) {
			IP_VS_ERR_RL("%s(): get tcphdr failed\n", __func__);
			ip_vs_conn_del(cp);
			return NULL;
		}
		/* Set syn-proxy flag */
		cp->flags |= IP_VS_CONN_F_SYNPROXY;

		/* Save ack packet */
		skb_queue_tail(&cp->ack_skb, skb);
		/* Save ack_seq - 1 */
		cp->syn_proxy_seq.init_seq =
		    htonl((__u32)((htonl(th->ack_seq) - 1)));

		/*
		 * Use IP_VS_TCP_S_SYN_SENT for syn.
		 * NOTE(review): pp is dereferenced unchecked although it is
		 * treated as possibly NULL above - confirm the protocol is
		 * always registered when syn-proxy is on.
		 */
		cp->timeout = pp->timeout_table[cp->state =
						IP_VS_TCP_S_SYN_SENT];
	} else {
		/* Unset syn-proxy flag */
		cp->flags &= ~IP_VS_CONN_F_SYNPROXY;
	}

	/*
	 * bind the connection with a local address (local address and port
	 * are chosen there) and hash it in the ip_vs_conn_tab finally.
	 */
	if (unlikely(ip_vs_hbind_laddr(cp) == 0)) {
		IP_VS_ERR_RL("bind local address: no port available\n");
		ip_vs_conn_del(cp);
		return NULL;
	}

	return cp;
}
ip_vs_hbind_laddr()
staticinlineintip_vs_hbind_laddr(struct ip_vs_conn *cp){struct ip_vs_dest *dest = cp->dest;struct ip_vs_service *svc = dest->svc;struct ip_vs_laddr *local;int ret =0;int remaining, i, tport, hit =0;unsigned ihash, ohash;struct ip_vs_conn_idx *cidx;/* fwd methods: not IP_VS_CONN_F_FULLNAT */switch(IP_VS_FWD_METHOD(cp)){case IP_VS_CONN_F_MASQ:case IP_VS_CONN_F_TUNNEL:case IP_VS_CONN_F_DROUTE:case IP_VS_CONN_F_LOCALNODE:case IP_VS_CONN_F_BYPASS:ip_vs_addr_copy(cp->af,&cp->out_idx->d_addr,&cp->caddr);
cp->out_idx->d_port = cp->cport;ip_vs_addr_copy(cp->af,&cp->laddr,&cp->caddr);
cp->lport = cp->cport;
cp->local =NULL;ip_vs_conn_hash(cp);
ret =1;goto out;}if(cp->flags & IP_VS_CONN_F_TEMPLATE){ip_vs_addr_copy(cp->af,&cp->out_idx->d_addr,&cp->caddr);
cp->out_idx->d_port = cp->cport;ip_vs_addr_copy(cp->af,&cp->laddr,&cp->caddr);
cp->lport = cp->cport;
cp->local =NULL;ip_vs_conn_hash(cp);
ret =1;goto out;}/*
* fwd methods: IP_VS_CONN_F_FULLNAT
*//* choose a local address by round-robin */
local =ip_vs_get_laddr(svc);if(local !=NULL){/*OUTside2INside: hashed by client address and port, virtual address and port */
ihash =ip_vs_conn_hashkey(cp->af,&cp->caddr, cp->cport,&cp->vaddr, cp->vport);/* increase the refcnt counter of the local address */ip_vs_laddr_hold(local);ip_vs_addr_copy(cp->af,&cp->out_idx->d_addr,&local->addr);ip_vs_addr_copy(cp->af,&cp->laddr,&local->addr);
remaining = sysctl_ip_vs_lport_max - sysctl_ip_vs_lport_min +1;for(i =0; i < sysctl_ip_vs_lport_tries; i++){/* choose a port */
tport =
sysctl_ip_vs_lport_min +atomic64_inc_return(&local->port)% remaining;
cp->out_idx->d_port = cp->lport =htons(tport);/* init hit everytime before lookup the tuple */
hit =0;/*INside2OUTside: hashed by destination address and port, local address and port */
ohash =ip_vs_conn_hashkey(cp->af,&cp->daddr, cp->dport,&cp->laddr, cp->lport);/* lock the conntab bucket */ip_vs_conn_lock2(ihash, ohash);/*
* check local address and port is valid by lookup connection table
*/list_for_each_entry(cidx,&ip_vs_conn_tab[ohash],
c_list){if(cidx->af == cp->af
&&ip_vs_addr_equal(cp->af,&cp->daddr,&cidx->s_addr)&&ip_vs_addr_equal(cp->af,&cp->laddr,&cidx->d_addr)&& cp->dport == cidx->s_port
&& cp->lport == cidx->d_port
&& cp->protocol == cidx->protocol){/* HIT */atomic64_inc(&local->port_conflict);
hit =1;break;}}if(hit ==0){
cp->local = local;/* hashed */__ip_vs_conn_hash(cp, ihash, ohash);ip_vs_conn_unlock2(ihash, ohash);atomic_inc(&local->conn_counts);
ret =1;goto out;}ip_vs_conn_unlock2(ihash, ohash);}if(ret ==0){ip_vs_laddr_put(local);}}
ret =0;
out:return ret;}
ip_vs_fnat_xmit()
intip_vs_fnat_xmit(struct sk_buff *skb,struct ip_vs_conn *cp,struct ip_vs_protocol *pp){struct rtable *rt;/* Route to the other host */int mtu;struct iphdr *iph =ip_hdr(skb);EnterFunction(10);/* check if it is a connection of no-client-port */if(unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)){
__be16 _pt,*p;
p =skb_header_pointer(skb, iph->ihl *4,sizeof(_pt),&_pt);if(p ==NULL)goto tx_error;ip_vs_conn_fill_cport(cp,*p);IP_VS_DBG(10,"filled cport=%d\n",ntohs(*p));}if(!(rt =__ip_vs_get_out_rt(cp,RT_TOS(iph->tos))))goto tx_error_icmp;/* MTU checking */
mtu =dst_mtu(&rt->u.dst);if((skb->len > mtu)&&(iph->frag_off &htons(IP_DF))){ip_rt_put(rt);icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,htonl(mtu));IP_VS_DBG_RL_PKT(0, pp, skb,0,"ip_vs_nat_xmit(): frag needed for");goto tx_error;}/* copy-on-write the packet before mangling it */if(!skb_make_writable(skb,sizeof(struct iphdr)))goto tx_error_put;if(skb_cow(skb, rt->u.dst.dev->hard_header_len))goto tx_error_put;/* drop old route */skb_dst_drop(skb);skb_dst_set(skb,&rt->u.dst);/* mangle the packet */if(pp->fnat_in_handler &&!pp->fnat_in_handler(&skb, pp, cp))// TCP为tcp_fnat_in_handler()goto tx_error;ip_hdr(skb)->saddr = cp->laddr.ip;ip_hdr(skb)->daddr = cp->daddr.ip;ip_send_check(ip_hdr(skb));IP_VS_DBG_PKT(10, pp, skb,0,"After FNAT-IN");/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. *//* Another hack: avoid icmp_send in ip_fragment */
skb->local_df =1;IP_VS_XMIT(PF_INET, skb, rt);// IP_VS_XMIT -> NF_HOOK -> dst_output()LeaveFunction(10);return NF_STOLEN;
tx_error_icmp:dst_link_failure(skb);
tx_error:LeaveFunction(10);kfree_skb(skb);return NF_STOLEN;
tx_error_put:ip_rt_put(rt);goto tx_error;}