cilium native-routing模式流程分析

本文分析cilium native routing模式下的报文路径和涉及到的ebpf源码分析。实验环境在vxlan模式下修改参数后而来。native routing和vxlan模式的区别主要是跨节点通信时,vxlan模式需要封装,native routing模式需要根据目的ip查找路由表转发到其他节点。

参数修改

通过修改cilium configmap的如下几个参数后,删除旧的cilium pod后即可修改为native routing模式

auto-direct-node-routes: "true"
ipv4-native-routing-cidr: 10.0.0.0/8
tunnel: disabled

拓扑图如下

(此处为拓扑图插图:pod1 的 eth0 通过 veth 对接到 host 上的 lxc1,跨节点流量经 enp0s8 按路由表转发;图片在本文环境中未能显示)

流程分析

本文只分析跨host两个pod之间的通信过程。

从pod1发送的arp请求报文

报文路径
从pod1发送的arp请求报文,会被lxc1的ebpf程序处理,将lxc1的mac回复给pod1,报文路径如下
eth0(pod1) -> lxc1(tc ingress:from-container) -> eth0(pod1)

ebpf程序分析

//tc ingress:from-container
handle_xgress
	tail_handle_arp
		arp_respond(ctx, &mac, tip, &smac, sip, 0);
			arp_prepare_response(ctx, smac, sip, dmac, tip);
			//direction为0,表示ingress
			//将报文重定向到接口 ctx_get_ifindex(ctx),即接收报文的接口
			ctx_redirect(ctx, ctx_get_ifindex(ctx), direction);
从pod1发送的ip报文路径如下

报文路径
eth0(pod1) -> lxc1(tc ingress:from-container) -> route table -> enp0s8(master)

eth0(pod1):

	//pod内部发出的报文,最终会调用veth_xmit发出
	//根据veth原理,会将报文的dev改成对端dev(即lxc)后调用netif_rx走host协议栈
	static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
		struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
		struct net_device *rcv;
		rcv = rcu_dereference(priv->peer);
		veth_forward_skb(rcv, skb, rq, rcv_xdp)
			netif_rx(skb)

lxc1(tc ingress:from-container):

	//在协议栈入口调用sch_handle_ingress执行ebpf程序,ebpf程序会执行egress policy,如果允许通过返回CTX_ACT_OK
	//ebpf程序处理流程后面会详细介绍
	__netif_receive_skb_core
		...
		if (static_branch_unlikely(&ingress_needed_key)) {
			bool another = false;
			skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another);
				//执行ebpf程序
				switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
				//TC_ACT_OK,则退出继续执行协议栈流程
				case TC_ACT_OK:
				...
				}
		}

route table:

	查找路由表(10.0.1.0/24 via 192.168.56.3 dev enp0s8),获取出接口为enp0s8

ebpf程序分析
最重要的是lxc1(tc ingress:from-container)的ebpf程序

handle_xgress
	send_trace_notify(ctx, TRACE_FROM_LXC, SECLABEL, 0, 0, 0,
			  TRACE_REASON_UNKNOWN, TRACE_PAYLOAD_LEN);

	ep_tail_call(ctx, CILIUM_CALL_IPV4_FROM_LXC); //tail_handle_ipv4
		__tail_handle_ipv4
			tail_handle_ipv4_cont
				handle_ipv4_from_lxc
					//根据目的ip到cilium_ipcache map中查找,cilium_ipcache中保存的是整个集群所有的pod信息
					info = lookup_ip4_remote_endpoint(ip4->daddr);
						ipcache_lookup4(&IPCACHE_MAP, addr, V4_CACHE_KEY_LEN)
					if (info && info->sec_label) {
						//主要用在egress policy中,检查是否允许访问此业务
						*dst_id = info->sec_label;
						tunnel_endpoint = info->tunnel_endpoint;
						encrypt_key = get_min_encrypt_key(info->key);
					} else {
						*dst_id = WORLD_ID;
					}
					//执行egress policy
					verdict = policy_can_egress4(ctx, tuple, SECLABEL, *dst_id, &policy_match_type, &audited);
					...
					//ttl减一
					ipv4_l3(ctx, ETH_HLEN, NULL, (__u8 *)&router_mac.addr, ip4)

					send_trace_notify(ctx, TRACE_TO_STACK, SECLABEL, *dst_id, 0, 0,
							  trace.reason, trace.monitor);
					cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY, 0);
					
					//返回ok,继续走host协议栈处理
					return CTX_ACT_OK;
报文到达node1后的路径如下

报文路径
enp0s8(node1) -> route table -> cilium_host(tc egress:from-host) -> lxc1 -> eth0(pod2)

enp0s8(node1):

	网卡接收到报文后,走host协议栈处理流程

route table:

	查找host路由表,获取出接口为cilium_host,调用__dev_queue_xmit从cilium_host接口发出

cilium_host(tc egress:from-host):

	//调用__dev_queue_xmit时,执行cilium_host接口上挂载的ebpf程序
	//ebpf程序会根据目的ip判断是否为本地pod,
	//ebpf程序处理流程后面会详细介绍
	__dev_queue_xmit
		sch_handle_egress(skb, &rc, dev);
			switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
			...
			case TC_ACT_REDIRECT:
				/* No need to push/pop skb's mac_header here on egress! */
				skb_do_redirect(skb);
				*ret = NET_XMIT_SUCCESS;
				return NULL;
			default:
				break;
			}

ebpf程序分析

//ebpf程序 cilium_host(tc egress:from-host)
__section("from-host")
int from_host(struct __ctx_buff *ctx)
	/* Traffic from the host ns going through cilium_host device must
	 * not be subject to EDT rate-limiting.
	 */
	edt_set_aggregate(ctx, 0);
	return handle_netdev(ctx, true);
		return do_netdev(ctx, proto, from_host);
			if (from_host) {
				enum trace_point trace = TRACE_FROM_HOST;
				//上报TRACE_FROM_HOST事件
				send_trace_notify(ctx, trace, identity, 0, 0,
						  ctx->ingress_ifindex,
						  TRACE_REASON_UNKNOWN, TRACE_PAYLOAD_LEN);
			} else {
				...
			}

			if (from_host) {
				//跳转到 tail_handle_ipv4_from_host
				ep_tail_call(ctx, CILIUM_CALL_IPV4_FROM_HOST);
			} else {
				...
			}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_FROM_HOST)
int tail_handle_ipv4_from_host(struct __ctx_buff *ctx)
	tail_handle_ipv4(ctx, ipcache_srcid, true);
		handle_ipv4(ctx, proxy_identity, ipcache_srcid, from_host);
			//查找目的ip是否为本host上的pod
			/* Lookup IPv4 address in list of local endpoints and host IPs */
			ep = lookup_ip4_endpoint(ip4);
			if (ep) {
				return ipv4_local_delivery(ctx, ETH_HLEN, secctx, ip4, ep, METRIC_INGRESS, from_host);
			}

/* Deliver an IPv4 packet to a local endpoint (pod) on this node.
 *
 * Rewrites L2/L3 headers for the endpoint's veth pair, stores the
 * metadata the per-endpoint policy program needs, then tail-calls into
 * that program via POLICY_CALL_MAP.  On success control never returns
 * here; reaching the final return means the tail call target was
 * missing.
 */
static __always_inline int ipv4_local_delivery(struct __ctx_buff *ctx, int l3_off,
					       __u32 seclabel, struct iphdr *ip4,
					       const struct endpoint_info *ep,
					       __u8 direction __maybe_unused,
					       bool from_host __maybe_unused)
{
	mac_t router_mac = ep->node_mac;
	mac_t lxc_mac = ep->mac;
	int ret;

	/* Decrement TTL, set source MAC to ep->node_mac (host side of the
	 * veth) and destination MAC to ep->mac (pod side of the veth).
	 */
	ret = ipv4_l3(ctx, l3_off, (__u8 *) &router_mac, (__u8 *) &lxc_mac, ip4);

	/* Jumps to destination pod's BPF program to enforce ingress policies. */
	ctx_store_meta(ctx, CB_SRC_LABEL, seclabel);
	ctx_store_meta(ctx, CB_IFINDEX, ep->ifindex);
	ctx_store_meta(ctx, CB_FROM_HOST, from_host ? 1 : 0);

	/* Tail-call the per-endpoint policy program, keyed by ep->lxc_id. */
	tail_call_dynamic(ctx, &POLICY_CALL_MAP, ep->lxc_id);
	return DROP_MISSED_TAIL_CALL;
}

__section_tail(CILIUM_MAP_POLICY, TEMPLATE_LXC_ID)
int handle_policy(struct __ctx_buff *ctx)
	//handle_policy 最终会通过 tail call 调用到 tail_ipv4_policy
	
/* Tail-call target that enforces IPv4 ingress policy for a local pod.
 *
 * Reads the metadata (CB_IFINDEX, CB_SRC_LABEL, CB_FROM_HOST) stored by
 * ipv4_local_delivery before the tail call, then runs ipv4_policy().
 * On allow, ipv4_policy() redirects the packet to the endpoint device.
 */
int tail_ipv4_policy(struct __ctx_buff *ctx)
{
	struct ipv4_ct_tuple tuple = {};
	int ret, ifindex = ctx_load_meta(ctx, CB_IFINDEX);
	__u32 src_label = ctx_load_meta(ctx, CB_SRC_LABEL);
	bool from_host = ctx_load_meta(ctx, CB_FROM_HOST);
	bool proxy_redirect __maybe_unused = false;
	enum ct_status ct_status = 0;
	__u16 proxy_port = 0;

	/* Clear the consumed metadata slots so later programs don't
	 * misinterpret stale values.
	 */
	ctx_store_meta(ctx, CB_SRC_LABEL, 0);
	ctx_store_meta(ctx, CB_FROM_HOST, 0);

	ret = ipv4_policy(ctx, ifindex, src_label, &ct_status, &tuple, &proxy_port, from_host);

	/* Store meta: essential for proxy ingress, see bpf_host.c */
	ctx_store_meta(ctx, CB_PROXY_MAGIC, ctx->mark);

	return ret;
}

执行ingress policy,如果允许通过则将报文重定向到pod的lxc或者peer口

static __always_inline int
ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, enum ct_status *ct_status,
	    struct ipv4_ct_tuple *tuple_out, __u16 *proxy_port,
	    bool from_host __maybe_unused)
	//执行ingress policy
	verdict = policy_can_access_ingress(ctx, src_label, SECLABEL,
					    tuple->dport, tuple->nexthdr,
					    is_untracked_fragment,
					    &policy_match_type, &audited);
	//取出endpoint的ifindex
	ifindex = ctx_load_meta(ctx, CB_IFINDEX);
	if (ifindex)
		//重定向到网卡
		return redirect_ep(ctx, ifindex, from_host);

redirect_ep 将报文重定向到 lxc 接口或者 lxc 的 peer 接口,这取决于 ENABLE_HOST_ROUTING 是否被定义。
如果定义了 ENABLE_HOST_ROUTING,说明启用了 BPF host routing 模式,可通过 ctx_redirect_peer 将报文直接重定向到 lxc 的 peer 接口(即 pod 内部的 eth0),绕过 host 协议栈;否则通过 ctx_redirect 重定向到 host 侧的 lxc 接口,经 CPU backlog 队列处理。

/* Redirect the packet to the target endpoint's device.
 *
 * Without ENABLE_HOST_ROUTING (or when a backlog traversal is required)
 * ctx_redirect() sends the packet to the host-side lxc device; with BPF
 * host routing enabled, ctx_redirect_peer() delivers it directly to the
 * veth peer inside the pod's netns (the pod's eth0).
 */
static __always_inline int redirect_ep(struct __ctx_buff *ctx __maybe_unused,
				       int ifindex __maybe_unused,
				       bool needs_backlog __maybe_unused)
{
	/* Going via CPU backlog queue (aka needs_backlog) is required
	 * whenever we cannot do a fast ingress -> ingress switch but
	 * instead need an ingress -> egress netns traversal or vice
	 * versa.
	 */
	if (needs_backlog || !is_defined(ENABLE_HOST_ROUTING)) {
		return ctx_redirect(ctx, ifindex, 0);
	} else {
		...
		return ctx_redirect_peer(ctx, ifindex, 0);
	}
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

分享放大价值

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值