本文分析cilium native routing模式下的报文路径和涉及到的ebpf源码分析。实验环境在vxlan模式下修改参数后而来。native routing和vxlan模式的区别主要是跨节点通信时,vxlan模式需要封装,native routing模式需要根据目的ip查找路由表转发到其他节点。
参数修改
修改cilium configmap的如下几个参数,并删除旧的cilium pod使其重建后,即可切换为native routing模式
auto-direct-node-routes: "true"
ipv4-native-routing-cidr: 10.0.0.0/8
tunnel: disabled
拓扑图如下(原文此处为图片,文本版中未能保留)

流程分析
本文只分析跨host两个pod之间的通信过程。
从pod1发送的arp请求报文
报文路径
从pod1发送的arp请求报文,会被lxc1的ebpf程序处理,将lxc1的mac回复给pod1,报文路径如下
eth0(pod1) -> lxc1(tc ingress:from-container) -> eth0(pod1)
ebpf程序分析
//tc ingress: from-container — call chain for answering an ARP request from the pod
handle_xgress
tail_handle_arp
arp_respond(ctx, &mac, tip, &smac, sip, 0);
arp_prepare_response(ctx, smac, sip, dmac, tip);
//direction is 0, meaning ingress
//redirect the reply to ctx_get_ifindex(ctx), i.e. the interface the request arrived on
ctx_redirect(ctx, ctx_get_ifindex(ctx), direction);
从pod1发送的ip报文路径如下
报文路径
eth0(pod1) -> lxc1(tc ingress:from-container) -> route table -> enp0s8(master)
eth0(pod1):
//Packets sent from inside the pod are ultimately transmitted via veth_xmit.
//By the veth design, skb->dev is switched to the peer device (the lxc side)
//and netif_rx is called, so the packet enters the host network stack.
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct net_device *rcv;
rcv = rcu_dereference(priv->peer);
veth_forward_skb(rcv, skb, rq, rcv_xdp)
netif_rx(skb)
lxc1(tc ingress:from-container):
//At the stack's ingress hook, sch_handle_ingress runs the attached ebpf program,
//which enforces the egress policy; CTX_ACT_OK is returned when the packet is allowed.
//The ebpf processing flow is described in detail below.
__netif_receive_skb_core
...
if (static_branch_unlikely(&ingress_needed_key)) {
bool another = false;
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another);
//run the ebpf program
switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
//on TC_ACT_OK, continue with normal stack processing
case TC_ACT_OK:
...
}
}
route table:
查找路由表(10.0.1.0/24 via 192.168.56.3 dev enp0s8),获取出接口为enp0s8
ebpf程序分析
最重要的是lxc1(tc ingress:from-container)的ebpf程序
handle_xgress
send_trace_notify(ctx, TRACE_FROM_LXC, SECLABEL, 0, 0, 0,
TRACE_REASON_UNKNOWN, TRACE_PAYLOAD_LEN);
ep_tail_call(ctx, CILIUM_CALL_IPV4_FROM_LXC); //tail_handle_ipv4
__tail_handle_ipv4
tail_handle_ipv4_cont
handle_ipv4_from_lxc
//look up the destination ip in the cilium_ipcache map, which holds
//endpoint information for every pod in the whole cluster
info = lookup_ip4_remote_endpoint(ip4->daddr);
ipcache_lookup4(&IPCACHE_MAP, addr, V4_CACHE_KEY_LEN)
if (info && info->sec_label) {
//mainly used by the egress policy check below to decide whether
//access to this destination is allowed
*dst_id = info->sec_label;
tunnel_endpoint = info->tunnel_endpoint;
encrypt_key = get_min_encrypt_key(info->key);
} else {
*dst_id = WORLD_ID;
}
//enforce the egress policy
verdict = policy_can_egress4(ctx, tuple, SECLABEL, *dst_id, &policy_match_type, &audited);
...
//decrement the ttl
ipv4_l3(ctx, ETH_HLEN, NULL, (__u8 *)&router_mac.addr, ip4)
send_trace_notify(ctx, TRACE_TO_STACK, SECLABEL, *dst_id, 0, 0,
trace.reason, trace.monitor);
cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY, 0);
//return ok so processing continues in the host network stack
return CTX_ACT_OK;
报文到达node1后的路径如下
报文路径
enp0s8(node1) -> route table -> cilium_host(tc egress:from-host) -> lxc2 -> eth0(pod2)
enp0s8(node1):
网卡接收到报文后,走host协议栈处理流程
route table:
查找host路由表,获取出接口为cilium_host,调用__dev_queue_xmit从cilium_host接口发出
cilium_host(tc egress:from-host):
//When __dev_queue_xmit runs, the ebpf program attached to the cilium_host
//device's tc egress hook is executed.
//That ebpf program checks, based on the destination ip, whether the target
//is a local pod; its flow is described in detail below.
__dev_queue_xmit
sch_handle_egress(skb, &rc, dev);
switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
...
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
return NULL;
default:
break;
}
ebpf程序分析
//ebpf program attached to cilium_host (tc egress: from-host)
__section("from-host")
int from_host(struct __ctx_buff *ctx)
/* Traffic from the host ns going through cilium_host device must
 * not be subject to EDT rate-limiting.
 */
edt_set_aggregate(ctx, 0);
return handle_netdev(ctx, true);
return do_netdev(ctx, proto, from_host);
if (from_host) {
enum trace_point trace = TRACE_FROM_HOST;
//emit a TRACE_FROM_HOST trace event
send_trace_notify(ctx, trace, identity, 0, 0,
ctx->ingress_ifindex,
TRACE_REASON_UNKNOWN, TRACE_PAYLOAD_LEN);
} else {
...
}
if (from_host) {
//tail-call into tail_handle_ipv4_from_host
ep_tail_call(ctx, CILIUM_CALL_IPV4_FROM_HOST);
} else {
...
}
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_FROM_HOST)
int tail_handle_ipv4_from_host(struct __ctx_buff *ctx)
tail_handle_ipv4(ctx, ipcache_srcid, true);
handle_ipv4(ctx, proxy_identity, ipcache_srcid, from_host);
//check whether the destination ip belongs to a pod on this host
/* Lookup IPv4 address in list of local endpoints and host IPs */
ep = lookup_ip4_endpoint(ip4);
if (ep) {
return ipv4_local_delivery(ctx, ETH_HLEN, secctx, ip4, ep, METRIC_INGRESS, from_host);
}
static __always_inline int ipv4_local_delivery(struct __ctx_buff *ctx, int l3_off,
__u32 seclabel, struct iphdr *ip4,
const struct endpoint_info *ep,
__u8 direction __maybe_unused,
bool from_host __maybe_unused)
{
mac_t router_mac = ep->node_mac;
mac_t lxc_mac = ep->mac;
int ret;
//decrement the ttl, set the source mac to ep->node_mac and the
//destination mac to ep->mac
ret = ipv4_l3(ctx, l3_off, (__u8 *) &router_mac, (__u8 *) &lxc_mac, ip4);
/* Jumps to destination pod's BPF program to enforce ingress policies. */
ctx_store_meta(ctx, CB_SRC_LABEL, seclabel);
ctx_store_meta(ctx, CB_IFINDEX, ep->ifindex);
ctx_store_meta(ctx, CB_FROM_HOST, from_host ? 1 : 0);
//tail-call into the ebpf program that enforces the endpoint's policy
tail_call_dynamic(ctx, &POLICY_CALL_MAP, ep->lxc_id);
return DROP_MISSED_TAIL_CALL;
}
__section_tail(CILIUM_MAP_POLICY, TEMPLATE_LXC_ID)
int handle_policy(struct __ctx_buff *ctx)
handle_policy 最终调用到 tail_ipv4_policy
int tail_ipv4_policy(struct __ctx_buff *ctx)
{
struct ipv4_ct_tuple tuple = {};
int ret, ifindex = ctx_load_meta(ctx, CB_IFINDEX);
__u32 src_label = ctx_load_meta(ctx, CB_SRC_LABEL);
bool from_host = ctx_load_meta(ctx, CB_FROM_HOST);
bool proxy_redirect __maybe_unused = false;
enum ct_status ct_status = 0;
__u16 proxy_port = 0;
//clear the metadata slots consumed above
ctx_store_meta(ctx, CB_SRC_LABEL, 0);
ctx_store_meta(ctx, CB_FROM_HOST, 0);
//enforce the ingress policy; redirects to the endpoint when allowed
ret = ipv4_policy(ctx, ifindex, src_label, &ct_status, &tuple, &proxy_port, from_host);
/* Store meta: essential for proxy ingress, see bpf_host.c */
ctx_store_meta(ctx, CB_PROXY_MAGIC, ctx->mark);
return ret;
}
执行ingress policy,如果允许通过则将报文重定向到pod的lxc或者peer口
static __always_inline int
ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, enum ct_status *ct_status,
struct ipv4_ct_tuple *tuple_out, __u16 *proxy_port,
bool from_host __maybe_unused)
//enforce the ingress policy
verdict = policy_can_access_ingress(ctx, src_label, SECLABEL,
tuple->dport, tuple->nexthdr,
is_untracked_fragment,
&policy_match_type, &audited);
//fetch the endpoint's ifindex (stored earlier in CB_IFINDEX)
ifindex = ctx_load_meta(ctx, CB_IFINDEX);
if (ifindex)
//redirect the packet to that interface
return redirect_ep(ctx, ifindex, from_host);
redirect_ep将报文重定向到lxc接口或者lxc的peer接口,这主要取决于ENABLE_HOST_ROUTING是否被定义。
如果ENABLE_HOST_ROUTING被定义了说明是host routing模式,可将报文直接重定向到lxc的peer接口,即pod内部的eth0
static __always_inline int redirect_ep(struct __ctx_buff *ctx __maybe_unused,
int ifindex __maybe_unused,
bool needs_backlog __maybe_unused)
{
/* Going via CPU backlog queue (aka needs_backlog) is required
 * whenever we cannot do a fast ingress -> ingress switch but
 * instead need an ingress -> egress netns traversal or vice
 * versa.
 */
if (needs_backlog || !is_defined(ENABLE_HOST_ROUTING)) {
//slow path: redirect to the lxc device and go through the CPU backlog queue
return ctx_redirect(ctx, ifindex, 0);
} else {
...
//fast path (host routing): redirect straight to the lxc peer,
//i.e. eth0 inside the pod
return ctx_redirect_peer(ctx, ifindex, 0);
}
}
(完)