linux 路由表之ifconfig

1. 前言

     本博客主要分析在应用层使用ifconfig命令时所引发的系统调用、及内核路由表中的地址添加流程。

     FIB: forward information base, fib_info结构体

2. ifconfig命令

在应用层使用命令:ifconfig eth0 192.168.1.100 up 配置本地ip地址时将调用busybox源码中的ifconfig.c,具体流程如下

/*
 * Entry point of the busybox ifconfig applet.
 *
 * With zero or one argument (after an optional "-a") it only displays
 * interface status. Otherwise argv[0] (after the applet name) is taken as
 * the interface name, and every following argument is looked up in OptArray:
 *   - a matching option that takes a value is applied with the ioctl
 *     selected by the corresponding Arg1Opt entry;
 *   - a matching flag option is applied through SIOCGIFFLAGS/SIOCSIFFLAGS;
 *   - anything else is treated as the interface address (HOSTNAME path).
 *
 * Returns 0 after all arguments are processed, or the result of
 * display_interfaces() in display mode. Failing ioctls terminate the
 * process inside the x*()/..._and_die() helpers.
 */
int ifconfig_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
int ifconfig_main(int argc UNUSED_PARAM, char **argv)
{
	struct ifreq ifr;
	struct sockaddr_in sai;
#if ENABLE_FEATURE_IFCONFIG_HW
	struct sockaddr sa;
#endif
	const struct arg1opt *a1op;
	const struct options *op;
	int sockfd;			/* socket fd we use to manipulate stuff with */
	int selector;
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
	unsigned int mask;
	unsigned int did_flags;
	unsigned int sai_hostname, sai_netmask;
#else
	unsigned char mask;
	unsigned char did_flags;
#endif
	char *p;
	/*char host[128];*/
	const char *host = NULL; /* make gcc happy */

	did_flags = 0;
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
	sai_hostname = 0;
	sai_netmask = 0;
#endif

	// Running example used by the comments below: ifconfig eth0 192.168.1.100 up

	/* skip argv[0] */
	++argv; // step over the applet name "ifconfig"

#if ENABLE_FEATURE_IFCONFIG_STATUS
	// "ifconfig -a": argv[0] is exactly "-a", i.e. list all interfaces (e.g. eth0, usb0)
	if (argv[0] && (argv[0][0] == '-' && argv[0][1] == 'a' && !argv[0][2])) { 
		interface_opt_a = 1;
		++argv;
	}
#endif

	if (!argv[0] || !argv[1]) { /* one or no args */
#if ENABLE_FEATURE_IFCONFIG_STATUS
		return display_interfaces(argv[0] /* can be NULL */); // plain "ifconfig [iface]": display only
#else
		bb_error_msg_and_die("no support for status display");
#endif
	}

	/* Create a channel to the NET kernel. */
	sockfd = xsocket(AF_INET, SOCK_DGRAM, 0);

	// When actually configuring an address (ifconfig eth0 192.168.1.100 up),
	// *argv is the interface name here, so:
	// ifr.ifr_name = "eth0"
	/* get interface name */
	strncpy_IFNAMSIZ(ifr.ifr_name, *argv); 

	/* Process the remaining arguments. */
	while (*++argv != (char *) NULL) { // in the example the first iteration sees "192.168.1.100"
		p = *argv; // current argument, e.g. "192.168.1.100"
		mask = N_MASK;
		if (*p == '-') {	/* If the arg starts with '-'... */
			++p;		/*    advance past it and */
			mask = M_MASK;	/*    set the appropriate mask. */
		}

		// The entries of OptArray are easiest to understand next to "ifconfig --help"
		
		for (op = OptArray; op->name; op++) {	/* Find table entry. */
			if (strcmp(p, op->name) == 0) {	/* If name matches... */
				mask &= op->flags;
				if (mask)	/* set the mask and go. */
					goto FOUND_ARG;
				/* If we get here, there was a valid arg with an */
				/* invalid '-' prefix. */
				bb_error_msg_and_die("bad: '%s'", p-1);
			}
		}

		/* We fell through, so treat as possible hostname. */
		/* op now points at the terminating OptArray entry (name == NULL); */
		/* its arg_flags and the last Arg1Opt entry describe the address case. */
		a1op = Arg1Opt + ARRAY_SIZE(Arg1Opt) - 1;
		mask = op->arg_flags;
		goto HOSTNAME; // no option name matched, so the argument is an address: jump to HOSTNAME

 FOUND_ARG:
		if (mask & ARG_MASK) {
			mask = op->arg_flags;
			a1op = Arg1Opt + (op - OptArray);
			if (mask & A_NETMASK & did_flags)
				bb_show_usage();
			if (*++argv == NULL) {
				if (mask & A_ARG_REQ)
					bb_show_usage();
				--argv;
				mask &= A_SET_AFTER;	/* just for broadcast */
			} else {	/* got an arg so process it */
 HOSTNAME:
				did_flags |= (mask & (A_NETMASK|A_HOSTNAME));
				if (mask & A_CAST_HOST_COPY) {
#if ENABLE_FEATURE_IFCONFIG_HW // hardware (MAC) address support
					if (mask & A_CAST_RESOLVE) {
#endif
#if ENABLE_FEATURE_IPV6 // IPv6
						char *prefix;
						int prefix_len = 0;
#endif
						/*safe_strncpy(host, *argv, (sizeof host));*/
						host = *argv; // address string, e.g. host = "192.168.1.100"
#if ENABLE_FEATURE_IPV6
						prefix = strchr(host, '/');
						if (prefix) {
							prefix_len = xatou_range(prefix + 1, 0, 128);
							*prefix = '\0';
						}
#endif
						sai.sin_family = AF_INET;
						sai.sin_port = 0;
						if (strcmp(host, "default") == 0) { // "default" stands for 0.0.0.0
							/* Default is special, meaning 0.0.0.0. */
							sai.sin_addr.s_addr = INADDR_ANY;
						}
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
						else if ((host[0] == '+' && !host[1]) && (mask & A_BROADCAST)
						 && (did_flags & (A_NETMASK|A_HOSTNAME)) == (A_NETMASK|A_HOSTNAME)
						) {
							/* + is special, meaning broadcast is derived. */
							sai.sin_addr.s_addr = (~sai_netmask) | (sai_hostname & sai_netmask);
						}
#endif
						else {
							len_and_sockaddr *lsa;
							if (strcmp(host, "inet") == 0)
								continue; /* compat stuff */
							lsa = xhost2sockaddr(host, 0); // convert the host string into a socket address
#if ENABLE_FEATURE_IPV6
							if (lsa->u.sa.sa_family == AF_INET6) {
								int sockfd6;
								struct in6_ifreq ifr6;

								memcpy((char *) &ifr6.ifr6_addr,
										(char *) &(lsa->u.sin6.sin6_addr),
										sizeof(struct in6_addr));

								/* Create a channel to the NET kernel. */
								sockfd6 = xsocket(AF_INET6, SOCK_DGRAM, 0);
								xioctl(sockfd6, SIOGIFINDEX, &ifr);
								ifr6.ifr6_ifindex = ifr.ifr_ifindex;
								ifr6.ifr6_prefixlen = prefix_len;
								ioctl_or_perror_and_die(sockfd6, a1op->selector, &ifr6, "SIOC%s", a1op->name);
								if (ENABLE_FEATURE_CLEAN_UP)
									free(lsa);
								continue;
							}
#endif
							sai.sin_addr = lsa->u.sin.sin_addr;
							if (ENABLE_FEATURE_CLEAN_UP)
								free(lsa);
						}
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
						/* Remember address and netmask so a later "+" broadcast can be derived. */
						if (mask & A_HOSTNAME)
							sai_hostname = sai.sin_addr.s_addr;
						if (mask & A_NETMASK)
							sai_netmask = sai.sin_addr.s_addr;
#endif
						p = (char *) &sai;
#if ENABLE_FEATURE_IFCONFIG_HW
					} else {	/* A_CAST_HOST_COPY_IN_ETHER */
						/* This is the "hw" arg case. */
						smalluint hw_class= index_in_substrings("ether\0"
								IF_FEATURE_HWIB("infiniband\0"), *argv) + 1;
						if (!hw_class || !*++argv)
							bb_show_usage();
						/*safe_strncpy(host, *argv, sizeof(host));*/
						host = *argv;
						if (hw_class == 1 ? in_ether(host, &sa) : in_ib(host, &sa))
							bb_error_msg_and_die("invalid hw-addr %s", host);
						p = (char *) &sa;
					}
#endif
					/* Copy the prepared sockaddr into the ifreq field selected by a1op. */
					memcpy( (((char *)&ifr) + a1op->ifr_offset),
						   p, sizeof(struct sockaddr));
				} else {
					/* FIXME: error check?? */
					unsigned long i = strtoul(*argv, NULL, 0);
					p = ((char *)&ifr) + a1op->ifr_offset;
#if ENABLE_FEATURE_IFCONFIG_MEMSTART_IOADDR_IRQ
					if (mask & A_MAP_TYPE) {
						xioctl(sockfd, SIOCGIFMAP, &ifr);
						if ((mask & A_MAP_UCHAR) == A_MAP_UCHAR)
							*((unsigned char *) p) = i;
						else if (mask & A_MAP_USHORT)
							*((unsigned short *) p) = i;
						else
							*((unsigned long *) p) = i;
					} else
#endif
					if (mask & A_CAST_CHAR_PTR)
						*((caddr_t *) p) = (caddr_t) i;
					else	/* A_CAST_INT */
						*((int *) p) = i;
				}

				ioctl_or_perror_and_die(sockfd, a1op->selector, &ifr, "SIOC%s", a1op->name);  // this ioctl is what finally applies the value (the IP address in the example)
#ifdef QUESTIONABLE_ALIAS_CASE
				if (mask & A_COLON_CHK) {
					/*
					 * Don't do the set_flag() if the address is an alias with
					 * a '-' at the end, since it's deleted already! - Roman
					 *
					 * Should really use regex.h here, not sure though how well
					 * it'll go with the cross-platform support etc.
					 */
					char *ptr;
					short int found_colon = 0;
					for (ptr = ifr.ifr_name; *ptr; ptr++)
						if (*ptr == ':')
							found_colon++;
					if (found_colon && ptr[-1] == '-')
						continue;
				}
#endif
			}
			if (!(mask & A_SET_AFTER))
				continue;
			mask = N_SET;
		}

		/* Flag options: read the current flags, set or clear the selector bits, write back. */
		xioctl(sockfd, SIOCGIFFLAGS, &ifr);
		selector = op->selector;
		if (mask & SET_MASK)
			ifr.ifr_flags |= selector;
		else
			ifr.ifr_flags &= ~selector;
		xioctl(sockfd, SIOCSIFFLAGS, &ifr);
	} /* while () */

	if (ENABLE_FEATURE_CLEAN_UP)
		close(sockfd);
	return 0;
}

假设在应用层通过ifconfig配置IP地址,命令为:ifconfig eth0 192.168.1.100 up,那么在ifconfig_main()函数内部将通过形参将argv={"ifconfig", "eth0", "192.168.1.100", "up"}传入,然后ifconfig_main()完成对该二维数组的解析,最后调用如下函数

ioctl_or_perror_and_die(sockfd, a1op->selector, &ifr, "SIOC%s", a1op->name);  //最后调用这里设置IP地址

其中a1op->selector=SIOCSIFADDR

/*
 * Abridged view of the per-argument ioctl table: each entry holds a name
 * (printed after "SIOC" in error messages), the ioctl request number, and
 * the offset of the struct ifreq field that receives the value.
 */
static const struct arg1opt Arg1Opt[] = {
    // ... (other entries omitted in this excerpt)
	{ "SIFADDR",    SIOCSIFADDR,    ifreq_offsetof(ifr_addr) },
};

其中ioctl_or_perror_and_die()函数如下

ioctl_or_perror_and_die(sockfd, a1op->selector, &ifr, "SIOC%s", a1op->name);  //最后调用这里设置IP地址

/*
 * Issue ioctl(fd, request, argp). On success the (non-negative) ioctl result
 * is returned. On failure the printf-style message described by fmt and the
 * variadic arguments is reported together with strerror(errno), and
 * xfunc_die() is called.
 */
int FAST_FUNC ioctl_or_perror_and_die(int fd, unsigned request, void *argp, const char *fmt,...)
{
	va_list args;
	int rc = ioctl(fd, request, argp); /* the actual system call */

	if (rc >= 0)
		return rc;

	va_start(args, fmt);
	bb_verror_msg(fmt, args, strerror(errno));
	/* xfunc_die can actually longjmp, so release the va_list first */
	va_end(args);
	xfunc_die();
	return rc;
}

所以最终以request=SIOCSIFADDR调用ioctl()系统调用,进入内核后由inet_ioctl()函数处理

3. ifconfig在内核部分的调用流程分析

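在上面的xsocket中打开的套接字类型为SOCK_DGRAM(即数据报套接字,而不是流套接字),严格来说它对应的操作集是inet_dgram_ops;不过inet_dgram_ops与下面列出的inet_stream_ops(流套接字SOCK_STREAM使用)的.ioctl成员指向的都是同一个inet_ioctl()函数,所以最终都会调用inet_ioctl()。下面以inet_stream_ops结构体为例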

/*
 * Socket-layer operation table for PF_INET; the member of interest for this
 * walkthrough is .ioctl, which points at inet_ioctl().
 * NOTE(review): judging by its name this table serves stream sockets; the
 * SOCK_DGRAM socket opened by busybox presumably uses a sibling table whose
 * .ioctl is the same function - confirm against af_inet.c.
 */
const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl, // kernel-side handler reached by the ioctl calls made inside busybox
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
	.splice_read	   = tcp_splice_read,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);

inet_ioctl源码

/*
 * Dispatch an ioctl issued on a PF_INET socket.
 *
 * Timestamp requests are answered from the sock, routing requests go to
 * ip_rt_ioctl(), ARP requests to arp_ioctl(), and interface address/flag
 * requests to devinet_ioctl(). Any other command is forwarded to the
 * protocol's own ioctl handler when it has one; otherwise -ENOIOCTLCMD
 * is returned.
 */
int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);

	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);

	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCRTMSG:
		return ip_rt_ioctl(net, cmd, (void __user *)arg);

	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
		return arp_ioctl(net, cmd, (void __user *)arg);

	case SIOCGIFADDR:
	case SIOCSIFADDR: /* the request busybox ifconfig sends to set an address */
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFPFLAGS:
	case SIOCGIFPFLAGS:
	case SIOCSIFFLAGS:
		return devinet_ioctl(net, cmd, (void __user *)arg);

	default:
		break;
	}

	/* Not a generic inet command: let the protocol handle it if it can. */
	if (!sk->sk_prot->ioctl)
		return -ENOIOCTLCMD;

	return sk->sk_prot->ioctl(sk, cmd, arg);
}
EXPORT_SYMBOL(inet_ioctl);

通过在busybox中调用的命令 SIOCSIFADDR,知调用接口函数devinet_ioctl()

/*
 * Handle the interface address/flag ioctls forwarded by inet_ioctl().
 *
 * The caller's struct ifreq is copied in from user space, the request is
 * validated (set-type requests need CAP_NET_ADMIN in the namespace and,
 * for address requests, an AF_INET address), the device is looked up by
 * name under rtnl_lock(), and a matching in_ifaddr is searched on it.
 * Get-type requests then fill the sockaddr in ifr and copy it back to
 * user space; set-type requests modify, re-insert or create the in_ifaddr.
 *
 * Returns 0 on success or a negative errno (-EFAULT, -EPERM, -EINVAL,
 * -ENODEV, -EADDRNOTAVAIL, -ENOBUFS, or whatever the called helper returns).
 */
int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	struct sockaddr_in sin_orig;
	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
	struct in_device *in_dev;
	struct in_ifaddr **ifap = NULL;
	struct in_ifaddr *ifa = NULL;
	struct net_device *dev;
	char *colon;
	int ret = -EFAULT;
	int tryaddrmatch = 0;

	/*
	 *	Fetch the caller's info block into kernel space
	 */

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		goto out;
	ifr.ifr_name[IFNAMSIZ - 1] = 0;

	/* save original address for comparison */
	memcpy(&sin_orig, sin, sizeof(*sin));

	/* Temporarily cut an alias label ("eth0:1") down to the device name. */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	dev_load(net, ifr.ifr_name);

	switch (cmd) {
	case SIOCGIFADDR:	/* Get interface address */
	case SIOCGIFBRDADDR:	/* Get the broadcast address */
	case SIOCGIFDSTADDR:	/* Get the destination address */
	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
		/* Note that these ioctls will not sleep,
		   so that we do not impose a lock.
		   One day we will be forced to put shlock here (I mean SMP)
		 */
		tryaddrmatch = (sin_orig.sin_family == AF_INET);
		memset(sin, 0, sizeof(*sin));
		sin->sin_family = AF_INET;
		break;

	case SIOCSIFFLAGS:
		ret = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto out;
		break;
	case SIOCSIFADDR:	/* Set interface address (and family) */
	case SIOCSIFBRDADDR:	/* Set the broadcast address */
	case SIOCSIFDSTADDR:	/* Set the destination address */
	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
		ret = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto out;
		ret = -EINVAL;
		if (sin->sin_family != AF_INET)
			goto out;
		break;
	default:
		ret = -EINVAL;
		goto out;
	}

	rtnl_lock();

	ret = -ENODEV;
	dev = __dev_get_by_name(net, ifr.ifr_name); // look the device up by its name (e.g. eth0)
	if (!dev)
		goto done;

	/* Restore the full label so it can be compared with ifa_label below. */
	if (colon)
		*colon = ':';

	in_dev = __in_dev_get_rtnl(dev); // fetch the struct in_device attached to this net_device
	if (in_dev) {
		if (tryaddrmatch) {
			/* Matthias Andree */
			/* compare label and address (4.4BSD style) */
			/* note: we only do this for a limited set of ioctls
			   and only if the original address family was AF_INET.
			   This is checked above. */
			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; // walk the addresses on this device
			     ifap = &ifa->ifa_next) {
				if (!strcmp(ifr.ifr_name, ifa->ifa_label) && // same label (e.g. eth0)
				    sin_orig.sin_addr.s_addr ==
							ifa->ifa_local) { // and same local address
					break; /* found */
				}
			}
		}
		/* we didn't get a match, maybe the application is
		   4.3BSD-style and passed in junk so we fall back to
		   comparing just the label */
		if (!ifa) { // nothing matched so far
			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
			     ifap = &ifa->ifa_next)
				if (!strcmp(ifr.ifr_name, ifa->ifa_label)) // compare the label only
					break;
		}
	}

	/* Only SIOCSIFADDR and SIOCSIFFLAGS can proceed without an existing address. */
	ret = -EADDRNOTAVAIL;
	if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
		goto done;

	switch (cmd) {
	case SIOCGIFADDR:	/* Get interface address */
		sin->sin_addr.s_addr = ifa->ifa_local;
		goto rarok;

	case SIOCGIFBRDADDR:	/* Get the broadcast address */
		sin->sin_addr.s_addr = ifa->ifa_broadcast;
		goto rarok;

	case SIOCGIFDSTADDR:	/* Get the destination address */
		sin->sin_addr.s_addr = ifa->ifa_address;
		goto rarok;

	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
		sin->sin_addr.s_addr = ifa->ifa_mask;
		goto rarok;

	case SIOCSIFFLAGS: // reached by the SIOCSIFFLAGS request (e.g. "ifconfig eth0 ... up")
		if (colon) {
			/* Alias label: clearing IFF_UP deletes just that address. */
			ret = -EADDRNOTAVAIL;
			if (!ifa)
				break;
			ret = 0;
			if (!(ifr.ifr_flags & IFF_UP))
				inet_del_ifa(in_dev, ifap, 1);
			break;
		}
		ret = dev_change_flags(dev, ifr.ifr_flags); // change the flags of the device itself
		break;

	case SIOCSIFADDR:	/* Set interface address (and family) */   // the address-setting path of the walkthrough
		ret = -EINVAL;
		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
			break;

		if (!ifa) {
			/* No address under this label yet: allocate a fresh one. */
			ret = -ENOBUFS;
			ifa = inet_alloc_ifa();
			if (!ifa)
				break;
			INIT_HLIST_NODE(&ifa->hash);
			if (colon)
				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
			else
				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
		} else {
			ret = 0;
			if (ifa->ifa_local == sin->sin_addr.s_addr) // same local address already set: nothing to do
				break;
			inet_del_ifa(in_dev, ifap, 0);
			ifa->ifa_broadcast = 0;
			ifa->ifa_scope = 0;
		}

		ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; // store the new local address

		if (!(dev->flags & IFF_POINTOPOINT)) { // device is not point-to-point
			ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
			ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
			if ((dev->flags & IFF_BROADCAST) &&
			    ifa->ifa_prefixlen < 31)
				ifa->ifa_broadcast = ifa->ifa_address |
						     ~ifa->ifa_mask;
		} else {
			ifa->ifa_prefixlen = 32;
			ifa->ifa_mask = inet_make_mask(32);
		}
		set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); // both lifetimes set to INFINITY_LIFE_TIME
		ret = inet_set_ifa(dev, ifa); // key step: hand the address over to inet_set_ifa()
		break;

	case SIOCSIFBRDADDR:	/* Set the broadcast address */
		ret = 0;
		if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
			inet_del_ifa(in_dev, ifap, 0);
			ifa->ifa_broadcast = sin->sin_addr.s_addr;
			inet_insert_ifa(ifa);
		}
		break;

	case SIOCSIFDSTADDR:	/* Set the destination address */
		ret = 0;
		if (ifa->ifa_address == sin->sin_addr.s_addr)
			break;
		ret = -EINVAL;
		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
			break;
		ret = 0;
		inet_del_ifa(in_dev, ifap, 0);
		ifa->ifa_address = sin->sin_addr.s_addr;
		inet_insert_ifa(ifa); // re-insert the address with the new destination
		break;

	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */

		/*
		 *	The mask we set must be legal.
		 */
		ret = -EINVAL;
		if (bad_mask(sin->sin_addr.s_addr, 0))
			break;
		ret = 0;
		if (ifa->ifa_mask != sin->sin_addr.s_addr) {
			__be32 old_mask = ifa->ifa_mask;
			inet_del_ifa(in_dev, ifap, 0);
			ifa->ifa_mask = sin->sin_addr.s_addr;
			ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);

			/* See if current broadcast address matches
			 * with current netmask, then recalculate
			 * the broadcast address. Otherwise it's a
			 * funny address, so don't touch it since
			 * the user seems to know what (s)he's doing...
			 */
			if ((dev->flags & IFF_BROADCAST) &&
			    (ifa->ifa_prefixlen < 31) &&
			    (ifa->ifa_broadcast ==
			     (ifa->ifa_local|~old_mask))) {
				ifa->ifa_broadcast = (ifa->ifa_local |
						      ~sin->sin_addr.s_addr);
			}
			inet_insert_ifa(ifa);
		}
		break;
	}
done:
	rtnl_unlock();
out:
	return ret;
rarok:
	/* Get-type requests: drop the lock, then copy the filled ifreq back to the caller. */
	rtnl_unlock();
	ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
	goto out;
}

在 devinet_ioctl涉及到三个重要的接口函数,分别如下

a. __dev_get_by_name() 通过接口名称(如eth0)获取其设备

dev = __dev_get_by_name(net, ifr.ifr_name); //通过名称name(如eth0)获取设备dev

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name); //通过设备名称计算hash值,从而获取链表头指针

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ)) //名称是否相同,如eth0
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/*
 * Map an interface name to its bucket in net->dev_name_head: hash at most
 * IFNAMSIZ bytes of the name, then reduce the hash to NETDEV_HASHBITS bits.
 */
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int name_hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	unsigned int bucket = hash_32(name_hash, NETDEV_HASHBITS);

	return &net->dev_name_head[bucket];
}

b. __in_dev_get_rtnl() 通过设备获取struct in_device结构体

in_dev = __in_dev_get_rtnl(dev); //通过struct net_device获取其成员struct in_device结构体

/*
 * Return the in_device hanging off dev->ip_ptr, read through
 * rtnl_dereference().
 */
static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
{
	struct in_device *in_dev = rtnl_dereference(dev->ip_ptr);

	return in_dev;
}

c. 通过命令SIOCSIFADDR,知将调用inet_set_ifa()接口函数

static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
	struct in_device *in_dev = __in_dev_get_rtnl(dev); 

	ASSERT_RTNL();

	if (!in_dev) {
		inet_free_ifa(ifa); //释放其接口
		return -ENOBUFS;
	}
	ipv4_devconf_setall(in_dev);
	if (ifa->ifa_dev != in_dev) {  //接口设备不一致就强制转换
		WARN_ON(ifa->ifa_dev);
		in_dev_hold(in_dev);
		ifa->ifa_dev = in_dev;
	}
	if (ipv4_is_loopback(ifa->ifa_local))
		ifa->ifa_scope = RT_SCOPE_HOST;
	return inet_insert_ifa(ifa); //插入接口
}
/*
 * Insert @ifa without an originating netlink request: forwards to
 * __inet_insert_ifa() with a NULL header and port id 0.
 */
static int inet_insert_ifa(struct in_ifaddr *ifa)
{
	int rc = __inet_insert_ifa(ifa, NULL, 0);

	return rc;
}
/*
 * Link @ifa into the address list of its in_device and announce it.
 *
 * - An address with ifa_local == 0 is freed and 0 is returned.
 * - If an existing entry has the same mask and matches per inet_ifa_match():
 *   the same local address yields -EEXIST, a different scope yields -EINVAL
 *   (the new address is freed in both cases); otherwise the new address is
 *   flagged IFA_F_SECONDARY.
 * - A primary address is inserted at the position tracked in last_primary;
 *   a secondary one at the end of the list.
 * Afterwards the address is hashed, the lifetime worker is rescheduled, an
 * RTM_NEWADDR message is sent and the inetaddr_chain notifiers are called
 * with NETDEV_UP.
 *
 * @nlh and @portid are passed through to rtmsg_ifa().
 */
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
			     u32 portid)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct in_ifaddr *ifa1, **ifap, **last_primary;

	ASSERT_RTNL();

	if (!ifa->ifa_local) {
		inet_free_ifa(ifa);
		return 0;
	}

	/* Start out as primary; demoted below if a matching entry exists. */
	ifa->ifa_flags &= ~IFA_F_SECONDARY;
	last_primary = &in_dev->ifa_list;

	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
	     ifap = &ifa1->ifa_next) {
		if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
		    ifa->ifa_scope <= ifa1->ifa_scope)
			last_primary = &ifa1->ifa_next;
		if (ifa1->ifa_mask == ifa->ifa_mask &&
		    inet_ifa_match(ifa1->ifa_address, ifa)) {
			if (ifa1->ifa_local == ifa->ifa_local) {
				inet_free_ifa(ifa);
				return -EEXIST;
			}
			if (ifa1->ifa_scope != ifa->ifa_scope) {
				inet_free_ifa(ifa);
				return -EINVAL;
			}
			ifa->ifa_flags |= IFA_F_SECONDARY;
		}
	}

	/* After the loop ifap points at the list tail; primaries go to last_primary instead. */
	if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
		net_srandom(ifa->ifa_local);
		ifap = last_primary;
	}

	ifa->ifa_next = *ifap;
	*ifap = ifa;

	inet_hash_insert(dev_net(in_dev->dev), ifa); // add ifa to the inet_addr_lst[] hash lists

	cancel_delayed_work(&check_lifetime_work);
	schedule_delayed_work(&check_lifetime_work, 0);

	/* Send message first, then call notifier.
	   Notifier will trigger FIB update, so that
	   listeners of netlink will know about new ifaddr */
	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); // note: RTM_NEWADDR announces an address, whereas RTM_NEWROUTE is about routes
	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); // notify the inetaddr_chain listeners

	return 0;
}

在该函数内部主要完成几个重要的功能:

a. inet_hash_insert(dev_net(in_dev->dev), ifa); //将接口ifa加入到链表inet_addr_lst[]中;

b. rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); //注意是RTM_NEWADDR,表示的是地址,而RTM_NEWROUTE表示的是路由;

关于RTM_NEWADDR命令的注册,详见第4点

c. blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); //唤醒通知链inetaddr_chain,命令为NETDEV_UP

关于该通知链的注册部分详见第5点

4. RTM_NEWADDR 消息注册

路径:ip_rt_init()->devinet_init()

/*
 * Boot-time initialisation of the IPv4 device/address layer: sets up the
 * address hash lists and registers the per-namespace operations, the
 * gifconf handler, the netdevice notifier, the address-family ops and the
 * rtnetlink handlers for address messages.
 */
void __init devinet_init(void)
{
	int i;

	for (i = 0; i < IN4_ADDR_HSIZE; i++)
		INIT_HLIST_HEAD(&inet_addr_lst[i]); // initialise every bucket of the inet_addr_lst hash array
	
	// register devinet_ops as a per-network-namespace subsystem (see devinet_ops for details)
	register_pernet_subsys(&devinet_ops);

	// register inet_gifconf as the PF_INET handler for interface-configuration queries
	// (presumably stored in a gifconf_list table - not visible in this excerpt)
	register_gifconf(PF_INET, inet_gifconf);
	
	// register ip_netdev_notifier for network-device events (the netdev_chain notifier chain)
	register_netdevice_notifier(&ip_netdev_notifier);
	
	// kick off the delayed work check_lifetime_work with no delay
	schedule_delayed_work(&check_lifetime_work, 0);

	// register inet_af_ops with rtnetlink as address-family operations
	rtnl_af_register(&inet_af_ops);
	
	// Register the rtnetlink handlers through which user space manages IPv4
	// addresses: a message type is paired with a "do" handler and/or a "dump"
	// handler. User-space tools send these messages to the kernel to add,
	// delete or list addresses.
	// Background on rtnetlink: https://www.cnblogs.com/wenqiang/p/6634447.html
	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); 
	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
	rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
		      inet_netconf_dump_devconf, NULL);
}

在该函数的最后部分分别注册了RTM_NEWADDR、RTM_DELADDR、RTM_GETADDR、RTM_GETNETCONF的消息处理回调函数,这里只分析RTM_NEWADDR对应的回调函数inet_rtm_newaddr():

rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); 

/*
 * rtnetlink handler for RTM_NEWADDR.
 *
 * Builds an in_ifaddr from the netlink message. If no matching address
 * exists yet, the new one gets its lifetimes set and is inserted. If one
 * exists, the freshly built address is freed; the request fails with
 * -EEXIST unless NLM_F_REPLACE is set and NLM_F_EXCL is not, in which case
 * the existing address gets the new lifetimes and is re-announced.
 */
static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(skb->sk);
	__u32 valid_lft = INFINITY_LIFE_TIME;
	__u32 prefered_lft = INFINITY_LIFE_TIME;
	struct in_ifaddr *new_ifa;
	struct in_ifaddr *old_ifa;

	ASSERT_RTNL();

	/* build the address object described by the netlink message */
	new_ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
	if (IS_ERR(new_ifa))
		return PTR_ERR(new_ifa);

	old_ifa = find_matching_ifa(new_ifa);
	if (!old_ifa) {
		/* It would be best to check for !NLM_F_CREATE here but
		 * userspace alreay relies on not having to provide this.
		 */
		set_ifa_lifetime(new_ifa, valid_lft, prefered_lft);
		return __inet_insert_ifa(new_ifa, nlh, NETLINK_CB(skb).portid);
	}

	/* The address already exists: the new object is not needed. */
	inet_free_ifa(new_ifa);

	if ((nlh->nlmsg_flags & NLM_F_EXCL) ||
	    !(nlh->nlmsg_flags & NLM_F_REPLACE))
		return -EEXIST;

	/* Replace: refresh lifetimes on the existing entry and re-announce it. */
	set_ifa_lifetime(old_ifa, valid_lft, prefered_lft);
	cancel_delayed_work(&check_lifetime_work);
	schedule_delayed_work(&check_lifetime_work, 0);
	rtmsg_ifa(RTM_NEWADDR, old_ifa, nlh, NETLINK_CB(skb).portid);
	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, old_ifa);

	return 0;
}

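在该函数内部最后的执行存在两种情况:第一种(地址不存在)是调用__inet_insert_ifa()插入该地址;第二种(地址已存在且请求带有NLM_F_REPLACE标志)是更新地址的生存期,然后通过rtmsg_ifa()发送RTM_NEWADDR消息并触发通知链inetaddr_chain(关于该通知链详见第5点)。需要注意的是,rtmsg_ifa()只是向用户空间的netlink监听者发送通知,并不会再次进入inet_rtm_newaddr(),所以这里不存在回调函数不断嵌套的问题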

 

5. inetaddr_chain通知链接收消息处理

在第3.c中最后将通过 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa)发送消息,而该消息接收部分的处理如下

/*
 * Boot-time initialisation of the IPv4 FIB: registers the rtnetlink route
 * handlers, the per-namespace FIB operations, the two notifier blocks
 * (device events and address events) and the fib_trie caches.
 */
void __init ip_fib_init(void)
{
	// register the rtnetlink handlers for routes (new, delete, dump)
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);

	register_pernet_subsys(&fib_net_ops); // per-namespace setup via fib_net_ops (fib_net_init/fib_net_exit)
	register_netdevice_notifier(&fib_netdev_notifier); // FIB listener for network-device events (netdev_chain)
	register_inetaddr_notifier(&fib_inetaddr_notifier); // FIB listener for IPv4 address events (inetaddr_chain)

	fib_trie_init(); // create the fn_alias_kmem and trie_leaf_kmem caches used by fib_trie
}

在ip_fib_init()函数内部主要完成的功能包括:

a. RTM_NEWROUTE、RTM_DELROUTE、RTM_GETROUTE路由消息的注册

	//注册路由接口处理(新建、删除、获取路由)
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);

在该内部会用到inet_rtm_newroute接口函数,具体下面会分析到

b. RT_TABLE_LOCAL、RT_TABLE_MAIN路由表的初始化、proc文件系统的初始化

register_pernet_subsys(&fib_net_ops);

/*
 * Per-network-namespace hooks of the FIB: fib_net_init() is the .init
 * callback and fib_net_exit() the .exit callback.
 */
static struct pernet_operations fib_net_ops = {
	.init = fib_net_init,
	.exit = fib_net_exit,
};

/*
 * Per-namespace FIB setup: initialise the FIB tables, then the netlink
 * fib-lookup facility, then the proc entries. Each step that fails undoes
 * the steps completed before it and the negative error code is returned;
 * on success the (non-negative) result of the last step is returned.
 */
static int __net_init fib_net_init(struct net *net)
{
	int rc;

#ifdef CONFIG_IP_ROUTE_CLASSID
	net->ipv4.fib_num_tclassid_users = 0;
#endif
	/* FIB table state for this namespace */
	rc = ip_fib_net_init(net);
	if (rc < 0)
		return rc;

	/* netlink fib lookup */
	rc = nl_fib_lookup_init(net);
	if (rc < 0)
		goto undo_fib;

	/* proc entries */
	rc = fib_proc_init(net);
	if (rc < 0)
		goto undo_lookup;

	return rc;

undo_lookup:
	nl_fib_lookup_exit(net);
undo_fib:
	ip_fib_net_exit(net);
	return rc;
}

c. fib网络设备的通知链netdev_chain注册

register_netdevice_notifier(&fib_netdev_notifier); //fib网络设备的通知链netdev_chain注册

/*
 * Abridged excerpt: only the step that adds @nb to the netdev_chain raw
 * notifier chain is shown; the surrounding code (including the declaration
 * of err and the return) has been elided by the author.
 */
int register_netdevice_notifier(struct notifier_block *nb)
{
    // ... (elided)
	err = raw_notifier_chain_register(&netdev_chain, nb);
    // ... (elided)
}
EXPORT_SYMBOL(register_netdevice_notifier);

 

d. fib inet地址通知链inetaddr_chain注册

register_inetaddr_notifier(&fib_inetaddr_notifier); //fib inet地址通知链inetaddr_chain注册

/*
 * Add @nb to the inetaddr_chain blocking notifier chain and return the
 * result of the registration.
 */
int register_inetaddr_notifier(struct notifier_block *nb)
{
	int rc = blocking_notifier_chain_register(&inetaddr_chain, nb);

	return rc;
}

e. fib_trie_init()

fib_trie_init(); //fn_alias_kmem、trie_leaf_kmem 两个slab缓存的创建(供fib_trie即LC-trie路由查找算法使用)

/*
 * Create the two slab caches used by fib_trie: one for struct fib_alias
 * objects and one sized to hold either a struct leaf or a struct leaf_info.
 * Both are created with SLAB_PANIC.
 */
void __init fib_trie_init(void)
{
	size_t leaf_obj_size = max(sizeof(struct leaf), sizeof(struct leaf_info));

	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie", leaf_obj_size,
					   0, SLAB_PANIC, NULL);
}

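到这里已经分析了ip_fib_init()函数内部的各个模块,现在回头再来看下第5点需要分析的问题,即blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa)的处理模块。需要注意:从上面ip_fib_init()的注册代码可以看到,inetaddr_chain上注册的是fib_inetaddr_notifier(其回调函数为fib_inetaddr_event),而fib_netdev_notifier(其回调函数为fib_netdev_event)注册在netdev_chain上,处理的是网络设备事件;两者在收到NETDEV_UP事件时都会调用fib_add_ifaddr()来添加路由。下面以fib_netdev_notifier为例进行分析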

register_netdevice_notifier(&fib_netdev_notifier); //fib网络设备的通知链netdev_chain注册

/* Notifier block whose callback is fib_netdev_event(). */
static struct notifier_block fib_netdev_notifier = {
	.notifier_call = fib_netdev_event,
};

/*
 * Notifier callback of fib_netdev_notifier. @ptr is used as the affected
 * struct net_device.
 *
 * NETDEV_UNREGISTER: disable IP on the device and flush its routes.
 * NETDEV_UP: call fib_add_ifaddr() for every address on the device, bump
 *            dev_addr_genid and flush the route cache.
 * NETDEV_DOWN: disable IP on the device.
 * NETDEV_CHANGEMTU / NETDEV_CHANGE: flush the route cache.
 * Always returns NOTIFY_DONE.
 */
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct in_device *in_dev;
	struct net *net = dev_net(dev);

	if (event == NETDEV_UNREGISTER) {
		fib_disable_ip(dev, 2);
		rt_flush_dev(dev);
		return NOTIFY_DONE;
	}

	/* Devices without IPv4 state have nothing to update. */
	in_dev = __in_dev_get_rtnl(dev);
	if (!in_dev)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP: // the event discussed in this walkthrough
		/* for_ifa()/endfor_ifa() iterate over in_dev's addresses, naming each one ifa */
		for_ifa(in_dev) {
			fib_add_ifaddr(ifa);
		} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		fib_sync_up(dev);
#endif
		atomic_inc(&net->ipv4.dev_addr_genid);
		rt_cache_flush(net);
		break;
	case NETDEV_DOWN:
		fib_disable_ip(dev, 0);
		break;
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
		rt_cache_flush(net);
		break;
	}
	return NOTIFY_DONE;
}

本次的事件为NETDEV_UP,所以会调用上面的函数fib_add_ifaddr()

/*
 * Add the routes implied by interface address @ifa.
 *
 * Always adds a /32 RTN_LOCAL route for the local address. If the device
 * is up it additionally adds: a broadcast route for an explicitly assigned
 * broadcast address, and - for a primary address with a non-zero prefix
 * that is not a bare host address - the prefix route plus the two
 * network-specific broadcast routes when the prefix is shorter than /31.
 * For a secondary address the primary address of the same prefix is used
 * as the route's reference address.
 */
void fib_add_ifaddr(struct in_ifaddr *ifa)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *prim = ifa;
	__be32 mask = ifa->ifa_mask;
	__be32 addr = ifa->ifa_local;
	__be32 prefix = ifa->ifa_address & mask;
	int route_type;

	if (ifa->ifa_flags & IFA_F_SECONDARY) {
		prim = inet_ifa_byprefix(in_dev, prefix, mask);
		if (!prim) {
			pr_warn("%s: bug: prim == NULL\n", __func__);
			return;
		}
	}

	/* host route for the local address itself (RTM_NEWROUTE / RTN_LOCAL) */
	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);

	if (!(dev->flags & IFF_UP))
		return;

	/* Add broadcast address, if it is explicitly assigned. */
	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);

	/* No prefix route for a zero network or a secondary address ... */
	if (ipv4_is_zeronet(prefix) || (ifa->ifa_flags & IFA_F_SECONDARY))
		return;

	/* ... nor when the prefix is just the address itself with a /32 (or longer) length. */
	if (prefix == addr && ifa->ifa_prefixlen >= 32)
		return;

	route_type = dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST;
	fib_magic(RTM_NEWROUTE, route_type, prefix, ifa->ifa_prefixlen, prim);

	/* Add network specific broadcasts, when it takes a sense */
	if (ifa->ifa_prefixlen >= 31)
		return;

	fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
	fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim);
}

在该函数内部将调用 如下函数

fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); //将调用RTM_NEWROUTE-RTN_LOCAL

/*
 * Add or delete one kernel-generated route for interface address @ifa.
 *
 * @cmd:     RTM_NEWROUTE inserts the route, anything else deletes it.
 * @type:    route type; RTN_UNICAST routes go to RT_TABLE_MAIN, all other
 *           types to RT_TABLE_LOCAL. RTN_LOCAL routes get host scope,
 *           all other types link scope.
 * @dst, @dst_len: destination prefix and its length.
 *
 * Silently does nothing when the table cannot be obtained.
 */
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
	struct net *net = dev_net(ifa->ifa_dev->dev);
	struct fib_table *tb;
	struct fib_config cfg = {
		.fc_protocol = RTPROT_KERNEL,
		.fc_type = type,
		.fc_dst = dst,
		.fc_dst_len = dst_len,
		.fc_prefsrc = ifa->ifa_local,
		.fc_oif = ifa->ifa_dev->dev->ifindex,
		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
		.fc_nlinfo = {
			.nl_net = net,
		},
	};

	tb = fib_new_table(net, type == RTN_UNICAST ? RT_TABLE_MAIN
						     : RT_TABLE_LOCAL);
	if (!tb)
		return;

	cfg.fc_table = tb->tb_id;
	cfg.fc_scope = (type == RTN_LOCAL) ? RT_SCOPE_HOST : RT_SCOPE_LINK;

	if (cmd == RTM_NEWROUTE) {
		fib_table_insert(tb, &cfg);
	} else {
		fib_table_delete(tb, &cfg);
	}
}

该函数先调用fib_new_table()查找fib_table表,然后根据命令类型是添加或删除路由,我们这里是添加路由,所以会调用fib_table_insert()接口函数(详见第6点),先来分析下fib_new_table()

tb = fib_new_table(net, cfg.fc_table); //查找id对应的fib_table表,不存在时新建一个

/*
 * Return the FIB table with the given @id in @net, creating it if needed.
 *
 * An @id of 0 means RT_TABLE_MAIN. An existing table is returned as is.
 * Otherwise a new table is allocated, recorded in the matching
 * net->ipv4.fib_local / fib_main / fib_default shortcut when @id is one of
 * the well-known tables, and added to the fib_table_hash bucket for @id.
 * Returns NULL when the allocation fails.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
	struct fib_table *table;
	unsigned int bucket;

	if (!id)
		id = RT_TABLE_MAIN;

	/* Already hashed under this id? Then there is nothing to create. */
	table = fib_get_table(net, id);
	if (table)
		return table;

	/* Allocate a fresh, empty table. */
	table = fib_trie_table(id);
	if (!table)
		return NULL;

	if (id == RT_TABLE_LOCAL)
		net->ipv4.fib_local = table;
	else if (id == RT_TABLE_MAIN)
		net->ipv4.fib_main = table;
	else if (id == RT_TABLE_DEFAULT)
		net->ipv4.fib_default = table;

	/* Publish the table on its hash chain. */
	bucket = id & (FIB_TABLE_HASHSZ - 1);
	hlist_add_head_rcu(&table->tb_hlist, &net->ipv4.fib_table_hash[bucket]);
	return table;
}

// Source of fib_get_table, as used by fib_new_table.
// Looks up the table with the given id on its hash chain; returns the table
// on a match and NULL otherwise. An id of 0 means RT_TABLE_MAIN.
struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct fib_table *found = NULL;
	struct fib_table *tb;
	struct hlist_head *head;
	unsigned int h;

	if (!id)
		id = RT_TABLE_MAIN;
	h = id & (FIB_TABLE_HASHSZ - 1); // bucket index: id masked by the hash size

	rcu_read_lock();
	// Background on how fib_table_hash[] is created: https://blog.csdn.net/guodong1010/article/details/52245555
	// Entries are put on these chains by fib_new_table().
	head = &net->ipv4.fib_table_hash[h];
	hlist_for_each_entry_rcu(tb, head, tb_hlist) { // scan the chain for the table with this id
		if (tb->tb_id == id) {
			found = tb;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}

// Source of fib_trie_table.
// Allocates a fib_table with room for a struct trie behind it, sets the id
// and the default-route bookkeeping fields, and zeroes the trie.
// Returns NULL when the allocation fails.
struct fib_table *fib_trie_table(u32 id)
{
	struct fib_table *table;
	struct trie *trie;

	table = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
			GFP_KERNEL);
	if (!table)
		return NULL;

	table->tb_id = id;
	table->tb_default = -1;
	table->tb_num_default = 0;

	/* the trie lives in the table's trailing tb_data area */
	trie = (struct trie *) table->tb_data;
	memset(trie, 0, sizeof(*trie));

	return table;
}

 

6. fib_table_insert() fib_table路由表添加

/*
 * fib_table_insert - add the route described by @cfg to routing table @tb.
 *
 * @tb:  table to insert into; its trie is reached through tb->tb_data
 * @cfg: route request; this function reads fc_dst, fc_dst_len, fc_tos,
 *       fc_type, fc_nlflags and fc_nlinfo from it
 *
 * Returns 0 on success, otherwise a negative errno:
 *   -EINVAL  prefix length > 32, or fc_dst has bits set outside the mask
 *   PTR_ERR  whatever error fib_create_info() reported
 *   -EEXIST  an alias with the same tos/priority exists and either
 *            NLM_F_EXCL was given or an exact (type, fib_info) match
 *            was found
 *   -ENOENT  a new alias would be needed but NLM_F_CREATE is not set
 *   -ENOBUFS allocating the new fib_alias failed
 *   -ENOMEM  fib_insert_node() failed
 *
 * On every failure after fib_create_info() succeeded, fi is handed to
 * fib_release_info() before returning.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	// reject destinations that have bits set beyond the prefix length
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg); // obtain the struct fib_info for this route; errors come back as ERR_PTR
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, key);  // look up the leaf for this key
	fa = NULL;

	if (l) { // a leaf for this key already exists
		fa_head = get_fa_head(l, plen); // alias list head for prefix length plen within that leaf
		fa = fib_find_alias(fa_head, tos, fi->fib_priority); // search that list by tos/priority (see the comment below)
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) { // an alias with the same tos and priority exists
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		// step back one entry so the "continue" iterator below starts at fa_first itself
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) { // caller asked to replace the existing entry
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				// exact match: success (err = 0) only if it is the first alias,
				// otherwise err stays -EEXIST; either way no new alias is built
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			// new_fa takes over fa_first's tos and state (minus FA_S_ACCESSED)
			// but carries the new fib_info and type
			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			// swap new_fa into fa's list position, then free the old alias
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			// flush the route cache only if the replaced alias had been marked accessed
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match) // exact duplicate: fail with -EEXIST
			goto out;

		// without NLM_F_APPEND, insert before the first alias of this tos/priority run;
		// with it, fa keeps the position where the loop above stopped
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); // no exact match was found and nothing was replaced: allocate a new alias
	if (new_fa == NULL)
		goto out;

	// initialise the new fib_alias
	new_fa->fa_info = fi; // bind the fib_info obtained above
	// record the tos/type for this alias
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) { // still NULL: no leaf was found for key, or get_fa_head() returned NULL for this plen
		fa_head = fib_insert_node(t, key, plen); // add the key/plen to the trie and get the new alias list head back
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	// keep count of entries with a zero-length prefix
	if (!plen)
		tb->tb_num_default++;

	// link new_fa in front of fa when fa is set, otherwise at the tail of fa_head
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}

在其函数内部有两个非常重要函数要分析

a. fa_head = fib_insert_node(t, key, plen); //插入一个节点,内部的实现还未理顺(包括几个重要的函数:tkey_sub_equals() tkey_extract_bits()  tnode_get_child()),待分析中,核心部分!!!

/*
 * fib_insert_node - make sure trie @t has a leaf for @key carrying a
 * leaf_info for prefix length @plen, and return that leaf_info's alias
 * list head.
 *
 * @t:    trie to modify
 * @key:  lookup key (fib_table_insert() passes the host-order, masked
 *        destination)
 * @plen: prefix length handed to leaf_info_new()
 *
 * Outline:
 *   - Descend from t->trie through tnodes for as long as
 *     tkey_sub_equals() accepts the key; tp tracks the last tnode that
 *     was accepted.
 *   - Case 1: the walk ended on a leaf with the same key. A new leaf_info
 *     is attached to it and the function returns without rebalancing.
 *   - Otherwise a new leaf and leaf_info are allocated.
 *     Case 2: the trie is non-empty and the walk ended on an empty slot,
 *     so the leaf goes straight into tp.
 *     Case 3: anything else. A new 1-bit tnode is created holding the new
 *     leaf and the old node n (possibly NULL) as its two children, and it
 *     is hooked into tp or becomes the trie root.
 *   - trie_rebalance(t, tp) is run for cases 2 and 3.
 *
 * Returns &li->falh of the newly created leaf_info, or NULL if
 * leaf_info_new(), leaf_new() or tnode_new() fails.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct rt_trie_node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in it, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		l = (struct leaf *) n;
		li = leaf_info_new(plen); // allocate a new leaf_info for this prefix length

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li); // attach li to the existing leaf's l->list
		goto done;
	}
	l = leaf_new(); // no leaf with this key yet: create one

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen); // create the leaf_info for this prefix length

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li); // attach li to the new leaf's l->list

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct rt_trie_node *)l, tp); // make tp the new leaf's parent

		cindex = tkey_extract_bits(key, tp->pos, tp->bits); // child index derived from key using tp's pos/bits
		put_child(tp, cindex, (struct rt_trie_node *)l); // store the leaf in tp at that index (per the original annotation, the tnode's child[] slot)
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			// presumably the first bit position at or after pos where key and n->key differ - TODO confirm
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct rt_trie_node *)tn, tp);

		// the new leaf goes into the child selected by its bit at newpos,
		// the old node n (may be NULL) into the other child
		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(tn, missbit, (struct rt_trie_node *)l);
		put_child(tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(tp, cindex, (struct rt_trie_node *)tn);
		} else {
			// no parent: the new tnode becomes the trie root
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}

b. 路由事件RTM_NEWROUTE发送之后的接收处理

事件发送:

rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
          &cfg->fc_nlinfo, 0)

事件接收(在第5.c中有提到):

	//注册路由接口处理(新建、删除、获取路由)
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);

所以这里会调用 inet_rtm_newroute()接口函数,并将fib_config路由信息添加到路由表上

/*
 * inet_rtm_newroute - handler registered for RTM_NEWROUTE on PF_INET.
 *
 * Converts the netlink request into a struct fib_config, fetches (or
 * creates) the table named by cfg.fc_table and inserts the route into it.
 *
 * Returns the negative error from rtm_to_fib_config(), -ENOBUFS when no
 * table could be obtained, or otherwise the result of fib_table_insert().
 */
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct fib_config cfg;
	struct fib_table *tb;
	struct net *net = sock_net(skb->sk);
	int rc;

	/* Fill in cfg (struct fib_config) from the netlink message. */
	rc = rtm_to_fib_config(net, skb, nlh, &cfg);
	if (rc < 0)
		return rc;

	/* Look up the fib_table, creating it when it does not exist yet. */
	tb = fib_new_table(net, cfg.fc_table);
	if (!tb)
		return -ENOBUFS;

	/* Add the route described by cfg to that table. */
	return fib_table_insert(tb, &cfg);
}

特别注意,在fib_table_insert()函数内部会继续调用如下的函数

	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

嵌套调用,这里与之前分析的地址嵌套调用是一致的。

7. 总结

      本博客涉及到的一些比较核心的功能代码还未具体分析(如通过pos+bits计算地址索引),不过针对应用层ifconfig所触发的内核内部流程及机制已分析得很明确,这正是我需要的。框架流程弄明白了,具体细节就只是时间问题!

又到深夜,我家宝宝出生第19天,Happy & life.

 

问题处理:

这里有个疑问:相应的消息将通过RTM_NEWROUTE命令进行发送,而接收该消息的处理函数是前面分析到的 inet_rtm_newroute()函数,难道这里会执行多次嵌套调用?(已经解决)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值