raw socket按照检测到的报文主要可以分三类:mac报文、IP报文、传输层报文。本文分析二层报文,例如:socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))方式创建的socket(protocol参数为网络字节序),可以检测到所有的二层报文。raw socket实现的核心在于:建立socket,并把它注册到相应的数据结构中(例如ptype_all、ptype_base),从而在收包阶段把报文提交给该socket处理。
1、sys_socket函数
-
/*
 * socket(2) system call entry point.
 *
 * The upper bits of @type may carry SOCK_CLOEXEC / SOCK_NONBLOCK; they are
 * split off into file flags, any other flag bit is rejected with -EINVAL,
 * and the remaining socket type is handed to sock_create().  The new socket
 * is then bound to a file descriptor with sock_map_fd().
 *
 * Returns the value of sock_map_fd() on success, or a negative errno from
 * the flag check, sock_create() or sock_map_fd().  The socket is released
 * again when sock_map_fd() fails.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags;
	int err;
	int fd;

	/* Check the SOCK_* constants for consistency. */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Separate the flag bits from the socket type proper. */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK to O_NONBLOCK where the values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Create the socket object. */
	err = sock_create(family, type, protocol, &sock);
	if (err < 0)
		return err;

	fd = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (fd < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return fd;
}
2、sock_create函数
-
int sock_create(int family, int type, int protocol, struct socket **res)
-
{
-
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res,
0);
-
}
3、__sock_create函数
-
int __sock_create(struct net *net,
int family,
int type,
int protocol,
-
struct socket **res,
int kern)
-
{
-
int err;
-
struct socket *sock;
-
const
struct net_proto_family *pf;
-
-
/*
-
* Check protocol is in range
-
*/
-
if (family <
0 || family >= NPROTO)
-
return -EAFNOSUPPORT;
-
if (type <
0 || type >= SOCK_MAX)
-
return -EINVAL;
-
-
/* Compatibility.
-
-
This uglymoron is moved from INET layer to here to avoid
-
deadlock in module load.
-
*/
-
if (family == PF_INET && type == SOCK_PACKET) {
-
static
int warned;
-
if (!warned) {
-
warned =
1;
-
pr_info(
"%s uses obsolete (PF_INET,SOCK_PACKET)\n",
-
current->comm);
-
}
-
family = PF_PACKET;
-
}
-
-
err = security_socket_create(family, type, protocol, kern);
-
if (err)
-
return err;
-
-
/*
-
* Allocate the socket and allow the family to set things up. if
-
* the protocol is 0, the family is instructed to select an appropriate
-
* default.
-
*/
-
sock = sock_alloc();
//申请socket对象
-
if (!sock) {
-
net_warn_ratelimited(
"socket: no more sockets\n");
-
return -ENFILE;
/* Not exactly a match, but its the
-
closest posix thing */
-
}
-
-
sock->type = type;
//设置socket的type
-
-
#ifdef CONFIG_MODULES
-
/* Attempt to load a protocol module if the find failed.
-
*
-
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
-
* requested real, full-featured networking support upon configuration.
-
* Otherwise module support will break!
-
*/
-
if (rcu_access_pointer(net_families[family]) ==
NULL)
//得到family对应的net_proto_family
-
request_module(
"net-pf-%d", family);
-
#endif
-
-
rcu_read_lock();
-
pf = rcu_dereference(net_families[family]);
-
err = -EAFNOSUPPORT;
-
if (!pf)
-
goto out_release;
-
-
/*
-
* We will call the ->create function, that possibly is in a loadable
-
* module, so we have to bump that loadable module refcnt first.
-
*/
-
if (!try_module_get(pf->owner))
-
goto out_release;
-
-
/* Now protected by module ref count */
-
rcu_read_unlock();
-
-
err = pf->create(net, sock, protocol, kern);
//调用net_proto_family的create,实现收包,AF_PACKET对应的是packet_family_ops
-
if (err <
0)
-
goto out_module_put;
-
-
/*
-
* Now to bump the refcnt of the [loadable] module that owns this
-
* socket at sock_release time we decrement its refcnt.
-
*/
-
if (!try_module_get(sock->ops->owner))
-
goto out_module_busy;
-
-
/*
-
* Now that we're done with the ->create function, the [loadable]
-
* module can have its refcnt decremented
-
*/
-
module_put(pf->owner);
-
err = security_socket_post_create(sock, family, type, protocol, kern);
-
if (err)
-
goto out_sock_release;
-
*res = sock;
-
-
return
0;
-
-
out_module_busy:
-
err = -EAFNOSUPPORT;
-
out_module_put:
-
sock->ops =
NULL;
-
module_put(pf->owner);
-
out_sock_release:
-
sock_release(sock);
-
return err;
-
-
out_release:
-
rcu_read_unlock();
-
goto out_sock_release;
-
}
4、packet_create函数
-
static int packet_create(struct net *net, struct socket *sock, int protocol,
-
int kern)
-
{
-
struct sock *sk;
-
struct packet_sock *po;
-
__be16 proto = (__force __be16)protocol;
/* weird, but documented */
-
int err;
-
-
if (!ns_capable(net->user_ns, CAP_NET_RAW))
-
return -EPERM;
-
if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
//仅支持这三类socket
-
sock->type != SOCK_PACKET)
-
return -ESOCKTNOSUPPORT;
-
-
sock->state = SS_UNCONNECTED;
-
-
err = -ENOBUFS;
-
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
//创建sock对象
-
if (sk ==
NULL)
-
goto out;
-
-
sock->ops = &packet_ops;
-
if (sock->type == SOCK_PACKET)
-
sock->ops = &packet_ops_spkt;
//SOCK_PACKET类型的socket和其他的两种有差别
-
-
sock_init_data(sock, sk);
//初始化sock对象
-
-
po = pkt_sk(sk);
-
sk->sk_family = PF_PACKET;
-
po->num = proto;
-
po->xmit = dev_queue_xmit;
//发送报文的方式是直接二层发包,需要用户构建完整的二层报文
-
-
err = packet_alloc_pending(po);
-
if (err)
-
goto out2;
-
-
packet_cached_dev_reset(po);
-
-
sk->sk_destruct = packet_sock_destruct;
-
sk_refcnt_debug_inc(sk);
-
-
/*
-
* Attach a protocol block
-
*/
-
-
spin_lock_init(&po->bind_lock);
-
mutex_init(&po->pg_vec_lock);
-
po->prot_hook.func = packet_rcv;
//raw socket使用此函数,二层收包时调用该func收包
-
-
if (sock->type == SOCK_PACKET)
//SOCK_PACKET类型的socket与其他两种不同
-
po->prot_hook.func = packet_rcv_spkt;
-
-
po->prot_hook.af_packet_priv = sk;
-
-
if (proto) {
-
po->prot_hook.type = proto;
//协议类型,匹配到协议类型时,会调用packet_rcv函数
-
register_prot_hook(sk);
//注册到协议处理中,这是实现抓包的关键,注册到ptype_all或ptype_base
-
}
-
-
mutex_lock(&net->packet.sklist_lock);
-
sk_add_node_rcu(sk, &net->packet.sklist);
-
mutex_unlock(&net->packet.sklist_lock);
-
-
preempt_disable();
-
sock_prot_inuse_add(net, &packet_proto,
1);
-
preempt_enable();
-
-
return
0;
-
out2:
-
sk_free(sk);
-
out:
-
return err;
-
}
5、register_prot_hook函数
-
static void register_prot_hook(struct sock *sk)
-
{
-
struct packet_sock *po = pkt_sk(sk);
-
-
if (!po->running) {
-
if (po->fanout)
-
__fanout_link(sk, po);
-
else
-
dev_add_pack(&po->prot_hook);
//注册到ptype_all或ptype_base中
-
-
sock_hold(sk);
-
po->running =
1;
-
}
-
}
到这里可以知道,AF_PACKET raw socket的收包是通过在ptype_all或ptype_base中注册prot_hook实现的。不过此时skb->data已经指向网络层(IP)头,而用户收到的报文却包含mac头,这说明mac头的恢复是在func函数中完成的(见packet_rcv中的skb_push),接下来看下raw socket的func函数的实现。
6、packet_rcv函数
-
/*
-
* This function makes lazy skb cloning in hope that most of packets
-
* are discarded by BPF.
-
*
-
* Note tricky part: we DO mangle shared skb! skb->data, skb->len
-
* and skb->cb are mangled. It works because (and until) packets
-
* falling here are owned by current CPU. Output packets are cloned
-
* by dev_queue_xmit_nit(), input packets are processed by net_bh
-
* sequencially, so that if we return skb to original state on exit,
-
* we will not harm anyone.
-
*/
-
-
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
-
struct packet_type *pt, struct net_device *orig_dev)
-
{
-
struct sock *sk;
-
struct sockaddr_ll *sll;
-
struct packet_sock *po;
-
u8 *skb_head = skb->data;
-
int skb_len = skb->len;
-
unsigned
int snaplen, res;
-
-
if (skb->pkt_type == PACKET_LOOPBACK)
//loopback报文不处理
-
goto drop;
-
-
sk = pt->af_packet_priv;
//在packet_type中保存sock对象
-
po = pkt_sk(sk);
-
-
if (!net_eq(dev_net(dev), sock_net(sk)))
//检测是否在同一个namespace中
-
goto drop;
-
-
skb->dev = dev;
-
-
if (dev->header_ops) {
//常见设备都包含
-
/* The device has an explicit notion of ll header,
-
* exported to higher levels.
-
*
-
* Otherwise, the device hides details of its frame
-
* structure, so that corresponding packet head is
-
* never delivered to user.
-
*/
-
if (sk->sk_type != SOCK_DGRAM)
-
skb_push(skb, skb->data - skb_mac_header(skb));
//raw socket和packet socket都会提交给用户二层报文
-
else
if (skb->pkt_type == PACKET_OUTGOING) {
-
/* Special case: outgoing packets have ll header at head */
-
skb_pull(skb, skb_network_offset(skb));
-
}
-
}
-
-
snaplen = skb->len;
-
-
res = run_filter(skb, sk, snaplen);
-
if (!res)
-
goto drop_n_restore;
-
if (snaplen > res)
-
snaplen = res;
-
-
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
-
goto drop_n_acct;
-
-
if (skb_shared(skb)) {
-
struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
-
if (nskb ==
NULL)
-
goto drop_n_acct;
-
-
if (skb_head != skb->data) {
-
skb->data = skb_head;
-
skb->len = skb_len;
-
}
-
consume_skb(skb);
-
skb = nskb;
-
}
-
-
sock_skb_cb_check_size(
sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN -
8);
-
-
sll = &PACKET_SKB_CB(skb)->sa.ll;
-
sll->sll_hatype = dev->type;
-
sll->sll_pkttype = skb->pkt_type;
-
if (unlikely(po->origdev))
-
sll->sll_ifindex = orig_dev->ifindex;
-
else
-
sll->sll_ifindex = dev->ifindex;
-
-
sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
-
-
/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
-
* Use their space for storing the original skb length.
-
*/
-
PACKET_SKB_CB(skb)->sa.origlen = skb->len;
-
-
if (pskb_trim(skb, snaplen))
-
goto drop_n_acct;
-
-
skb_set_owner_r(skb, sk);
-
skb->dev =
NULL;
-
skb_dst_drop(skb);
-
-
/* drop conntrack reference */
-
nf_reset(skb);
-
-
spin_lock(&sk->sk_receive_queue.lock);
-
po->stats.stats1.tp_packets++;
-
sock_skb_set_dropcount(sk, skb);
-
__skb_queue_tail(&sk->sk_receive_queue, skb);
//skb保存到sock对象的接收队列中
-
spin_unlock(&sk->sk_receive_queue.lock);
-
sk->sk_data_ready(sk);
//唤醒睡眠在该sock的进程或线程,该线程唤醒后会从接收队列中获取报文进行处理
-
return
0;
-
-
drop_n_acct:
-
spin_lock(&sk->sk_receive_queue.lock);
-
po->stats.stats1.tp_drops++;
-
atomic_inc(&sk->sk_drops);
-
spin_unlock(&sk->sk_receive_queue.lock);
-
-
drop_n_restore:
-
if (skb_head != skb->data && skb_shared(skb)) {
-
skb->data = skb_head;
-
skb->len = skb_len;
-
}
-
drop:
-
consume_skb(skb);
-
return
0;
-
}