AF_PACKET raw socket实现原理分析

raw socket按照能够检测到的报文层次主要可以分为三类:MAC(二层)报文、IP报文、传输层报文。本文分析二层报文,例如通过socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))方式创建的socket,可以检测到所有的二层报文(注意protocol参数必须是网络字节序)。raw socket实现的核心在于:创建socket时把它注册到相应的数据结构中(例如ptype_all链表、ptype_base哈希表),从而在收包阶段把报文提交给该socket处理。

1、sys_socket函数


  
  
  1. SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
  2. {
  3. int retval;
  4. struct socket *sock;
  5. int flags;
  6. /* Check the SOCK_* constants for consistency. */
  7. BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  8. BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  9. BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  10. BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
  11. flags = type & ~SOCK_TYPE_MASK;
  12. if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
  13. return -EINVAL;
  14. type &= SOCK_TYPE_MASK;
  15. if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
  16. flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
  17. retval = sock_create(family, type, protocol, &sock); //创建socket对象
  18. if (retval < 0)
  19. goto out;
  20. retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
  21. if (retval < 0)
  22. goto out_release;
  23. out:
  24. /* It may be already another descriptor 8) Not kernel problem. */
  25. return retval;
  26. out_release:
  27. sock_release(sock);
  28. return retval;
  29. }
2、sock_create函数


  
  
  1. int sock_create(int family, int type, int protocol, struct socket **res)
  2. {
  3. return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
  4. }
3、__sock_create函数


  
  
  1. int __sock_create(struct net *net, int family, int type, int protocol,
  2. struct socket **res, int kern)
  3. {
  4. int err;
  5. struct socket *sock;
  6. const struct net_proto_family *pf;
  7. /*
  8. * Check protocol is in range
  9. */
  10. if (family < 0 || family >= NPROTO)
  11. return -EAFNOSUPPORT;
  12. if (type < 0 || type >= SOCK_MAX)
  13. return -EINVAL;
  14. /* Compatibility.
  15. This uglymoron is moved from INET layer to here to avoid
  16. deadlock in module load.
  17. */
  18. if (family == PF_INET && type == SOCK_PACKET) {
  19. static int warned;
  20. if (!warned) {
  21. warned = 1;
  22. pr_info( "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
  23. current->comm);
  24. }
  25. family = PF_PACKET;
  26. }
  27. err = security_socket_create(family, type, protocol, kern);
  28. if (err)
  29. return err;
  30. /*
  31. * Allocate the socket and allow the family to set things up. if
  32. * the protocol is 0, the family is instructed to select an appropriate
  33. * default.
  34. */
  35. sock = sock_alloc(); //申请socket对象
  36. if (!sock) {
  37. net_warn_ratelimited( "socket: no more sockets\n");
  38. return -ENFILE; /* Not exactly a match, but its the
  39. closest posix thing */
  40. }
  41. sock->type = type; //设置socket的type
  42. #ifdef CONFIG_MODULES
  43. /* Attempt to load a protocol module if the find failed.
  44. *
  45. * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
  46. * requested real, full-featured networking support upon configuration.
  47. * Otherwise module support will break!
  48. */
  49. if (rcu_access_pointer(net_families[family]) == NULL) //得到family对应的net_proto_family
  50. request_module( "net-pf-%d", family);
  51. #endif
  52. rcu_read_lock();
  53. pf = rcu_dereference(net_families[family]);
  54. err = -EAFNOSUPPORT;
  55. if (!pf)
  56. goto out_release;
  57. /*
  58. * We will call the ->create function, that possibly is in a loadable
  59. * module, so we have to bump that loadable module refcnt first.
  60. */
  61. if (!try_module_get(pf->owner))
  62. goto out_release;
  63. /* Now protected by module ref count */
  64. rcu_read_unlock();
  65. err = pf->create(net, sock, protocol, kern); //调用net_proto_family的create,实现收包,AF_PACKET对应的是packet_family_ops
  66. if (err < 0)
  67. goto out_module_put;
  68. /*
  69. * Now to bump the refcnt of the [loadable] module that owns this
  70. * socket at sock_release time we decrement its refcnt.
  71. */
  72. if (!try_module_get(sock->ops->owner))
  73. goto out_module_busy;
  74. /*
  75. * Now that we're done with the ->create function, the [loadable]
  76. * module can have its refcnt decremented
  77. */
  78. module_put(pf->owner);
  79. err = security_socket_post_create(sock, family, type, protocol, kern);
  80. if (err)
  81. goto out_sock_release;
  82. *res = sock;
  83. return 0;
  84. out_module_busy:
  85. err = -EAFNOSUPPORT;
  86. out_module_put:
  87. sock->ops = NULL;
  88. module_put(pf->owner);
  89. out_sock_release:
  90. sock_release(sock);
  91. return err;
  92. out_release:
  93. rcu_read_unlock();
  94. goto out_sock_release;
  95. }
4、packet_create函数


  
  
  1. static int packet_create(struct net *net, struct socket *sock, int protocol,
  2. int kern)
  3. {
  4. struct sock *sk;
  5. struct packet_sock *po;
  6. __be16 proto = (__force __be16)protocol; /* weird, but documented */
  7. int err;
  8. if (!ns_capable(net->user_ns, CAP_NET_RAW))
  9. return -EPERM;
  10. if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && //仅支持这三类socket
  11. sock->type != SOCK_PACKET)
  12. return -ESOCKTNOSUPPORT;
  13. sock->state = SS_UNCONNECTED;
  14. err = -ENOBUFS;
  15. sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); //创建sock对象
  16. if (sk == NULL)
  17. goto out;
  18. sock->ops = &packet_ops;
  19. if (sock->type == SOCK_PACKET)
  20. sock->ops = &packet_ops_spkt; //SOCK_PACKET类型的socket和其他的两种有差别
  21. sock_init_data(sock, sk); //初始化sock对象
  22. po = pkt_sk(sk);
  23. sk->sk_family = PF_PACKET;
  24. po->num = proto;
  25. po->xmit = dev_queue_xmit; //发送报文的方式是直接二层发包,需要用户构建完整的二层报文
  26. err = packet_alloc_pending(po);
  27. if (err)
  28. goto out2;
  29. packet_cached_dev_reset(po);
  30. sk->sk_destruct = packet_sock_destruct;
  31. sk_refcnt_debug_inc(sk);
  32. /*
  33. * Attach a protocol block
  34. */
  35. spin_lock_init(&po->bind_lock);
  36. mutex_init(&po->pg_vec_lock);
  37. po->prot_hook.func = packet_rcv; //raw socket使用此函数,二层收包时调用该func收包
  38. if (sock->type == SOCK_PACKET) //SOCK_PACKET类型的socket与其他两种不同
  39. po->prot_hook.func = packet_rcv_spkt;
  40. po->prot_hook.af_packet_priv = sk;
  41. if (proto) {
  42. po->prot_hook.type = proto; //协议类型,匹配到协议类型时,会调用packet_rcv函数
  43. register_prot_hook(sk); //注册到协议处理中,这是实现抓包的关键,注册到ptype_all或ptype_base
  44. }
  45. mutex_lock(&net->packet.sklist_lock);
  46. sk_add_node_rcu(sk, &net->packet.sklist);
  47. mutex_unlock(&net->packet.sklist_lock);
  48. preempt_disable();
  49. sock_prot_inuse_add(net, &packet_proto, 1);
  50. preempt_enable();
  51. return 0;
  52. out2:
  53. sk_free(sk);
  54. out:
  55. return err;
  56. }
5、register_prot_hook函数


  
  
  1. static void register_prot_hook(struct sock *sk)
  2. {
  3. struct packet_sock *po = pkt_sk(sk);
  4. if (!po->running) {
  5. if (po->fanout)
  6. __fanout_link(sk, po);
  7. else
  8. dev_add_pack(&po->prot_hook); //注册到ptype_all或ptype_base中
  9. sock_hold(sk);
  10. po->running = 1;
  11. }
  12. }

到这里可以知道,AF_PACKET raw socket的收包是通过把packet_type注册到ptype_all或ptype_base中实现的。不过在协议栈调用已注册的func回调时,skb->data已经指向网络层头(例如IP头),而用户收到的报文却包含mac头,这说明mac头的恢复是在func函数内部完成的(通过skb_push把data指针回退到mac头)。接下来看一下raw socket的func函数,即packet_rcv的实现。


6、packet_rcv函数


  
  
  1. /*
  2. * This function makes lazy skb cloning in hope that most of packets
  3. * are discarded by BPF.
  4. *
  5. * Note tricky part: we DO mangle shared skb! skb->data, skb->len
  6. * and skb->cb are mangled. It works because (and until) packets
  7. * falling here are owned by current CPU. Output packets are cloned
  8. * by dev_queue_xmit_nit(), input packets are processed by net_bh
  9. * sequencially, so that if we return skb to original state on exit,
  10. * we will not harm anyone.
  11. */
  12. static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
  13. struct packet_type *pt, struct net_device *orig_dev)
  14. {
  15. struct sock *sk;
  16. struct sockaddr_ll *sll;
  17. struct packet_sock *po;
  18. u8 *skb_head = skb->data;
  19. int skb_len = skb->len;
  20. unsigned int snaplen, res;
  21. if (skb->pkt_type == PACKET_LOOPBACK) //loopback报文不处理
  22. goto drop;
  23. sk = pt->af_packet_priv; //在packet_type中保存sock对象
  24. po = pkt_sk(sk);
  25. if (!net_eq(dev_net(dev), sock_net(sk))) //检测是否在同一个namespace中
  26. goto drop;
  27. skb->dev = dev;
  28. if (dev->header_ops) { //常见设备都包含
  29. /* The device has an explicit notion of ll header,
  30. * exported to higher levels.
  31. *
  32. * Otherwise, the device hides details of its frame
  33. * structure, so that corresponding packet head is
  34. * never delivered to user.
  35. */
  36. if (sk->sk_type != SOCK_DGRAM)
  37. skb_push(skb, skb->data - skb_mac_header(skb)); //raw socket和packet socket都会提交给用户二层报文
  38. else if (skb->pkt_type == PACKET_OUTGOING) {
  39. /* Special case: outgoing packets have ll header at head */
  40. skb_pull(skb, skb_network_offset(skb));
  41. }
  42. }
  43. snaplen = skb->len;
  44. res = run_filter(skb, sk, snaplen);
  45. if (!res)
  46. goto drop_n_restore;
  47. if (snaplen > res)
  48. snaplen = res;
  49. if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
  50. goto drop_n_acct;
  51. if (skb_shared(skb)) {
  52. struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
  53. if (nskb == NULL)
  54. goto drop_n_acct;
  55. if (skb_head != skb->data) {
  56. skb->data = skb_head;
  57. skb->len = skb_len;
  58. }
  59. consume_skb(skb);
  60. skb = nskb;
  61. }
  62. sock_skb_cb_check_size( sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
  63. sll = &PACKET_SKB_CB(skb)->sa.ll;
  64. sll->sll_hatype = dev->type;
  65. sll->sll_pkttype = skb->pkt_type;
  66. if (unlikely(po->origdev))
  67. sll->sll_ifindex = orig_dev->ifindex;
  68. else
  69. sll->sll_ifindex = dev->ifindex;
  70. sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
  71. /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
  72. * Use their space for storing the original skb length.
  73. */
  74. PACKET_SKB_CB(skb)->sa.origlen = skb->len;
  75. if (pskb_trim(skb, snaplen))
  76. goto drop_n_acct;
  77. skb_set_owner_r(skb, sk);
  78. skb->dev = NULL;
  79. skb_dst_drop(skb);
  80. /* drop conntrack reference */
  81. nf_reset(skb);
  82. spin_lock(&sk->sk_receive_queue.lock);
  83. po->stats.stats1.tp_packets++;
  84. sock_skb_set_dropcount(sk, skb);
  85. __skb_queue_tail(&sk->sk_receive_queue, skb); //skb保存到sock对象的接收队列中
  86. spin_unlock(&sk->sk_receive_queue.lock);
  87. sk->sk_data_ready(sk); //唤醒睡眠在该sock的进程或线程,该线程唤醒后会从接收队列中获取报文进行处理
  88. return 0;
  89. drop_n_acct:
  90. spin_lock(&sk->sk_receive_queue.lock);
  91. po->stats.stats1.tp_drops++;
  92. atomic_inc(&sk->sk_drops);
  93. spin_unlock(&sk->sk_receive_queue.lock);
  94. drop_n_restore:
  95. if (skb_head != skb->data && skb_shared(skb)) {
  96. skb->data = skb_head;
  97. skb->len = skb_len;
  98. }
  99. drop:
  100. consume_skb(skb);
  101. return 0;
  102. }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值