socket fs(2)

/************************************************************************************/

socketfs的创建

sock_init ->
{
    register_filesystem(&sock_fs_type);
    sock_mnt = kern_mount(&sock_fs_type);
}
/*
 * Filesystem type descriptor for the socket pseudo-filesystem.
 * It is named "sockfs", uses sockfs_mount as its mount callback and
 * kill_anon_super as its kill_sb callback.
 */
static struct file_system_type sock_fs_type = {
    .name =        "sockfs",
    .mount =    sockfs_mount,
    .kill_sb =    kill_anon_super,
};

kern_mount -> kern_mount_data -> vfs_kern_mount -> mount_fs ->type->mount();
/*
 * Mount callback of sockfs. Delegates to mount_pseudo() with the root
 * name "socket:", the sockfs super and dentry operations, and
 * SOCKFS_MAGIC. The flags, dev_name and data arguments are not used.
 */
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
             int flags, const char *dev_name, void *data)
{
    struct dentry *root_dentry;

    root_dentry = mount_pseudo(fs_type, "socket:", &sockfs_ops,
                   &sockfs_dentry_operations, SOCKFS_MAGIC);
    return root_dentry;
}

/*
 * Super operations handed to mount_pseudo() by sockfs_mount():
 * inode allocation/destruction go through the sock_* helpers and
 * statfs is served by simple_statfs.
 */
static const struct super_operations sockfs_ops = {
    .alloc_inode    = sock_alloc_inode,
    .destroy_inode    = sock_destroy_inode,
    .statfs        = simple_statfs,
};

/*
 * Dentry operations handed to mount_pseudo() by sockfs_mount();
 * only the d_dname callback is provided (sockfs_dname).
 */
static const struct dentry_operations sockfs_dentry_operations = {
    .d_dname  = sockfs_dname,
};


/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 *
 * Takes a superblock from sget(), gives it a single directory inode
 * (inode number 1) and a root dentry named @name, and returns a reference
 * to that root dentry.
 *
 * @fs_type: filesystem type passed to sget()
 * @name:    name given to the root dentry
 * @ops:     super operations; &simple_super_operations is used when NULL
 * @dops:    dentry operations stored in s->s_d_op
 * @magic:   value stored in s->s_magic
 *
 * Returns the root dentry (through dget()), the sget() error re-cast as a
 * dentry pointer, or ERR_PTR(-ENOMEM) when the root inode or the root
 * dentry cannot be allocated.
 */
struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
    const struct super_operations *ops,
    const struct dentry_operations *dops, unsigned long magic)
{
    struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
    struct dentry *dentry;
    struct inode *root;
    struct qstr d_name = {.name = name, .len = strlen(name)};

    /* sget() reports failure as an error pointer; pass it on unchanged. */
    if (IS_ERR(s))
        return ERR_CAST(s);

    /* Basic superblock parameters. */
    s->s_flags = MS_NOUSER;
    s->s_maxbytes = MAX_LFS_FILESIZE;
    s->s_blocksize = PAGE_SIZE;
    s->s_blocksize_bits = PAGE_SHIFT;
    s->s_magic = magic;
    s->s_op = ops ? ops : &simple_super_operations;
    s->s_time_gran = 1;
    root = new_inode(s);
    if (!root)
        goto Enomem;
    /*
     * since this is the first inode, make it number 1. New inodes created
     * after this must take care not to collide with it (by passing
     * max_reserved of 1 to iunique).
     */
    root->i_ino = 1;
    root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
    root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
    dentry = __d_alloc(s, &d_name);
    if (!dentry) {
        /* The root inode is not attached to anything yet; drop it here. */
        iput(root);
        goto Enomem;
    }
    /* Bind the inode to the dentry and publish it as the root. */
    d_instantiate(dentry, root);
    s->s_root = dentry;
    s->s_d_op = dops;
    /* MS_ACTIVE is set only after the root is fully set up. */
    s->s_flags |= MS_ACTIVE;
    return dget(s->s_root);

Enomem:
    /* Shared allocation-failure exit: release the superblock. */
    deactivate_locked_super(s);
    return ERR_PTR(-ENOMEM);
}

/****************************************************************************************/

socket系统调用的实现:

主要有两部分:创建 socket;将 socket 与文件描述符关联。
/*
 * socket(2) entry point.
 *
 * Splits the SOCK_* flag bits off 'type', creates the socket with
 * sock_create() and maps it to a file descriptor with sock_map_fd().
 * Returns the value of sock_map_fd() on success, -EINVAL for unknown flag
 * bits, or the negative error of whichever step failed.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    struct socket *sock;
    int flags;
    int retval;

    /* Check the SOCK_* constants for consistency.  */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    /* Separate the flag bits from the real socket type. */
    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    /* Translate SOCK_NONBLOCK when its value differs from O_NONBLOCK. */
    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
        flags &= ~SOCK_NONBLOCK;
        flags |= O_NONBLOCK;
    }

    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        return retval;

    /* The socket exists now; undo it if no descriptor can be attached. */
    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    if (retval < 0)
        sock_release(sock);

    /* It may be already another descriptor 8) Not kernel problem. */
    return retval;
}


创建socket

/*
 * sock_create: create a socket in the network namespace of the calling
 * task, i.e. the net_ns member of current->nsproxy. The trailing 0 is the
 * 'kern' argument of __sock_create().
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
    struct net *net = current->nsproxy->net_ns;

    return __sock_create(net, family, type, protocol, res, 0);
}

crash> task_struct | grep nsproxy
    struct nsproxy *nsproxy;
crash> nsproxy
struct nsproxy {
    atomic_t count;
    struct uts_namespace *uts_ns;
    struct ipc_namespace *ipc_ns;
    struct mnt_namespace *mnt_ns;
    struct pid_namespace *pid_ns;
    struct net *net_ns;
}
SIZE: 24

crash> struct net
struct net {
    atomic_t passive;
    atomic_t count;
    spinlock_t rules_mod_lock;
    struct list_head list;
    struct list_head cleanup_list;
    struct list_head exit_list;
    struct proc_dir_entry *proc_net;
    struct proc_dir_entry *proc_net_stat;
    struct ctl_table_set sysctls;
    struct sock *rtnl;
    struct sock *genl_sock;
    struct list_head dev_base_head;
    struct hlist_head *dev_name_head;
    struct hlist_head *dev_index_head;
    unsigned int dev_base_seq;
    struct list_head rules_ops;
    struct net_device *loopback_dev;
    struct netns_core core;
    struct netns_mib mib;
    struct netns_packet packet;
    struct netns_unix unx;
    struct netns_ipv4 ipv4;
    struct netns_ipv6 ipv6;
    struct netns_xt xt;
    struct netns_ct ct;
    struct sock *nfnl;
    struct sock *nfnl_stash;
    struct sk_buff_head wext_nlevents;
    struct net_generic *gen;
    struct netns_xfrm xfrm;
    struct netns_ipvs *ipvs;
}
SIZE: 1376

/*__sock_create创建了socket,调用对应net family的create 函数*/
crash> socket
struct socket {
    socket_state state;
    short type;
    unsigned long flags;
    struct socket_wq *wq;
    struct file *file;
    struct sock *sk;
    const struct proto_ops *ops;
}
SIZE: 28

int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     *    Allocate the socket and allow the family to set things up. if
     *    the protocol is 0, the family is instructed to select an appropriate
     *    default.
     */
    sock = sock_alloc();
    sock->type = type;
    pf = rcu_dereference(net_families[family]);
    err = pf->create(net, sock, protocol, kern);
    *res = sock;

    return 0;
}

crash> net_proto_family
struct net_proto_family {
    int family;
    int (*create)(struct net *, struct socket *, int, int);
    struct module *owner;
}

crash> net_families
net_families = $1 =
{0x0,
0xc0561d3c <unix_family_ops>,
0xc0560954 <inet_family_ops>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0xc0561eb0 <inet6_family_ops>,
0x0, 0x0, 0x0, 0x0,
0xc0563458 <pfkey_family_ops>,
0xc055f384 <netlink_family_ops>,
0xc05632e0 <packet_family_ops>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0xc0543358 <pppox_proto_family>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0xc0703c8c <bt_sock_family_ops>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0
}


关联 socket 与 file(文件对象)

/*
 * Attach 'sock' to a new file descriptor of the current process.
 *
 * sock_alloc_file() supplies both the descriptor number and the struct
 * file; the file is installed into the fd table only when a valid
 * (non-negative) descriptor was obtained. Returns that descriptor, or the
 * negative error from sock_alloc_file().
 */
int sock_map_fd(struct socket *sock, int flags)
{
    struct file *filp;
    int fd;

    fd = sock_alloc_file(sock, &filp, flags);
    if (likely(fd >= 0))
        fd_install(fd, filp);

    return fd;
}


/*
 *    Obtains the first available file descriptor and sets it up for use.
 *
 *    These functions create file structures and maps them to fd space
 *    of the current process. On success it returns file descriptor
 *    and file struct implicitly stored in sock->file.
 *    Note that another thread may close file descriptor before we return
 *    from this function. We use the fact that now we do not refer
 *    to socket after mapping. If one day we will need it, this
 *    function will increment ref. count on file by 1.
 *
 *    In any case returned fd MAY BE not valid!
 *    This race condition is unavoidable
 *    with shared fd spaces, we cannot solve it inside kernel,
 *    but we take care of internal coherence yet.
 */

static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
{
    struct qstr name = { .name = "" };
    struct path path;
    struct file *file;
    int fd;

    fd = get_unused_fd_flags(flags);

    path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);

    path.mnt = mntget(sock_mnt);

    d_instantiate(path.dentry, SOCK_INODE(sock));
    / *inode的 fops赋值为socket_file_ops*/
    SOCK_INODE(sock)->i_fop = &socket_file_ops;/*inode fops*/

    file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
          &socket_file_ops);


    sock->file = file;
    file->f_flags = O_RDWR | (flags & O_NONBLOCK);
    file->f_pos = 0;/*is NULL*/
    file->private_data = sock;

    *f = file;
    return fd;
}

/*
 *    Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *    in the operation structures but are done directly via the socketcall() multiplexor.
 *
 *    This table is what sock_alloc_file() installs both as the inode's i_fop
 *    and as the file operations of the struct file it creates. Note that
 *    only the aio_read/aio_write entry points are provided; there are no
 *    plain .read/.write members.
 */

static const struct file_operations socket_file_ops = {
    .owner =    THIS_MODULE,
    .llseek =    no_llseek,
    .aio_read =    sock_aio_read,
    .aio_write =    sock_aio_write,
    .poll =        sock_poll,
    .unlocked_ioctl = sock_ioctl,

    .mmap =        sock_mmap,
    .open =        sock_no_open,    /* special open code to disallow open via /proc */
    .release =    sock_close,
    .fasync =    sock_fasync,
    .sendpage =    sock_sendpage,
    .splice_write = generic_splice_sendpage,
    .splice_read =    sock_splice_read,
};

/*
 * Install a file pointer in the fd array.
 *
 * Stores 'file' in slot 'fd' of the current task's fd table while holding
 * the table's file_lock. The slot is expected to be empty; a non-NULL
 * entry trips BUG_ON().
 */
void fd_install(unsigned int fd, struct file *file)
{
    struct files_struct *cur_files = current->files;
    struct fdtable *table;

    spin_lock(&cur_files->file_lock);
    table = files_fdtable(cur_files);
    BUG_ON(table->fd[fd] != NULL);
    rcu_assign_pointer(table->fd[fd], file);
    spin_unlock(&cur_files->file_lock);
}

/************************************************************************************/

以#define AF_NETLINK    16 为例看socket的创建过程:

int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     *    Allocate the socket and allow the family to set things up. if
     *    the protocol is 0, the family is instructed to select an appropriate
     *    default.
     */
    sock = sock_alloc();
    sock->type = type;
    pf = rcu_dereference(net_families[family]);
    err = pf->create(net, sock, protocol, kern);
    *res = sock;

    return 0;
}

以 net_families[16],即 0xc055f384 <netlink_family_ops> 为例:
#define AF_NETLINK    16

crash> netlink_family_ops
netlink_family_ops = $11 = {
  family = 16,
  create = 0xc03f29b8 <netlink_create>,
  owner = 0x0
}


/*
 * create() hook of the AF_NETLINK family (see netlink_family_ops).
 *
 * Marks the socket unconnected, builds the netlink sock through
 * __netlink_create(), bumps the protocol in-use counter with bottom
 * halves disabled and records the owning module in the netlink sock.
 *
 * Returns 0 on success or the negative error of __netlink_create().
 */
static int netlink_create(struct net *net, struct socket *sock, int protocol,
              int kern)
{
    struct module *module = NULL;
    /*
     * Must start out NULL: __netlink_create() tests this pointer and
     * falls back to the socket's own cb_def_mutex when it is NULL.
     * NOTE(review): any per-protocol lookup that would supply a shared
     * mutex or an owning module is not part of this excerpt.
     */
    struct mutex *cb_mutex = NULL;
    struct netlink_sock *nlk;
    int err;

    sock->state = SS_UNCONNECTED;

    err = __netlink_create(net, sock, cb_mutex, protocol);
    if (err < 0) {
        /*
         * __netlink_create() bailed out before attaching a struct
         * sock, so there is nothing to account for and sock->sk must
         * not be touched.
         */
        return err;
    }

    local_bh_disable();
    sock_prot_inuse_add(net, &netlink_proto, 1);
    local_bh_enable();

    nlk = nlk_sk(sock->sk);
    nlk->module = module;

    return err;
}

static int __netlink_create(struct net *net, struct socket *sock,
                struct mutex *cb_mutex, int protocol)
{
    struct sock *sk;
    struct netlink_sock *nlk;

    sock->ops = &netlink_ops;

    sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
    if (!sk)
        return -ENOMEM;

    sock_init_data(sock, sk);

    nlk = nlk_sk(sk);
    if (cb_mutex)
        nlk->cb_mutex = cb_mutex;
    else {
        nlk->cb_mutex = &nlk->cb_def_mutex;
        mutex_init(nlk->cb_mutex);
    }
    init_waitqueue_head(&nlk->wait);

    sk->sk_destruct = netlink_sock_destruct;
    sk->sk_protocol = protocol;
    return 0;
}

/*
 * Socket-level operations of PF_NETLINK sockets; __netlink_create()
 * installs this table as sock->ops. Operations that netlink does not
 * implement (socketpair, accept, ioctl, listen, shutdown, mmap, sendpage)
 * point at the generic sock_no_* handlers.
 */
static const struct proto_ops netlink_ops = {
    .family =    PF_NETLINK,
    .owner =    THIS_MODULE,
    .release =    netlink_release,
    .bind =        netlink_bind,
    .connect =    netlink_connect,
    .socketpair =    sock_no_socketpair,
    .accept =    sock_no_accept,
    .getname =    netlink_getname,
    .poll =        datagram_poll,
    .ioctl =    sock_no_ioctl,
    .listen =    sock_no_listen,
    .shutdown =    sock_no_shutdown,
    .setsockopt =    netlink_setsockopt,
    .getsockopt =    netlink_getsockopt,
    .sendmsg =    netlink_sendmsg,
    .recvmsg =    netlink_recvmsg,
    .mmap =        sock_no_mmap,
    .sendpage =    sock_no_sendpage,

};

socket read system call


上图对应的代码流程

/*
 * read(2) entry point.
 *
 * Looks up the struct file behind 'fd', reads through vfs_read() starting
 * at the file's current position, and stores the updated position back.
 * Returns the result of vfs_read(), or -EBADF when 'fd' has no file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    int fput_needed;
    struct file *file = fget_light(fd, &fput_needed); /* fd -> file object */
    ssize_t ret;
    loff_t pos;

    if (!file)
        return -EBADF;

    pos = file_pos_read(file);    /* offset at which to start reading */
    ret = vfs_read(file, buf, count, &pos);
    file_pos_write(file, pos);
    fput_light(file, fput_needed);

    return ret;
}



/*
 * Read up to 'count' bytes from 'file' at '*pos' into the user buffer.
 *
 * Fails with -EBADF when the file is not open for reading, -EINVAL when
 * it offers neither a read nor an aio_read operation, and -EFAULT when
 * the user buffer does not pass access_ok(). After rw_verify_area()
 * succeeds, its return value replaces 'count'; the read goes through
 * f_op->read when present and through do_sync_read() otherwise.
 *
 * Returns the value of the read operation, or the negative result of
 * rw_verify_area().
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))
        return -EBADF;
    if (!file->f_op || !(file->f_op->read || file->f_op->aio_read))
        return -EINVAL;
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
        return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count);
    if (ret < 0)
        return ret;

    count = ret;
    ret = file->f_op->read ? file->f_op->read(file, buf, count, pos)
                   : do_sync_read(file, buf, count, pos);

    /* Notification and byte accounting only when data was read. */
    if (ret > 0) {
        fsnotify_access(file);
        add_rchar(current, ret);
    }
    inc_syscr(current);

    return ret;
}


/*
 * Synchronous read built on top of a file's aio_read operation.
 *
 * Wraps the user buffer in a one-element iovec and a synchronous kiocb,
 * calls f_op->aio_read again for as long as it answers -EIOCBRETRY, and
 * waits for completion when the request was queued (-EIOCBQUEUED).
 * '*ppos' is updated from the kiocb before returning the result.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    ssize_t ret;

    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;
    kiocb.ki_left = len;
    kiocb.ki_nbytes = len;

    ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
    while (ret == -EIOCBRETRY) {
        wait_on_retry_sync_kiocb(&kiocb);
        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
    }

    if (ret == -EIOCBQUEUED)
        ret = wait_on_sync_kiocb(&kiocb);

    *ppos = kiocb.ki_pos;
    return ret;
}

/*sockfs*/
.aio_read =    sock_aio_read,

/*
 * aio_read entry of socket_file_ops.
 *
 * A non-zero position is rejected with -ESPIPE and a request with
 * nothing left to read returns 0. Otherwise a sock_iocb is obtained via
 * alloc_sock_iocb() (-ENOMEM on failure) and the read is carried out by
 * do_sock_read() using the msghdr embedded in it.
 */
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
{
    struct sock_iocb on_stack;
    struct sock_iocb *si;

    if (pos)
        return -ESPIPE;

    /* Match SYS5 behaviour */
    if (!iocb->ki_left)
        return 0;

    si = alloc_sock_iocb(iocb, &on_stack);
    if (si == NULL)
        return -ENOMEM;

    return do_sock_read(&si->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
}

static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
        struct file *file, const struct iovec *iov,
        unsigned long nr_segs)
{
    struct socket *sock = file->private_data;
    size_t size = 0;
    int i;

    for (i = 0; i < nr_segs; i++)
        size += iov[i].iov_len;

    msg->msg_name = NULL;
    msg->msg_namelen = 0;
    msg->msg_control = NULL;
    msg->msg_controllen = 0;
    msg->msg_iov = (struct iovec *)iov;
    msg->msg_iovlen = nr_segs;
    msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;

    return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
}

/*
 * Receive with a security check: security_socket_recvmsg() runs first and
 * a non-zero result is returned as is; otherwise the call is forwarded to
 * __sock_recvmsg_nosec().
 */
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
                 struct msghdr *msg, size_t size, int flags)
{
    int err;

    err = security_socket_recvmsg(sock, msg, size, flags);
    if (err)
        return err;

    return __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
}

/*
 * Receive without the security hook: records the request parameters in
 * the sock_iocb that belongs to 'iocb' and dispatches to the recvmsg
 * operation in sock->ops.
 */
static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
                       struct msghdr *msg, size_t size, int flags)
{
    struct sock_iocb *siocb = kiocb_to_siocb(iocb);

    sock_update_classid(sock->sk);

    /* Snapshot of this request, kept alongside the kiocb. */
    siocb->sock = sock;
    siocb->msg = msg;
    siocb->size = size;
    siocb->flags = flags;
    siocb->scm = NULL;

    return sock->ops->recvmsg(iocb, sock, msg, size, flags);
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值