From: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 28 Aug 2008 19:07:35 -0300
Subject: [fs] anon_inodes implementation
Message-id: 20080828220735.GA17182@blackpad
O-Subject: [RHEL5.3 PATCH] anon_inodes implementation
Bugzilla: 459835
RH-Acked-by: Chris Wright <chrisw@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Glauber Costa <glommer@redhat.com>

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=459835

anon_inodes is necessary for KVM. KVM includes a compat module for
anon_inodes when compiling as an external module, but by including an
anon_inodes module closer to upstream instead of a compat module, we will
be able to remove the compat hacks from the KVM module and use a native
fs/anon_inodes.c implementation.

Using the compat anon_inodes.c as starting point, I did the following to
get an implementation as close as possible to upstream anon_inodes.c.

- Remove differences in the content of comments (mostly typos that are
  fixed upstream)
- Remove the Linux version #ifdefs
- Use a proper fs_initcall() init function instead of a hack to
  initialize it at KVM module load.
- Remove the igrab() call, like the code upstream, as anon_inode_inode
  will always have a non-zero reference count
- Fill the file->f_flags O_NONBLOCK flag according to the 'flags' arg,
  like upstream

The result is a fs/anon_inodes.c that is as close as possible to
upstream, except for a few differences, due to differences between RHEL5
kernel and upstream:

- Use get_empty_filp() because alloc_file() isn't available
- get_unused_fd() because get_unused_fd_flags() isn't available
- current->[fg]suid because current_[fg]suid() isn't available

For reference, below is the diff between upstream fs/anon_inodes.c and
the resulting file added by this patch.
> --- /home/ehabkost/code/kernel/linux-2.6/fs/anon_inodes.c	2008-08-19 15:36:40.000000000 -0300
> +++ fs/anon_inodes.c	2008-08-28 18:17:54.000000000 -0300
> @@ -78,10 +78,13 @@
>
>  	if (IS_ERR(anon_inode_inode))
>  		return -ENODEV;
> +	file = get_empty_filp();
> +	if (!file)
> +		return -ENFILE;
>
> -	error = get_unused_fd_flags(flags);
> +	error = get_unused_fd();
>  	if (error < 0)
> -		return error;
> +		goto err_put_filp;
>  	fd = error;
>
>  	/*
> @@ -108,15 +111,14 @@
>  	dentry->d_flags &= ~DCACHE_UNHASHED;
>  	d_instantiate(dentry, anon_inode_inode);
>
> -	error = -ENFILE;
> -	file = alloc_file(anon_inode_mnt, dentry,
> -			  FMODE_READ | FMODE_WRITE, fops);
> -	if (!file)
> -		goto err_dput;
> +	file->f_vfsmnt = mntget(anon_inode_mnt);
> +	file->f_dentry = dentry;
>  	file->f_mapping = anon_inode_inode->i_mapping;
>
>  	file->f_pos = 0;
>  	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
> +	file->f_op = (struct file_operations *)fops;
> +	file->f_mode = FMODE_READ | FMODE_WRITE;
>  	file->f_version = 0;
>  	file->private_data = priv;
>
> @@ -124,10 +126,10 @@
>
>  	return fd;
>
> -err_dput:
> -	dput(dentry);
>  err_put_unused_fd:
>  	put_unused_fd(fd);
> +err_put_filp:
> +	put_filp(file);	/* not fput(): f_dentry is still NULL here */
>  	return error;
>  }
>  EXPORT_SYMBOL_GPL(anon_inode_getfd);
> @@ -154,8 +156,8 @@
>  	 */
>  	inode->i_state = I_DIRTY;
>  	inode->i_mode = S_IRUSR | S_IWUSR;
> -	inode->i_uid = current_fsuid();
> -	inode->i_gid = current_fsgid();
> +	inode->i_uid = current->fsuid;
> +	inode->i_gid = current->fsgid;
>  	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
>  	return inode;
>  }

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>

diff --git a/fs/Makefile b/fs/Makefile
index e6b3fc3..6bc3688 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,6 +16,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
+obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
new file mode 100644
index 0000000..65ccc4c
--- /dev/null
+++ b/fs/anon_inodes.c
@@ -0,0 +1,194 @@
+/*
+ *  fs/anon_inodes.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ *  Thanks to Arnd Bergmann for code review and suggestions.
+ *  More changes for Thomas Gleixner suggestions.
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+
+static struct vfsmount *anon_inode_mnt __read_mostly;
+static struct inode *anon_inode_inode;
+static const struct file_operations anon_inode_fops;
+
+static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
+			     mnt);
+}
+
+static int anon_inodefs_delete_dentry(struct dentry *dentry)
+{
+	/*
+	 * We faked vfs to believe the dentry was hashed when we created it.
+	 * Now we restore the flag so that dput() will work correctly.
+	 */
+	dentry->d_flags |= DCACHE_UNHASHED;
+	return 1;
+}
+
+static struct file_system_type anon_inode_fs_type = {
+	.name		= "anon_inodefs",
+	.get_sb		= anon_inodefs_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+static struct dentry_operations anon_inodefs_dentry_operations = {
+	.d_delete	= anon_inodefs_delete_dentry,
+};
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
+ *                    anonymous inode, and a dentry that describe the "class"
+ *                    of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags (only O_NONBLOCK is honoured, copied to f_flags)
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.  Returns new descriptor or -error.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	struct qstr this;
+	struct dentry *dentry;
+	struct file *file;
+	int error, fd;
+
+	if (IS_ERR(anon_inode_inode))
+		return -ENODEV;
+	file = get_empty_filp();
+	if (!file)
+		return -ENFILE;
+
+	error = get_unused_fd();
+	if (error < 0)
+		goto err_put_filp;
+	fd = error;
+
+	/*
+	 * Link the inode to a directory entry by creating a unique name
+	 * using the inode sequence number.
+	 */
+	error = -ENOMEM;
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0;
+	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_put_unused_fd;
+
+	/*
+	 * We know the anon_inode inode count is always greater than zero,
+	 * so we can avoid doing an igrab() and we can use an open-coded
+	 * atomic_inc().
+	 */
+	atomic_inc(&anon_inode_inode->i_count);
+
+	dentry->d_op = &anon_inodefs_dentry_operations;
+	/* Do not publish this dentry inside the global dentry hash table */
+	dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(dentry, anon_inode_inode);
+
+	file->f_vfsmnt = mntget(anon_inode_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = anon_inode_inode->i_mapping;
+
+	file->f_pos = 0;
+	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
+	file->f_op = (struct file_operations *)fops;
+	file->f_mode = FMODE_READ | FMODE_WRITE;
+	file->f_version = 0;
+	file->private_data = priv;
+
+	fd_install(fd, file);
+
+	return fd;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+err_put_filp:
+	put_filp(file);	/* not fput(): f_dentry is still NULL here */
+	return error;
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfd);
+
+/*
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so we need
+ * only allocate one of them.
+ */
+static struct inode *anon_inode_mkinode(void)
+{
+	struct inode *inode = new_inode(anon_inode_mnt->mnt_sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_fop = &anon_inode_fops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because mark_inode_dirty() will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+
+static int __init anon_inode_init(void)
+{
+	int error;
+
+	error = register_filesystem(&anon_inode_fs_type);
+	if (error)
+		goto err_exit;
+	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
+	if (IS_ERR(anon_inode_mnt)) {
+		error = PTR_ERR(anon_inode_mnt);
+		goto err_unregister_filesystem;
+	}
+	anon_inode_inode = anon_inode_mkinode();
+	if (IS_ERR(anon_inode_inode)) {
+		error = PTR_ERR(anon_inode_inode);
+		goto err_mntput;
+	}
+
+	return 0;
+
+err_mntput:
+	mntput(anon_inode_mnt);
+err_unregister_filesystem:
+	unregister_filesystem(&anon_inode_fs_type);
+err_exit:
+	panic(KERN_ERR "anon_inode_init() failed (%d)\n", error);
+}
+
+fs_initcall(anon_inode_init);
+
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
new file mode 100644
index 0000000..1595b9d
--- /dev/null
+++ b/include/linux/anon_inodes.h
@@ -0,0 +1,16 @@
+/*
+ *  include/linux/anon_inodes.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_ANON_INODES_H
+#define _LINUX_ANON_INODES_H
+
+struct file_operations;
+
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags);
+
+#endif /* _LINUX_ANON_INODES_H */
diff --git a/include/linux/magic.h b/include/linux/magic.h
new file mode 100644
index 0000000..74e68e2
--- /dev/null
+++ b/include/linux/magic.h
@@ -0,0 +1,46 @@
+#ifndef __LINUX_MAGIC_H__
+#define __LINUX_MAGIC_H__
+
+#define ADFS_SUPER_MAGIC	0xadf5
+#define AFFS_SUPER_MAGIC	0xadff
+#define AFS_SUPER_MAGIC		0x5346414F
+#define AUTOFS_SUPER_MAGIC	0x0187
+#define CODA_SUPER_MAGIC	0x73757245
+#define EFS_SUPER_MAGIC		0x414A53
+#define EXT2_SUPER_MAGIC	0xEF53
+#define EXT3_SUPER_MAGIC	0xEF53
+#define EXT4_SUPER_MAGIC	0xEF53
+#define HPFS_SUPER_MAGIC	0xf995e849
+#define ISOFS_SUPER_MAGIC	0x9660
+#define JFFS2_SUPER_MAGIC	0x72b6
+#define ANON_INODE_FS_MAGIC	0x09041934
+
+#define MINIX_SUPER_MAGIC	0x137F		/* original minix fs */
+#define MINIX_SUPER_MAGIC2	0x138F		/* minix fs, 30 char names */
+#define MINIX2_SUPER_MAGIC	0x2468		/* minix V2 fs */
+#define MINIX2_SUPER_MAGIC2	0x2478		/* minix V2 fs, 30 char names */
+#define MINIX3_SUPER_MAGIC	0x4d5a		/* minix V3 fs */
+
+#define MSDOS_SUPER_MAGIC	0x4d44		/* MD */
+#define NCP_SUPER_MAGIC		0x564c		/* Guess, what 0x564c is :-) */
+#define NFS_SUPER_MAGIC		0x6969
+#define OPENPROM_SUPER_MAGIC	0x9fa1
+#define PROC_SUPER_MAGIC	0x9fa0
+#define QNX4_SUPER_MAGIC	0x002f		/* qnx4 fs detection */
+
+#define REISERFS_SUPER_MAGIC	0x52654973	/* used by gcc */
+					/* used by file system utilities that
+					   look at the superblock, etc.  */
+#define REISERFS_SUPER_MAGIC_STRING	"ReIsErFs"
+#define REISER2FS_SUPER_MAGIC_STRING	"ReIsEr2Fs"
+#define REISER2FS_JR_SUPER_MAGIC_STRING	"ReIsEr3Fs"
+
+#define SMB_SUPER_MAGIC		0x517B
+#define USBDEVICE_SUPER_MAGIC	0x9fa2
+#define CGROUP_SUPER_MAGIC	0x27e0eb
+
+#define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
+#define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
+
+#define STACK_END_MAGIC		0x57AC6E9D
+#endif /* __LINUX_MAGIC_H__ */
diff --git a/init/Kconfig b/init/Kconfig
index af5842b..cac65e2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -373,6 +373,10 @@ config FUTEX
 	  support for "fast userspace mutexes".  The resulting kernel may not
 	  run glibc-based applications correctly.
 
+config ANON_INODES
+	bool
+	default y
+
 config EPOLL
 	bool "Enable eventpoll support" if EMBEDDED
 	default y