From: Eric Sandeen <sandeen@redhat.com> Date: Wed, 25 Mar 2009 15:52:54 -0500 Subject: [fs] xfs: update to 2.6.28.6 codebase Message-id: 49CA99A6.3090403@redhat.com O-Subject: [PATCH 6/10] update fs/xfs to 2.6.28.6 codebase Bugzilla: 470845 RH-Acked-by: Josef Bacik <josef@redhat.com> Update fs/xfs to 2.6.28.6's original codebase. This won't build on rhel w/o the subsequent patches. (attachment this time; big patch) Update fs/xfs to 2.6.28.6's original codebase. This won't build on rhel w/o the subsequent patches. diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild deleted file mode 100644 index 2566e96..0000000 --- a/fs/xfs/Kbuild +++ /dev/null @@ -1,6 +0,0 @@ -# -# The xfs people like to share Makefile with 2.6 and 2.4. -# Utilise file named Kbuild file which has precedence over Makefile. -# - -include $(srctree)/$(obj)/Makefile-linux-2.6 diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 26b364c..3f53dd1 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,5 +1,6 @@ config XFS_FS tristate "XFS filesystem support" + depends on BLOCK help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can @@ -34,18 +35,6 @@ config XFS_QUOTA with or without the generic quota support enabled (CONFIG_QUOTA) - they are completely independent subsystems. -config XFS_SECURITY - bool "XFS Security Label support" - depends on XFS_FS - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute namespace for inode security - labels in the XFS filesystem. - - If you are not using a security module that requires using - extended attributes for inode security labels, say N. - config XFS_POSIX_ACL bool "XFS POSIX ACL support" depends on XFS_FS @@ -75,3 +64,16 @@ config XFS_RT See the xfs man page in section 5 for additional information. If unsure, say N. + +config XFS_DEBUG + bool "XFS Debugging support (EXPERIMENTAL)" + depends on XFS_FS && EXPERIMENTAL + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 49e3e7e..737c9a4 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -1 +1,118 @@ -include $(TOPDIR)/fs/xfs/Makefile-linux-$(VERSION).$(PATCHLEVEL) +# +# Copyright (c) 2000-2005 Silicon Graphics, Inc. +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# + +EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char + +XFS_LINUX := linux-2.6 + +ifeq ($(CONFIG_XFS_DEBUG),y) + EXTRA_CFLAGS += -g +endif + +obj-$(CONFIG_XFS_FS) += xfs.o + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ + xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o) + +ifeq ($(CONFIG_XFS_QUOTA),y) +xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o +endif + +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o +xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o +xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o + + +xfs-y += xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_bit.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_buf_item.o \ + xfs_da_btree.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_error.o \ + xfs_extfree_item.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_iget.o \ + xfs_inode.o \ + xfs_inode_item.o \ + xfs_iomap.o \ + xfs_itable.o \ + xfs_dfrag.o \ + xfs_log.o \ + xfs_log_recover.o \ + xfs_mount.o \ + xfs_mru_cache.o \ + xfs_rename.o \ + xfs_trans.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + xfs_trans_item.o \ + xfs_utils.o \ + xfs_vfsops.o \ + xfs_vnodeops.o \ + xfs_rw.o \ + xfs_dmops.o \ + xfs_qmops.o + +xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o + +# Objects in linux/ +xfs-y += $(addprefix $(XFS_LINUX)/, \ + kmem.o \ + xfs_aops.o \ + xfs_buf.o \ + xfs_export.o \ + xfs_file.o \ + xfs_fs_subr.o \ + xfs_globals.o \ + xfs_ioctl.o \ + xfs_iops.o \ + xfs_lrw.o \ + xfs_super.o \ + xfs_vnode.o \ + xfs_xattr.o) + +# Objects in support/ +xfs-y += $(addprefix support/, \ + debug.o \ + uuid.o) + +xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o + diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6 deleted file mode 100644 index 9e7f859..0000000 --- a/fs/xfs/Makefile-linux-2.6 +++ /dev/null @@ -1,135 +0,0 @@ -# -# Copyright (c) 2000-2005 Silicon Graphics, Inc. -# All Rights Reserved. -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it would be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write the Free Software Foundation, -# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# - -EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char - -XFS_LINUX := linux-2.6 - -ifeq ($(CONFIG_XFS_DEBUG),y) - EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG - EXTRA_CFLAGS += -DXFS_BUF_LOCK_TRACKING -endif -ifeq ($(CONFIG_XFS_TRACE),y) - EXTRA_CFLAGS += -DXFS_ALLOC_TRACE - EXTRA_CFLAGS += -DXFS_ATTR_TRACE - EXTRA_CFLAGS += -DXFS_BLI_TRACE - EXTRA_CFLAGS += -DXFS_BMAP_TRACE - EXTRA_CFLAGS += -DXFS_BMBT_TRACE - EXTRA_CFLAGS += -DXFS_DIR_TRACE - EXTRA_CFLAGS += -DXFS_DIR2_TRACE - EXTRA_CFLAGS += -DXFS_DQUOT_TRACE - EXTRA_CFLAGS += -DXFS_ILOCK_TRACE - EXTRA_CFLAGS += -DXFS_LOG_TRACE - EXTRA_CFLAGS += -DXFS_RW_TRACE - EXTRA_CFLAGS += -DXFS_BUF_TRACE - EXTRA_CFLAGS += -DXFS_VNODE_TRACE -endif - -obj-$(CONFIG_XFS_FS) += xfs.o - -xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ - xfs_dquot.o \ - xfs_dquot_item.o \ - xfs_trans_dquot.o \ - xfs_qm_syscalls.o \ - xfs_qm_bhv.o \ - xfs_qm.o) - -ifeq ($(CONFIG_XFS_QUOTA),y) -xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o -endif - -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o -xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o -xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o -xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o -xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o - - -xfs-y += xfs_alloc.o \ - xfs_alloc_btree.o \ - xfs_attr.o \ - xfs_attr_leaf.o \ - xfs_behavior.o \ - xfs_bit.o \ - xfs_bmap.o \ - xfs_bmap_btree.o \ - xfs_btree.o \ - xfs_buf_item.o \ - xfs_da_btree.o \ - xfs_dir2.o \ - xfs_dir2_block.o \ - xfs_dir2_data.o \ - xfs_dir2_leaf.o \ - xfs_dir2_node.o \ - xfs_dir2_sf.o \ - xfs_error.o \ - xfs_extfree_item.o \ - xfs_fsops.o \ - xfs_ialloc.o \ - xfs_ialloc_btree.o \ - xfs_iget.o \ - xfs_inode.o \ - xfs_inode_item.o \ - xfs_iocore.o \ - xfs_iomap.o \ - xfs_itable.o \ - xfs_dfrag.o \ - xfs_log.o \ - xfs_log_recover.o \ - xfs_mount.o \ - xfs_rename.o \ - xfs_trans.o \ - xfs_trans_ail.o \ - xfs_trans_buf.o \ - xfs_trans_extfree.o \ - xfs_trans_inode.o \ - xfs_trans_item.o \ - xfs_utils.o \ - xfs_vfsops.o \ - xfs_vnodeops.o \ - xfs_rw.o \ - xfs_dmops.o \ - xfs_qmops.o - -xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o - -# Objects in linux/ -xfs-y += $(addprefix $(XFS_LINUX)/, \ - kmem.o \ - xfs_aops.o \ - xfs_buf.o \ - xfs_export.o \ - xfs_file.o \ - xfs_fs_subr.o \ - xfs_globals.o \ - xfs_ioctl.o \ - xfs_iops.o \ - xfs_lrw.o \ - xfs_super.o \ - xfs_vfs.o \ - xfs_vnode.o) - -# Objects in support/ -xfs-y += $(addprefix support/, \ - debug.o \ - move.o \ - uuid.o) - -xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o - diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 6ba9684..1cd3b55 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -15,12 +15,12 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <linux/sched.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include "time.h" #include "kmem.h" @@ -34,6 +34,14 @@ kmem_alloc(size_t size, unsigned int __nocast flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; +#ifdef DEBUG + if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { + printk(KERN_WARNING "Large %s attempt, size=%ld\n", + __func__, (long)size); + dump_stack(); + } +#endif + do { if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) ptr = kmalloc(size, lflags); @@ -44,8 +52,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags) if (!(++retries % 100)) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, lflags); - blk_congestion_wait(WRITE, HZ/50); + __func__, lflags); + congestion_wait(WRITE, HZ/50); } while (1); } @@ -60,8 +68,29 @@ kmem_zalloc(size_t size, unsigned int __nocast flags) return ptr; } +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, + unsigned int __nocast flags) +{ + void *ptr; + size_t kmsize = maxsize; + unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP; + + while (!(ptr = kmem_zalloc(kmsize, kmflags))) { + if ((kmsize <= minsize) && (flags & KM_NOSLEEP)) + break; + if ((kmsize >>= 1) <= minsize) { + kmsize = minsize; + kmflags = flags; + } + } + if (ptr) + *size = kmsize; + return ptr; +} + void -kmem_free(void *ptr, size_t size) +kmem_free(const void *ptr) { if (!is_vmalloc_addr(ptr)) { kfree(ptr); @@ -71,7 +100,7 @@ kmem_free(void *ptr, size_t size) } void * -kmem_realloc(void *ptr, size_t newsize, size_t oldsize, +kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, unsigned int __nocast flags) { void *new; @@ -81,7 +110,7 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize, if (new) memcpy(new, ptr, ((oldsize < newsize) ? oldsize : newsize)); - kmem_free(ptr, oldsize); + kmem_free(ptr); } return new; } @@ -100,8 +129,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) if (!(++retries % 100)) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, lflags); - blk_congestion_wait(WRITE, HZ/50); + __func__, lflags); + congestion_wait(WRITE, HZ/50); } while (1); } diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 939bd84..af6843c 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -30,6 +30,7 @@ #define KM_NOSLEEP 0x0002u #define KM_NOFS 0x0004u #define KM_MAYFAIL 0x0008u +#define KM_LARGE 0x0010u /* * We use a special process flag to avoid recursive callbacks into @@ -41,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -54,9 +55,10 @@ kmem_flags_convert(unsigned int __nocast flags) } extern void *kmem_alloc(size_t, unsigned int __nocast); -extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast); extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void kmem_free(void *, size_t); +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); +extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); +extern void kmem_free(const void *); /* * Zone interfaces @@ -72,14 +74,14 @@ extern void kmem_free(void *, size_t); static inline kmem_zone_t * kmem_zone_init(int size, char *zone_name) { - return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); + return kmem_cache_create(zone_name, size, 0, 0, NULL); } static inline kmem_zone_t * kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, - void (*construct)(void *, kmem_zone_t *, unsigned long)) + void (*construct)(void *)) { - return kmem_cache_create(zone_name, size, 0, flags, construct, NULL); + return kmem_cache_create(zone_name, size, 0, flags, construct); } static inline void @@ -91,36 +93,17 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr) static inline void kmem_zone_destroy(kmem_zone_t *zone) { - if (zone && kmem_cache_destroy(zone)) - BUG(); + if (zone) + kmem_cache_destroy(zone); } extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); -/* - * Low memory cache shrinkers - */ - -typedef struct shrinker *kmem_shaker_t; -typedef int (*kmem_shake_func_t)(int, gfp_t); - -static inline kmem_shaker_t -kmem_shake_register(kmem_shake_func_t sfunc) -{ - return set_shrinker(DEFAULT_SEEKS, sfunc); -} - -static inline void -kmem_shake_deregister(kmem_shaker_t shrinker) -{ - remove_shrinker(shrinker); -} - static inline int kmem_shake_allow(gfp_t gfp_mask) { - return (gfp_mask & __GFP_WAIT); + return (gfp_mask & __GFP_WAIT) != 0; } #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index 32e1ce0..ff6a198 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -20,29 +20,35 @@ #include <linux/rwsem.h> -enum { MR_NONE, MR_ACCESS, MR_UPDATE }; - typedef struct { struct rw_semaphore mr_lock; +#ifdef DEBUG int mr_writer; +#endif } mrlock_t; +#ifdef DEBUG #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) -#define mraccess(mrp) mraccessf(mrp, 0) -#define mrupdate(mrp) mrupdatef(mrp, 0) -static inline void mraccessf(mrlock_t *mrp, int flags) +static inline void mraccess_nested(mrlock_t *mrp, int subclass) { - down_read(&mrp->mr_lock); + down_read_nested(&mrp->mr_lock, subclass); } -static inline void mrupdatef(mrlock_t *mrp, int flags) +static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { - down_write(&mrp->mr_lock); + down_write_nested(&mrp->mr_lock, subclass); +#ifdef DEBUG mrp->mr_writer = 1; +#endif } static inline int mrtryaccess(mrlock_t *mrp) @@ -54,39 +60,31 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; +#ifdef DEBUG mrp->mr_writer = 1; +#endif return 1; } -static inline void mrunlock(mrlock_t *mrp) +static inline void mrunlock_excl(mrlock_t *mrp) { - if (mrp->mr_writer) { - mrp->mr_writer = 0; - up_write(&mrp->mr_lock); - } else { - up_read(&mrp->mr_lock); - } +#ifdef DEBUG + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); } -static inline void mrdemote(mrlock_t *mrp) +static inline void mrunlock_shared(mrlock_t *mrp) { - mrp->mr_writer = 0; - downgrade_write(&mrp->mr_lock); + up_read(&mrp->mr_lock); } -#ifdef DEBUG -/* - * Debug-only routine, without some platform-specific asm code, we can - * now only answer requests regarding whether we hold the lock for write - * (reader state is outside our visibility, we only track writer state). - * Note: means !ismrlocked would give false positives, so don't do that. - */ -static inline int ismrlocked(mrlock_t *mrp, int type) +static inline void mrdemote(mrlock_t *mrp) { - if (mrp && type == MR_UPDATE) - return mrp->mr_writer; - return 1; -} +#ifdef DEBUG + mrp->mr_writer = 0; #endif + downgrade_write(&mrp->mr_lock); +} #endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h deleted file mode 100644 index b250900..0000000 --- a/fs/xfs/linux-2.6/sema.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SEMA_H__ -#define __XFS_SUPPORT_SEMA_H__ - -#include <linux/time.h> -#include <linux/wait.h> -#include <asm/atomic.h> -#include <asm/semaphore.h> - -/* - * sema_t structure just maps to struct semaphore in Linux kernel. - */ - -typedef struct semaphore sema_t; - -#define init_sema(sp, val, c, d) sema_init(sp, val) -#define initsema(sp, val) sema_init(sp, val) -#define initnsema(sp, val, name) sema_init(sp, val) -#define psema(sp, b) down(sp) -#define vsema(sp) up(sp) -#define freesema(sema) do { } while (0) - -static inline int issemalocked(sema_t *sp) -{ - return down_trylock(sp) || (up(sp), 0); -} - -/* - * Map cpsema (try to get the sema) to down_trylock. We need to switch - * the return values since cpsema returns 1 (acquired) 0 (failed) and - * down_trylock returns the reverse 0 (acquired) 1 (failed). - */ -static inline int cpsema(sema_t *sp) -{ - return down_trylock(sp) ? 0 : 1; -} - -#endif /* __XFS_SUPPORT_SEMA_H__ */ diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h deleted file mode 100644 index 50a6191..0000000 --- a/fs/xfs/linux-2.6/spin.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SPIN_H__ -#define __XFS_SUPPORT_SPIN_H__ - -#include <linux/sched.h> /* preempt needs this */ -#include <linux/spinlock.h> - -/* - * Map lock_t from IRIX to Linux spinlocks. - * - * We do not make use of lock_t from interrupt context, so we do not - * have to worry about disabling interrupts at all (unlike IRIX). - */ - -typedef spinlock_t lock_t; - -#define SPLDECL(s) unsigned long s -#ifndef DEFINE_SPINLOCK -#define DEFINE_SPINLOCK(s) spinlock_t s = SPIN_LOCK_UNLOCKED -#endif - -#define spinlock_init(lock, name) spin_lock_init(lock) -#define spinlock_destroy(lock) -#define mutex_spinlock(lock) ({ spin_lock(lock); 0; }) -#define mutex_spinunlock(lock, s) do { spin_unlock(lock); (void)s; } while (0) -#define nested_spinlock(lock) spin_lock(lock) -#define nested_spinunlock(lock) spin_unlock(lock) - -#endif /* __XFS_SUPPORT_SPIN_H__ */ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h index 9a8ad48..351a8f4 100644 --- a/fs/xfs/linux-2.6/sv.h +++ b/fs/xfs/linux-2.6/sv.h @@ -53,8 +53,6 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, remove_wait_queue(&sv->waiters, &wait); } -#define init_sv(sv,type,name,flag) \ - init_waitqueue_head(&(sv)->waiters) #define sv_init(sv,flag,name) \ init_waitqueue_head(&(sv)->waiters) #define sv_destroy(sv) \ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index f1c1a88..a44d68e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -37,6 +37,7 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_iomap.h" +#include "xfs_vnodeops.h" #include <linux/mpage.h> #include <linux/pagevec.h> #include <linux/writeback.h> @@ -56,8 +57,6 @@ xfs_count_page_state( do { if (buffer_uptodate(bh) && !buffer_mapped(bh)) (*unmapped) = 1; - else if (buffer_unwritten(bh) && !buffer_delay(bh)) - clear_buffer_unwritten(bh); else if (buffer_unwritten(bh)) (*unwritten) = 1; else if (buffer_delay(bh)) @@ -71,10 +70,9 @@ xfs_page_trace( int tag, struct inode *inode, struct page *page, - int mask) + unsigned long pgoff) { xfs_inode_t *ip; - bhv_vnode_t *vp = vn_from_inode(inode); loff_t isize = i_size_read(inode); loff_t offset = page_offset(page); int delalloc = -1, unmapped = -1, unwritten = -1; @@ -82,7 +80,7 @@ xfs_page_trace( if (page_has_buffers(page)) xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - ip = xfs_vtoi(vp); + ip = XFS_I(inode); if (!ip->i_rwtrace) return; @@ -91,7 +89,7 @@ xfs_page_trace( (void *)ip, (void *)inode, (void *)page, - (void *)((unsigned long)mask), + (void *)pgoff, (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), (void *)((unsigned long)((isize >> 32) & 0xffffffff)), @@ -105,19 +103,36 @@ xfs_page_trace( (void *)NULL); } #else -#define xfs_page_trace(tag, inode, page, mask) +#define xfs_page_trace(tag, inode, page, pgoff) #endif +STATIC struct block_device * +xfs_find_bdev_for_inode( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_bdev; + else + return mp->m_ddev_targp->bt_bdev; +} + /* * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. */ STATIC void xfs_finish_ioend( - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int wait) { - if (atomic_dec_and_test(&ioend->io_remaining)) + if (atomic_dec_and_test(&ioend->io_remaining)) { queue_work(xfsdatad_workqueue, &ioend->io_work); + if (wait) + flush_workqueue(xfsdatad_workqueue); + } } /* @@ -136,23 +151,63 @@ xfs_destroy_ioend( next = bh->b_private; bh->b_end_io(bh, !ioend->io_error); } - if (unlikely(ioend->io_error)) - vn_ioerror(ioend->io_vnode, ioend->io_error, __FILE__,__LINE__); - vn_iowake(ioend->io_vnode); + if (unlikely(ioend->io_error)) { + vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error, + __FILE__,__LINE__); + } + vn_iowake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } /* + * Update on-disk file size now that data has been written to disk. + * The current in-memory file size is i_size. If a write is beyond + * eof i_new_size will be the intended file size until i_size is + * updated. If this write does not extend all the way to the valid + * file size then restrict this update to the end of the write. + */ +STATIC void +xfs_setfilesize( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + ASSERT(ioend->io_type != IOMAP_READ); + + if (unlikely(ioend->io_error)) + return; + + bsize = ioend->io_offset + ioend->io_size; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + + if (ip->i_d.di_size < isize) { + ip->i_d.di_size = isize; + ip->i_update_core = 1; + ip->i_update_size = 1; + mark_inode_dirty_sync(ioend->io_inode); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + +/* * Buffered IO write completion for delayed allocate extents. - * TODO: Update ondisk isize now that we know the file data - * has been flushed (i.e. the notorious "NULL file" problem). */ STATIC void xfs_end_bio_delalloc( - void *data) + struct work_struct *work) { - xfs_ioend_t *ioend = data; + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); + xfs_setfilesize(ioend); xfs_destroy_ioend(ioend); } @@ -161,10 +216,12 @@ xfs_end_bio_delalloc( */ STATIC void xfs_end_bio_written( - void *data) + struct work_struct *work) { - xfs_ioend_t *ioend = data; + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); + xfs_setfilesize(ioend); xfs_destroy_ioend(ioend); } @@ -176,15 +233,36 @@ xfs_end_bio_written( */ STATIC void xfs_end_bio_unwritten( - void *data) + struct work_struct *work) { - xfs_ioend_t *ioend = data; - bhv_vnode_t *vp = ioend->io_vnode; + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - if (likely(!ioend->io_error)) - bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL); + if (likely(!ioend->io_error)) { + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + int error; + error = xfs_iomap_write_unwritten(ip, offset, size); + if (error) + ioend->io_error = error; + } + xfs_setfilesize(ioend); + } + xfs_destroy_ioend(ioend); +} + +/* + * IO read completion for regular, written extents. + */ +STATIC void +xfs_end_bio_read( + struct work_struct *work) +{ + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); + xfs_destroy_ioend(ioend); } @@ -212,19 +290,21 @@ xfs_alloc_ioend( ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; - ioend->io_vnode = vn_from_inode(inode); + ioend->io_inode = inode; ioend->io_buffer_head = NULL; ioend->io_buffer_tail = NULL; - atomic_inc(&ioend->io_vnode->v_iocount); + atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; if (type == IOMAP_UNWRITTEN) - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); + INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten); else if (type == IOMAP_DELAY) - INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend); + INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc); + else if (type == IOMAP_READ) + INIT_WORK(&ioend->io_work, xfs_end_bio_read); else - INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend); + INIT_WORK(&ioend->io_work, xfs_end_bio_written); return ioend; } @@ -237,16 +317,17 @@ xfs_map_blocks( xfs_iomap_t *mapp, int flags) { - bhv_vnode_t *vp = vn_from_inode(inode); + xfs_inode_t *ip = XFS_I(inode); int error, nmaps = 1; - error = bhv_vop_bmap(vp, offset, count, flags, mapp, &nmaps); + error = xfs_iomap(ip, offset, count, + flags, mapp, &nmaps); if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE))) - VMODIFY(vp); + xfs_iflags_set(ip, XFS_IMODIFIED); return -error; } -STATIC inline int +STATIC_INLINE int xfs_iomap_valid( xfs_iomap_t *iomapp, loff_t offset) @@ -258,17 +339,13 @@ xfs_iomap_valid( /* * BIO completion handler for buffered IO. */ -STATIC int +STATIC void xfs_end_bio( struct bio *bio, - unsigned int bytes_done, int error) { xfs_ioend_t *ioend = bio->bi_private; - if (bio->bi_size) - return 1; - ASSERT(atomic_read(&bio->bi_cnt) >= 1); ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; @@ -277,8 +354,7 @@ xfs_end_bio( bio->bi_end_io = NULL; bio_put(bio); - xfs_finish_ioend(ioend); - return 0; + xfs_finish_ioend(ioend, 0); } STATIC void @@ -332,20 +408,18 @@ xfs_start_buffer_writeback( STATIC void xfs_start_page_writeback( struct page *page, - struct writeback_control *wbc, int clear_dirty, int buffers) { ASSERT(PageLocked(page)); ASSERT(!PageWriteback(page)); - set_page_writeback(page); if (clear_dirty) - clear_page_dirty(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); unlock_page(page); - if (!buffers) { + /* If no buffers on the page are to be written, finish it here */ + if (!buffers) end_page_writeback(page); - wbc->pages_skipped++; /* We didn't write this page */ - } } static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) @@ -413,7 +487,7 @@ xfs_submit_ioend( } if (bio) xfs_submit_ioend_bio(ioend, bio); - xfs_finish_ioend(ioend); + xfs_finish_ioend(ioend, 0); } while ((ioend = next) != NULL); } @@ -438,7 +512,7 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - vn_iowake(ioend->io_vnode); + vn_iowake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } @@ -588,7 +662,7 @@ xfs_probe_cluster( for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - size_t pg_offset, len = 0; + size_t pg_offset, pg_len = 0; if (tindex == tlast) { pg_offset = @@ -600,17 +674,17 @@ xfs_probe_cluster( } else pg_offset = PAGE_CACHE_SIZE; - if (page->index == tindex && !TestSetPageLocked(page)) { - len = xfs_probe_page(page, pg_offset, mapped); + if (page->index == tindex && trylock_page(page)) { + pg_len = xfs_probe_page(page, pg_offset, mapped); unlock_page(page); } - if (!len) { + if (!pg_len) { done = 1; break; } - total += len; + total += pg_len; tindex++; } @@ -644,7 +718,7 @@ xfs_is_delayed_page( else if (buffer_delay(bh)) acceptable = (type == IOMAP_DELAY); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == 0); + acceptable = (type == IOMAP_NEW); else break; } while ((bh = bh->b_this_page) != head); @@ -684,7 +758,7 @@ xfs_convert_page( if (page->index != tindex) goto fail; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto fail; if (PageWriteback(page)) goto fail_unlock_page; @@ -753,7 +827,7 @@ xfs_convert_page( page_dirty--; count++; } else { - type = 0; + type = IOMAP_NEW; if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, @@ -782,7 +856,7 @@ xfs_convert_page( done = 1; } } - xfs_start_page_writeback(page, wbc, !page_dirty, count); + xfs_start_page_writeback(page, !page_dirty, count); } return done; @@ -911,8 +985,8 @@ xfs_page_state_convert( bh = head = page_buffers(page); offset = page_offset(page); - flags = -1; - type = 0; + flags = BMAPI_READ; + type = IOMAP_NEW; /* TODO: cleanup count and page_dirty */ @@ -942,14 +1016,16 @@ xfs_page_state_convert( * * Third case, an unmapped buffer was found, and we are * in a path where we need to write the whole page out. - */ + */ if (buffer_unwritten(bh) || buffer_delay(bh) || ((buffer_uptodate(bh) || PageUptodate(page)) && !buffer_mapped(bh) && (unmapped || startio))) { - /* + int new_ioend = 0; + + /* * Make sure we don't use a read-only iomap */ - if (flags == BMAPI_READ) + if (flags == BMAPI_READ) iomap_valid = 0; if (buffer_unwritten(bh)) { @@ -964,6 +1040,15 @@ xfs_page_state_convert( } if (!iomap_valid) { + /* + * if we didn't have a valid mapping then we + * need to ensure that we put the new mapping + * in a new ioend structure. This needs to be + * done to ensure that the ioends correctly + * reflect the block mappings at io completion + * for unwritten extent conversion. + */ + new_ioend = 1; if (type == IOMAP_NEW) { size = xfs_probe_cluster(inode, page, bh, head, 0); @@ -983,7 +1068,7 @@ xfs_page_state_convert( if (startio) { xfs_add_to_ioend(inode, bh, offset, type, &ioend, - !iomap_valid); + new_ioend); } else { set_buffer_dirty(bh); unlock_buffer(bh); @@ -998,7 +1083,7 @@ xfs_page_state_convert( * That means it must already have extents allocated * underneath it. Map the extent by reading it. */ - if (!iomap_valid || type != 0) { + if (!iomap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; size = xfs_probe_cluster(inode, page, bh, head, 1); @@ -1009,8 +1094,16 @@ xfs_page_state_convert( iomap_valid = xfs_iomap_valid(&iomap, offset); } - type = 0; - if (!test_and_set_bit(BH_Lock, &bh->b_state)) { + /* + * We set the type to IOMAP_NEW in case we are doing a + * small write at EOF that is extending the file but + * without needing an allocation. We need to update the + * file size on I/O completion in this case so it is + * the same case as having just allocated a new extent + * that we are writing into for the first time. + */ + type = IOMAP_NEW; + if (trylock_buffer(bh)) { ASSERT(buffer_mapped(bh)); if (iomap_valid) all_bh = 1; @@ -1035,7 +1128,7 @@ xfs_page_state_convert( SetPageUptodate(page); if (startio) - xfs_start_page_writeback(page, wbc, 1, count); + xfs_start_page_writeback(page, 1, count); if (ioend && iomap_valid) { offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> @@ -1159,10 +1252,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct bhv_vnode *vp = vn_from_inode(mapping->host); - - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); return generic_writepages(mapping, wbc); } @@ -1197,7 +1287,7 @@ xfs_vm_releasepage( .nr_to_write = 1, }; - xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask); + xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0); if (!page_has_buffers(page)) return 0; @@ -1239,7 +1329,6 @@ __xfs_get_blocks( int direct, bmapi_flags_t flags) { - bhv_vnode_t *vp = vn_from_inode(inode); xfs_iomap_t iomap; xfs_off_t offset; ssize_t size; @@ -1249,7 +1338,11 @@ __xfs_get_blocks( offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - error = bhv_vop_bmap(vp, offset, size, + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + + error = xfs_iomap(XFS_I(inode), offset, size, create ? flags : BMAPI_READ, &iomap, &niomap); if (error) return -error; @@ -1269,7 +1362,6 @@ __xfs_get_blocks( if (direct) bh_result->b_private = inode; set_buffer_unwritten(bh_result); - set_buffer_delay(bh_result); } } @@ -1280,13 +1372,18 @@ __xfs_get_blocks( bh_result->b_bdev = iomap.iomap_target->bt_bdev; /* - * If we previously allocated a block out beyond eof and we are - * now coming back to use it then we will need to flag it as new - * even if it has a disk address. + * If we previously allocated a block out beyond eof and we are now + * coming back to use it then we will need to flag it as new even if it + * has a disk address. + * + * With sub-block writes into unwritten extents we also need to mark + * the buffer as new so that the unwritten parts of the buffer gets + * correctly zeroed. */ if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) + (offset >= i_size_read(inode)) || + (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) set_buffer_new(bh_result); if (iomap.iomap_flags & IOMAP_DELAY) { @@ -1347,17 +1444,32 @@ xfs_end_io_direct( * This is not necessary for synchronous direct I/O, but we do * it anyway to keep the code uniform and simpler. * + * Well, if only it were that simple. Because synchronous direct I/O + * requires extent conversion to occur *before* we return to userspace, + * we have to wait for extent conversion to complete. Look at the + * iocb that has been passed to us to determine if this is AIO or + * not. If it is synchronous, tell xfs_finish_ioend() to kick the + * workqueue and wait for it to complete. + * * The core direct I/O code might be changed to always call the * completion handler in the future, in which case all this can * go away. */ - if (private && size > 0) { - ioend->io_offset = offset; - ioend->io_size = size; - xfs_finish_ioend(ioend); + ioend->io_offset = offset; + ioend->io_size = size; + if (ioend->io_type == IOMAP_READ) { + xfs_finish_ioend(ioend, 0); + } else if (private && size > 0) { + xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); } else { - ASSERT(size >= 0); - xfs_destroy_ioend(ioend); + /* + * A direct I/O write ioend starts it's life in unwritten + * state in case they map an unwritten extent. This write + * didn't map an unwritten extent so switch it's completion + * handler. + */ + INIT_WORK(&ioend->io_work, xfs_end_bio_written); + xfs_finish_ioend(ioend, 0); } /* @@ -1378,28 +1490,21 @@ xfs_vm_direct_IO( { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - xfs_iomap_t iomap; - int maps = 1; - int error; + struct block_device *bdev; ssize_t ret; - error = bhv_vop_bmap(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps); - if (error) - return -error; - - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); + bdev = xfs_find_bdev_for_inode(XFS_I(inode)); if (rw == WRITE) { + iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - iomap.iomap_target->bt_bdev, - iov, offset, nr_segs, + bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct); } else { + iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, - iomap.iomap_target->bt_bdev, - iov, offset, nr_segs, + bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct); } @@ -1410,13 +1515,18 @@ xfs_vm_direct_IO( } STATIC int -xfs_vm_prepare_write( +xfs_vm_write_begin( struct file *file, - struct page *page, - unsigned int from, - unsigned int to) + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) { - return block_prepare_write(page, from, to, xfs_get_blocks); + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + xfs_get_blocks); } STATIC sector_t @@ -1425,12 +1535,12 @@ xfs_vm_bmap( sector_t block) { struct inode *inode = (struct inode *)mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); + struct xfs_inode *ip = XFS_I(inode); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - bhv_vop_rwlock(vp, VRWLOCK_READ); - bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF); - bhv_vop_rwunlock(vp, VRWLOCK_READ); + xfs_itrace_entry(XFS_I(inode)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } @@ -1470,8 +1580,8 @@ const struct address_space_operations xfs_address_space_operations = { .sync_page = block_sync_page, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .prepare_write = xfs_vm_prepare_write, - .commit_write = generic_commit_write, + .write_begin = xfs_vm_write_begin, + .write_end = generic_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 2244e51..3ba0631 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -32,7 +32,7 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ - struct bhv_vnode *io_vnode; /* file being written to */ + struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 38d0dfd..8454dee 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "xfs.h" #include <linux/stddef.h> #include <linux/errno.h> #include <linux/slab.h> @@ -30,15 +31,19 @@ #include <linux/hash.h> #include <linux/kthread.h> #include <linux/migrate.h> -#include "xfs_linux.h" +#include <linux/backing-dev.h> +#include <linux/freezer.h> -STATIC kmem_zone_t *xfs_buf_zone; -STATIC kmem_shaker_t xfs_buf_shake; +static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); +static struct shrinker xfs_buf_shake = { + .shrink = xfsbufd_wakeup, + .seeks = DEFAULT_SEEKS, +}; -STATIC struct workqueue_struct *xfslogd_workqueue; +static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; #ifdef XFS_BUF_TRACE @@ -53,7 +58,7 @@ xfs_buf_trace( bp, id, (void *)(unsigned long)bp->b_flags, (void *)(unsigned long)bp->b_hold.counter, - (void *)(unsigned long)bp->b_sema.count.counter, + (void *)(unsigned long)bp->b_sema.count, (void *)current, data, ra, (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), @@ -137,7 +142,7 @@ page_region_mask( return mask; } -STATIC inline void +STATIC_INLINE void set_page_region( struct page *page, size_t offset, @@ -149,7 +154,7 @@ set_page_region( SetPageUptodate(page); } -STATIC inline int +STATIC_INLINE int test_page_region( struct page *page, size_t offset, @@ -169,9 +174,9 @@ typedef struct a_list { struct a_list *next; } a_list_t; -STATIC a_list_t *as_free_head; -STATIC int as_list_len; -STATIC DEFINE_SPINLOCK(as_lock); +static a_list_t *as_free_head; +static int as_list_len; +static DEFINE_SPINLOCK(as_lock); /* * Try to batch vunmaps because they are costly. @@ -182,6 +187,19 @@ free_address( { a_list_t *aentry; +#ifdef CONFIG_XEN + /* + * Xen needs to be able to make sure it can get an exclusive + * RO mapping of pages it wants to turn into a pagetable. If + * a newly allocated page is also still being vmap()ed by xfs, + * it will cause pagetable construction to fail. This is a + * quick workaround to always eagerly unmap pages so that Xen + * is happy. + */ + vunmap(addr); + return; +#endif + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); @@ -235,7 +253,7 @@ _xfs_buf_initialize( memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); - init_MUTEX_LOCKED(&bp->b_iodonesema); + init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_hash_list); init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ @@ -292,8 +310,7 @@ _xfs_buf_free_pages( xfs_buf_t *bp) { if (bp->b_pages != bp->b_page_array) { - kmem_free(bp->b_pages, - bp->b_page_count * sizeof(struct page *)); + kmem_free(bp->b_pages); } } @@ -312,22 +329,19 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_hash_list)); - if (bp->b_flags & _XBF_PAGE_CACHE) { + if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) free_address(bp->b_addr - bp->b_offset); - for (i = 0; i < bp->b_page_count; i++) - page_cache_release(bp->b_pages[i]); - _xfs_buf_free_pages(bp); - } else if (bp->b_flags & _XBF_KMEM_ALLOC) { - /* - * XXX(hch): bp->b_count_desired might be incorrect (see - * xfs_buf_associate_memory for details), but fortunately - * the Linux version of kmem_free ignores the len argument.. - */ - kmem_free(bp->b_addr, bp->b_count_desired); + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + if (bp->b_flags & _XBF_PAGE_CACHE) + ASSERT(!PagePrivate(page)); + page_cache_release(page); + } _xfs_buf_free_pages(bp); } @@ -387,11 +401,11 @@ _xfs_buf_lookup_pages( printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, gfp_mask); + __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); xfsbufd_wakeup(0, gfp_mask); - blk_congestion_wait(WRITE, HZ/50); + congestion_wait(WRITE, HZ/50); goto retry; } @@ -400,11 +414,12 @@ _xfs_buf_lookup_pages( nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); size -= nbytes; + ASSERT(!PagePrivate(page)); if (!PageUptodate(page)) { page_count--; if (blocksize >= PAGE_CACHE_SIZE) { if (flags & XBF_READ) - bp->b_locked = 1; + bp->b_flags |= _XBF_PAGE_LOCKED; } else if (!PagePrivate(page)) { if (test_page_region(page, offset, nbytes)) page_count++; @@ -415,7 +430,7 @@ _xfs_buf_lookup_pages( offset = 0; } - if (!bp->b_locked) { + if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { for (i = 0; i < bp->b_page_count; i++) unlock_page(bp->b_pages[i]); } @@ -591,7 +606,7 @@ xfs_buf_get_flags( error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { printk(KERN_WARNING "%s: failed to map pages\n", - __FUNCTION__); + __func__); goto no_buffer; } } @@ -708,15 +723,15 @@ xfs_buf_associate_memory( { int rval; int i = 0; - size_t ptr; - size_t end, end_cur; - off_t offset; + unsigned long pageaddr; + unsigned long offset; + size_t buflen; int page_count; - page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; - offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); - if (offset && (len > PAGE_CACHE_SIZE)) - page_count++; + pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; + offset = (unsigned long)mem - pageaddr; + buflen = PAGE_CACHE_ALIGN(len + offset); + page_count = buflen >> PAGE_CACHE_SHIFT; /* Free any previous set of page pointers */ if (bp->b_pages) @@ -730,23 +745,16 @@ xfs_buf_associate_memory( return rval; bp->b_offset = offset; - ptr = (size_t) mem & PAGE_CACHE_MASK; - end = PAGE_CACHE_ALIGN((size_t) mem + len); - end_cur = end; - /* set up first page */ - bp->b_pages[0] = mem_to_page(mem); - - ptr += PAGE_CACHE_SIZE; - bp->b_page_count = ++i; - while (ptr < end) { - bp->b_pages[i] = mem_to_page((void *)ptr); - bp->b_page_count = ++i; - ptr += PAGE_CACHE_SIZE; + + for (i = 0; i < bp->b_page_count; i++) { + bp->b_pages[i] = mem_to_page((void *)pageaddr); + pageaddr += PAGE_CACHE_SIZE; } - bp->b_locked = 0; - bp->b_count_desired = bp->b_buffer_length = len; + bp->b_count_desired = len; + bp->b_buffer_length = buflen; bp->b_flags |= XBF_MAPPED; + bp->b_flags &= ~_XBF_PAGE_LOCKED; return 0; } @@ -756,43 +764,44 @@ xfs_buf_get_noaddr( size_t len, xfs_buftarg_t *target) { - size_t malloc_len = len; + unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + int error, i; xfs_buf_t *bp; - void *data; - int error; bp = xfs_buf_allocate(0); if (unlikely(bp == NULL)) goto fail; _xfs_buf_initialize(bp, target, 0, len, 0); - try_again: - data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); - if (unlikely(data == NULL)) + error = _xfs_buf_get_pages(bp, page_count, 0); + if (error) goto fail_free_buf; - /* check whether alignment matches.. */ - if ((__psunsigned_t)data != - ((__psunsigned_t)data & ~target->bt_smask)) { - /* .. else double the size and try again */ - kmem_free(data, malloc_len); - malloc_len <<= 1; - goto try_again; + for (i = 0; i < page_count; i++) { + bp->b_pages[i] = alloc_page(GFP_KERNEL); + if (!bp->b_pages[i]) + goto fail_free_mem; } + bp->b_flags |= _XBF_PAGES; - error = xfs_buf_associate_memory(bp, data, len); - if (error) + error = _xfs_buf_map_pages(bp, XBF_MAPPED); + if (unlikely(error)) { + printk(KERN_WARNING "%s: failed to map pages\n", + __func__); goto fail_free_mem; - bp->b_flags |= _XBF_KMEM_ALLOC; + } xfs_buf_unlock(bp); - XB_TRACE(bp, "no_daddr", data); + XB_TRACE(bp, "no_daddr", len); return bp; + fail_free_mem: - kmem_free(data, malloc_len); + while (--i >= 0) + __free_page(bp->b_pages[i]); + _xfs_buf_free_pages(bp); fail_free_buf: - xfs_buf_free(bp); + xfs_buf_deallocate(bp); fail: return NULL; } @@ -829,6 +838,7 @@ xfs_buf_rele( return; } + ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); @@ -842,11 +852,6 @@ xfs_buf_rele( spin_unlock(&hash->bh_lock); xfs_buf_free(bp); } - } else { - /* - * Catch reference count leaks - */ - ASSERT(atomic_read(&bp->b_hold) >= 0); } } @@ -886,7 +891,7 @@ int xfs_buf_lock_value( xfs_buf_t *bp) { - return atomic_read(&bp->b_sema.count); + return bp->b_sema.count; } #endif @@ -987,11 +992,24 @@ xfs_buf_wait_unpin( STATIC void xfs_buf_iodone_work( - void *v) + struct work_struct *work) { - xfs_buf_t *bp = (xfs_buf_t *)v; + xfs_buf_t *bp = + container_of(work, xfs_buf_t, b_iodone_work); - if (bp->b_iodone) + /* + * We can get an EOPNOTSUPP to ordered writes. Here we clear the + * ordered flag and reissue them. Because we can't tell the higher + * layers directly that they should not issue ordered I/O anymore, they + * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. + */ + if ((bp->b_error == EOPNOTSUPP) && + (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { + XB_TRACE(bp, "ordered_retry", bp->b_iodone); + bp->b_flags &= ~XBF_ORDERED; + bp->b_flags |= _XFS_BARRIER_FAILED; + xfs_buf_iorequest(bp); + } else if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); @@ -1002,7 +1020,7 @@ xfs_buf_ioend( xfs_buf_t *bp, int schedule) { - bp->b_flags &= ~(XBF_READ | XBF_WRITE); + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; @@ -1010,13 +1028,13 @@ xfs_buf_ioend( if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { - INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp); + INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); queue_work(xfslogd_workqueue, &bp->b_iodone_work); } else { - xfs_buf_iodone_work(bp); + xfs_buf_iodone_work(&bp->b_iodone_work); } } else { - up(&bp->b_iodonesema); + complete(&bp->b_iowait); } } @@ -1048,7 +1066,7 @@ xfs_buf_iostart( bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); xfs_buf_delwri_queue(bp, 1); - return status; + return 0; } bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ @@ -1076,50 +1094,35 @@ xfs_buf_iostart( return status; } -STATIC __inline__ int -_xfs_buf_iolocked( - xfs_buf_t *bp) -{ - ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE)); - if (bp->b_flags & XBF_READ) - return bp->b_locked; - return 0; -} - -STATIC __inline__ void +STATIC_INLINE void _xfs_buf_ioend( xfs_buf_t *bp, int schedule) { if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { - bp->b_locked = 0; + bp->b_flags &= ~_XBF_PAGE_LOCKED; xfs_buf_ioend(bp, schedule); } } -STATIC int +STATIC void xfs_buf_bio_end_io( struct bio *bio, - unsigned int bytes_done, int error) { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; unsigned int blocksize = bp->b_target->bt_bsize; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - if (bio->bi_size) - return 1; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - bp->b_error = EIO; + xfs_buf_ioerror(bp, -error); do { struct page *page = bvec->bv_page; + ASSERT(!PagePrivate(page)); if (unlikely(bp->b_error)) { if (bp->b_flags & XBF_READ) ClearPageUptodate(page); - SetPageError(page); } else if (blocksize >= PAGE_CACHE_SIZE) { SetPageUptodate(page); } else if (!PagePrivate(page) && @@ -1130,41 +1133,38 @@ xfs_buf_bio_end_io( if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (_xfs_buf_iolocked(bp)) { + if (bp->b_flags & _XBF_PAGE_LOCKED) unlock_page(page); - } } while (bvec >= bio->bi_io_vec); _xfs_buf_ioend(bp, 1); bio_put(bio); - return 0; } STATIC void _xfs_buf_ioapply( xfs_buf_t *bp) { - int i, rw, map_i, total_nr_pages, nr_pages; + int rw, map_i, total_nr_pages, nr_pages; struct bio *bio; int offset = bp->b_offset; int size = bp->b_count_desired; sector_t sector = bp->b_bn; unsigned int blocksize = bp->b_target->bt_bsize; - int locking = _xfs_buf_iolocked(bp); total_nr_pages = bp->b_page_count; map_i = 0; - if (bp->b_flags & _XBF_RUN_QUEUES) { - bp->b_flags &= ~_XBF_RUN_QUEUES; - rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC; - } else { - rw = (bp->b_flags & XBF_READ) ? READ : WRITE; - } - if (bp->b_flags & XBF_ORDERED) { ASSERT(!(bp->b_flags & XBF_READ)); rw = WRITE_BARRIER; + } else if (bp->b_flags & _XBF_RUN_QUEUES) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; + } else { + rw = (bp->b_flags & XBF_WRITE) ? WRITE : + (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; } /* Special code path for reading a sub page size buffer in -- @@ -1173,7 +1173,8 @@ _xfs_buf_ioapply( * filesystem block size is not smaller than the page size. */ if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && - (bp->b_flags & XBF_READ) && locking && + ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == + (XBF_READ|_XBF_PAGE_LOCKED)) && (blocksize >= PAGE_CACHE_SIZE)) { bio = bio_alloc(GFP_NOIO, 1); @@ -1190,24 +1191,6 @@ _xfs_buf_ioapply( goto submit_io; } - /* Lock down the pages which we need to for the request */ - if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) { - for (i = 0; size; i++) { - int nbytes = PAGE_CACHE_SIZE - offset; - struct page *page = bp->b_pages[i]; - - if (nbytes > size) - nbytes = size; - - lock_page(page); - - size -= nbytes; - offset = 0; - } - offset = bp->b_offset; - size = bp->b_count_desired; - } - next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); @@ -1288,7 +1271,7 @@ xfs_buf_iowait( XB_TRACE(bp, "iowait", 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); - down(&bp->b_iodonesema); + wait_for_completion(&bp->b_iowait); XB_TRACE(bp, "iowaited", (long)bp->b_error); return bp->b_error; } @@ -1399,7 +1382,7 @@ xfs_alloc_bufhash( btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * - sizeof(xfs_bufhash_t), KM_SLEEP); + sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); for (i = 0; i < (1 << btp->bt_hashshift); i++) { spin_lock_init(&btp->bt_hash[i].bh_lock); INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); @@ -1410,15 +1393,15 @@ STATIC void xfs_free_bufhash( xfs_buftarg_t *btp) { - kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t)); + kmem_free(btp->bt_hash); btp->bt_hash = NULL; } /* * buftarg list for delwrite queue processing */ -STATIC LIST_HEAD(xfs_buftarg_list); -STATIC DEFINE_SPINLOCK(xfs_buftarg_lock); +static LIST_HEAD(xfs_buftarg_list); +static DEFINE_SPINLOCK(xfs_buftarg_lock); STATIC void xfs_register_buftarg( @@ -1440,12 +1423,10 @@ xfs_unregister_buftarg( void xfs_free_buftarg( - xfs_buftarg_t *btp, - int external) + xfs_buftarg_t *btp) { xfs_flush_buftarg(btp, 1); - if (external) - xfs_blkdev_put(btp->bt_bdev); + xfs_blkdev_issue_flush(btp); xfs_free_bufhash(btp); iput(btp->bt_mapping->host); @@ -1455,7 +1436,7 @@ xfs_free_buftarg( xfs_unregister_buftarg(btp); kthread_stop(btp->bt_task); - kmem_free(btp, sizeof(*btp)); + kmem_free(btp); } STATIC int @@ -1553,7 +1534,7 @@ xfs_alloc_delwrite_queue( INIT_LIST_HEAD(&btp->bt_list); INIT_LIST_HEAD(&btp->bt_delwrite_queue); - spinlock_init(&btp->bt_delwrite_lock, "delwri_lock"); + spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); if (IS_ERR(btp->bt_task)) { @@ -1586,7 +1567,7 @@ xfs_alloc_buftarg( return btp; error: - kmem_free(btp, sizeof(*btp)); + kmem_free(btp); return NULL; } @@ -1670,20 +1651,62 @@ xfsbufd_wakeup( return 0; } +/* + * Move as many buffers as specified to the supplied list + * idicating if we skipped any buffers to prevent deadlocks. + */ +STATIC int +xfs_buf_delwri_split( + xfs_buftarg_t *target, + struct list_head *list, + unsigned long age) +{ + xfs_buf_t *bp, *n; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; + int skipped = 0; + int force; + + force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); + INIT_LIST_HEAD(list); + spin_lock(dwlk); + list_for_each_entry_safe(bp, n, dwq, b_list) { + XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); + ASSERT(bp->b_flags & XBF_DELWRI); + + if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { + if (!force && + time_before(jiffies, bp->b_queuetime + age)) { + xfs_buf_unlock(bp); + break; + } + + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| + _XBF_RUN_QUEUES); + bp->b_flags |= XBF_WRITE; + list_move_tail(&bp->b_list, list); + } else + skipped++; + } + spin_unlock(dwlk); + + return skipped; + +} + STATIC int xfsbufd( - void *data) + void *data) { - struct list_head tmp; - unsigned long age; - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; + struct list_head tmp; + xfs_buftarg_t *target = (xfs_buftarg_t *)data; + int count; + xfs_buf_t *bp; current->flags |= PF_MEMALLOC; - INIT_LIST_HEAD(&tmp); + set_freezable(); + do { if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); @@ -1695,42 +1718,24 @@ xfsbufd( schedule_timeout_interruptible( xfs_buf_timer_centisecs * msecs_to_jiffies(10)); - age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { - XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { - if (!test_bit(XBT_FORCE_FLUSH, - &target->bt_flags) && - time_before(jiffies, - bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - list_move(&bp->b_list, &tmp); - } - } - spin_unlock(dwlk); + xfs_buf_delwri_split(target, &tmp, + xfs_buf_age_centisecs * msecs_to_jiffies(10)); + count = 0; while (!list_empty(&tmp)) { bp = list_entry(tmp.next, xfs_buf_t, b_list); ASSERT(target == bp->b_target); list_del_init(&bp->b_list); xfs_buf_iostrategy(bp); - - blk_run_address_space(target->bt_mapping); + count++; } if (as_list_len > 0) purge_addresses(); + if (count) + blk_run_address_space(target->bt_mapping); - clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); } while (!kthread_should_stop()); return 0; @@ -1743,40 +1748,24 @@ xfsbufd( */ int xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) + xfs_buftarg_t *target, + int wait) { - struct list_head tmp; - xfs_buf_t *bp, *n; - int pincount = 0; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; + struct list_head tmp; + xfs_buf_t *bp, *n; + int pincount = 0; xfs_buf_runall_queues(xfsdatad_workqueue); xfs_buf_runall_queues(xfslogd_workqueue); - INIT_LIST_HEAD(&tmp); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { - ASSERT(bp->b_target == target); - ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q)); - XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp)); - if (xfs_buf_ispin(bp)) { - pincount++; - continue; - } - - list_move(&bp->b_list, &tmp); - } - spin_unlock(dwlk); + set_bit(XBT_FORCE_FLUSH, &target->bt_flags); + pincount = xfs_buf_delwri_split(target, &tmp, 0); /* * Dropped the delayed write list lock, now walk the temporary list */ list_for_each_entry_safe(bp, n, &tmp, b_list) { - xfs_buf_lock(bp); - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; + ASSERT(target == bp->b_target); if (wait) bp->b_flags &= ~XBF_ASYNC; else @@ -1785,6 +1774,9 @@ xfs_flush_buftarg( xfs_buf_iostrategy(bp); } + if (wait) + blk_run_address_space(target->bt_mapping); + /* * Remaining list items must be flushed before returning */ @@ -1796,9 +1788,6 @@ xfs_flush_buftarg( xfs_buf_relse(bp); } - if (wait) - blk_run_address_space(target->bt_mapping); - return pincount; } @@ -1806,7 +1795,7 @@ int __init xfs_buf_init(void) { #ifdef XFS_BUF_TRACE - xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); + xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS); #endif xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", @@ -1822,14 +1811,9 @@ xfs_buf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup); - if (!xfs_buf_shake) - goto out_destroy_xfsdatad_workqueue; - + register_shrinker(&xfs_buf_shake); return 0; - out_destroy_xfsdatad_workqueue: - destroy_workqueue(xfsdatad_workqueue); out_destroy_xfslogd_workqueue: destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: @@ -1844,7 +1828,7 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - kmem_shake_deregister(xfs_buf_shake); + unregister_shrinker(&xfs_buf_shake); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); @@ -1852,3 +1836,11 @@ xfs_buf_terminate(void) ktrace_free(xfs_buf_trace_buf); #endif } + +#ifdef CONFIG_KDB_MODULES +struct list_head * +xfs_get_buftarg_list(void) +{ + return &xfs_buftarg_list; +} +#endif diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 7858703..456519a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -63,14 +63,41 @@ typedef enum { /* flags used only internally */ _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ - _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ + _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ + + /* + * Special flag for supporting metadata blocks smaller than a FSB. + * + * In this case we can have multiple xfs_buf_t on a single page and + * need to lock out concurrent xfs_buf_t readers as they only + * serialise access to the buffer. + * + * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation + * between reads of the page. Hence we can have one thread read the + * page and modify it, but then race with another thread that thinks + * the page is not up-to-date and hence reads it again. + * + * The result is that the first modifcation to the page is lost. + * This sort of AGF/AGI reading race can happen when unlinking inodes + * that require truncation and results in the AGI unlinked list + * modifications being lost. + */ + _XBF_PAGE_LOCKED = (1 << 22), + + /* + * If we try a barrier write, but it fails we have to communicate + * this to the upper layers. Unfortunately b_error gets overwritten + * when the buffer is re-issued so we have to add another flag to + * keep this information. + */ + _XFS_BARRIER_FAILED = (1 << 23), } xfs_buf_flags_t; typedef enum { - XBT_FORCE_SLEEP = (0 << 1), - XBT_FORCE_FLUSH = (1 << 1), + XBT_FORCE_SLEEP = 0, + XBT_FORCE_FLUSH = 1, } xfs_buftarg_flags_t; typedef struct xfs_bufhash { @@ -138,12 +165,11 @@ typedef struct xfs_buf { xfs_buf_iodone_t b_iodone; /* I/O completion function */ xfs_buf_relse_t b_relse; /* releasing function */ xfs_buf_bdstrat_t b_strat; /* pre-write function */ - struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ + struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; void *b_fspriv3; unsigned short b_error; /* error code on I/O */ - unsigned short b_locked; /* page array is locked */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ struct page **b_pages; /* array of page pointers */ @@ -298,11 +324,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_ISUNINITIAL(bp) (0) -#define XFS_BUF_UNUNINITIAL(bp) (0) - -#define XFS_BUF_BP_ISMAPPED(bp) (1) - #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) @@ -339,7 +360,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) -#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); +#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) #define XFS_BUF_TARGET(bp) ((bp)->b_target) @@ -393,13 +414,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp) return error; } -#define XFS_bdwrite(bp) xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC) - -static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) +/* + * No error can be returned from xfs_buf_iostart for delwri + * buffers as they are queued and no I/O is issued. + */ +static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp) { bp->b_strat = xfs_bdstrat_cb; bp->b_fspriv3 = mp; - return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); + (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); } #define XFS_bdstrat(bp) xfs_buf_iorequest(bp) @@ -414,10 +437,13 @@ static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *, int); +extern void xfs_free_buftarg(xfs_buftarg_t *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); +#ifdef CONFIG_KDB_MODULES +extern struct list_head *xfs_get_buftarg_list(void); +#endif #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index e7f3da6..652721c 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -30,7 +30,7 @@ typedef struct cred { extern struct cred *sys_cred; /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ -static __inline int capable_cred(cred_t *cr, int cid) +static inline int capable_cred(cred_t *cr, int cid) { return (cr == sys_cred) ? 1 : capable(cid); } diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h new file mode 100644 index 0000000..a8b0b16 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_dmapi_priv.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DMAPI_PRIV_H__ +#define __XFS_DMAPI_PRIV_H__ + +/* + * Based on IO_ISDIRECT, decide which i_ flag is set. + */ +#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ + DM_FLAGS_IMUX : 0) +#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) + +#endif /*__XFS_DMAPI_PRIV_H__*/ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 5fb75d9..7f7abec 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -17,72 +17,40 @@ */ #include "xfs.h" #include "xfs_types.h" -#include "xfs_dmapi.h" +#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_export.h" - -STATIC struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, }; +#include "xfs_vnodeops.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_vfsops.h" /* - * XFS encodes and decodes the fileid portion of NFS filehandles - * itself instead of letting the generic NFS code do it. This - * allows filesystems with 64 bit inode numbers to be exported. - * - * Note that a side effect is that xfs_vget() won't be passed a - * zero inode/generation pair under normal circumstances. As - * however a malicious client could send us such data, the check - * remains in that code. + * Note that we only accept fileids which are long enough rather than allow + * the parent generation number to default to zero. XFS considers zero a + * valid generation number not an invalid/wildcard value. */ - -STATIC struct dentry * -xfs_fs_decode_fh( - struct super_block *sb, - __u32 *fh, - int fh_len, - int fileid_type, - int (*acceptable)( - void *context, - struct dentry *de), - void *context) +static int xfs_fileid_length(int fileid_type) { - xfs_fid2_t ifid; - xfs_fid2_t pfid; - void *parent = NULL; - int is64 = 0; - __u32 *p = fh; - -#if XFS_BIG_INUMS - is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG); - fileid_type &= ~XFS_FILEID_TYPE_64FLAG; -#endif - - /* - * Note that we only accept fileids which are long enough - * rather than allow the parent generation number to default - * to zero. XFS considers zero a valid generation number not - * an invalid/wildcard value. There's little point printk'ing - * a warning here as we don't have the client information - * which would make such a warning useful. - */ - if (fileid_type > 2 || - fh_len < xfs_fileid_length((fileid_type == 2), is64)) - return NULL; - - p = xfs_fileid_decode_fid2(p, &ifid, is64); - - if (fileid_type == 2) { - p = xfs_fileid_decode_fid2(p, &pfid, is64); - parent = &pfid; + switch (fileid_type) { + case FILEID_INO32_GEN: + return 2; + case FILEID_INO32_GEN_PARENT: + return 4; + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + return 3; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + return 6; } - - fh = (__u32 *)&ifid; - return sb->s_export_op->find_exported_dentry(sb, fh, parent, acceptable, context); + return 255; /* invalid */ } - STATIC int xfs_fs_encode_fh( struct dentry *dentry, @@ -90,23 +58,21 @@ xfs_fs_encode_fh( int *max_len, int connectable) { + struct fid *fid = (struct fid *)fh; + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; struct inode *inode = dentry->d_inode; - int type = 1; - __u32 *p = fh; + int fileid_type; int len; - int is64 = 0; -#if XFS_BIG_INUMS - bhv_vfs_t *vfs = vfs_from_sb(inode->i_sb); - - if (!(vfs->vfs_flag & VFS_32BITINODES)) { - /* filesystem may contain 64bit inode numbers */ - is64 = XFS_FILEID_TYPE_64FLAG; - } -#endif /* Directories don't need their parent encoded, they have ".." */ - if (S_ISDIR(inode->i_mode)) - connectable = 0; + if (S_ISDIR(inode->i_mode) || !connectable) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + + /* filesystem may contain 64bit inode numbers */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) + fileid_type |= XFS_FILEID_TYPE_64FLAG; /* * Only encode if there is enough space given. In practice @@ -114,44 +80,111 @@ xfs_fs_encode_fh( * over NFSv2 with the subtree_check export option; the other * seven combinations work. The real answer is "don't use v2". */ - len = xfs_fileid_length(connectable, is64); + len = xfs_fileid_length(fileid_type); if (*max_len < len) return 255; *max_len = len; - p = xfs_fileid_encode_inode(p, inode, is64); - if (connectable) { + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: spin_lock(&dentry->d_lock); - p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64); + fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; + fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; spin_unlock(&dentry->d_lock); - type = 2; + /*FALLTHRU*/ + case FILEID_INO32_GEN: + fid->i32.ino = inode->i_ino; + fid->i32.gen = inode->i_generation; + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + spin_lock(&dentry->d_lock); + fid64->parent_ino = dentry->d_parent->d_inode->i_ino; + fid64->parent_gen = dentry->d_parent->d_inode->i_generation; + spin_unlock(&dentry->d_lock); + /*FALLTHRU*/ + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + fid64->ino = inode->i_ino; + fid64->gen = inode->i_generation; + break; + } + + return fileid_type; +} + +STATIC struct inode * +xfs_nfs_get_inode( + struct super_block *sb, + u64 ino, + u32 generation) + { + xfs_mount_t *mp = XFS_M(sb); + xfs_inode_t *ip; + int error; + + /* + * NFS can sometimes send requests for ino 0. Fail them gracefully. + */ + if (ino == 0) + return ERR_PTR(-ESTALE); + + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); + if (error) + return ERR_PTR(-error); + if (!ip) + return ERR_PTR(-EIO); + + if (ip->i_d.di_gen != generation) { + xfs_iput_new(ip, XFS_ILOCK_SHARED); + return ERR_PTR(-ENOENT); } - BUG_ON((p - fh) != len); - return type | is64; + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return VFS_I(ip); } STATIC struct dentry * -xfs_fs_get_dentry( - struct super_block *sb, - void *data) +xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) { - bhv_vnode_t *vp; - struct inode *inode; - struct dentry *result; - bhv_vfs_t *vfsp = vfs_from_sb(sb); - int error; + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; - error = bhv_vfs_vget(vfsp, &vp, (fid_t *)data); - if (error || vp == NULL) - return ERR_PTR(-ESTALE) ; + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; - inode = vn_to_inode(vp); - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + case FILEID_INO32_GEN: + inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen); + break; } - return result; + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, + fid->i32.parent_gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->parent_ino, + fid64->parent_gen); + break; + } + + return d_obtain_alias(inode); } STATIC struct dentry * @@ -159,26 +192,18 @@ xfs_fs_get_parent( struct dentry *child) { int error; - bhv_vnode_t *vp, *cvp; - struct dentry *parent; + struct xfs_inode *cip; - cvp = NULL; - vp = vn_from_inode(child->d_inode); - error = bhv_vop_lookup(vp, &dotdot, &cvp, 0, NULL, NULL); + error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); if (unlikely(error)) return ERR_PTR(-error); - parent = d_alloc_anon(vn_to_inode(cvp)); - if (unlikely(!parent)) { - VN_RELE(cvp); - return ERR_PTR(-ENOMEM); - } - return parent; + return d_obtain_alias(VFS_I(cip)); } -struct export_operations xfs_export_operations = { - .decode_fh = xfs_fs_decode_fh, +const struct export_operations xfs_export_operations = { .encode_fh = xfs_fs_encode_fh, + .fh_to_dentry = xfs_fs_fh_to_dentry, + .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, - .get_dentry = xfs_fs_get_dentry, }; diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h index e794ca4..3272b6a 100644 --- a/fs/xfs/linux-2.6/xfs_export.h +++ b/fs/xfs/linux-2.6/xfs_export.h @@ -59,50 +59,14 @@ * a subdirectory) or use the "fsid" export option. */ +struct xfs_fid64 { + u64 ino; + u32 gen; + u64 parent_ino; + u32 parent_gen; +} __attribute__((packed)); + /* This flag goes on the wire. Don't play with it. */ #define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ -/* Calculate the length in u32 units of the fileid data */ -static inline int -xfs_fileid_length(int hasparent, int is64) -{ - return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2); -} - -/* - * Decode encoded inode information (either for the inode itself - * or the parent) into an xfs_fid2_t structure. Advances and - * returns the new data pointer - */ -static inline __u32 * -xfs_fileid_decode_fid2(__u32 *p, xfs_fid2_t *fid, int is64) -{ - fid->fid_len = sizeof(xfs_fid2_t) - sizeof(fid->fid_len); - fid->fid_pad = 0; - fid->fid_ino = *p++; -#if XFS_BIG_INUMS - if (is64) - fid->fid_ino |= (((__u64)(*p++)) << 32); -#endif - fid->fid_gen = *p++; - return p; -} - -/* - * Encode inode information (either for the inode itself or the - * parent) into a fileid buffer. Advances and returns the new - * data pointer. - */ -static inline __u32 * -xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64) -{ - *p++ = (__u32)inode->i_ino; -#if XFS_BIG_INUMS - if (is64) - *p++ = (__u32)(inode->i_ino >> 32); -#endif - *p++ = inode->i_generation; - return p; -} - #endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 3d4f6df..3fee790 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,204 +37,85 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_ioctl32.h" +#include "xfs_vnodeops.h" #include <linux/dcache.h> #include <linux/smp_lock.h> static struct vm_operations_struct xfs_file_vm_ops; -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops; -#endif -STATIC inline ssize_t +STATIC_INLINE ssize_t __xfs_file_read( struct kiocb *iocb, - char __user *buf, + const struct iovec *iov, + unsigned long nr_segs, int ioflags, - size_t count, loff_t pos) { - struct iovec iov = {buf, count}; struct file *file = iocb->ki_filp; - bhv_vnode_t *vp = vn_from_inode(file->f_dentry->d_inode); BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; - return bhv_vop_read(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL); + return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, + nr_segs, &iocb->ki_pos, ioflags); } STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, - char __user *buf, - size_t count, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - return __xfs_file_read(iocb, buf, IO_ISAIO, count, pos); + return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos); } STATIC ssize_t xfs_file_aio_read_invis( struct kiocb *iocb, - char __user *buf, - size_t count, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - return __xfs_file_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); + return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos); } -STATIC inline ssize_t +STATIC_INLINE ssize_t __xfs_file_write( - struct kiocb *iocb, - const char __user *buf, - int ioflags, - size_t count, - loff_t pos) + struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, + int ioflags, + loff_t pos) { - struct iovec iov = {(void __user *)buf, count}; struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; - return bhv_vop_write(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL); + return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, + &iocb->ki_pos, ioflags); } STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, - const char __user *buf, - size_t count, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - return __xfs_file_write(iocb, buf, IO_ISAIO, count, pos); + return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos); } STATIC ssize_t xfs_file_aio_write_invis( struct kiocb *iocb, - const char __user *buf, - size_t count, - loff_t pos) -{ - return __xfs_file_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); -} - -STATIC inline ssize_t -__xfs_file_readv( - struct file *file, - const struct iovec *iov, - int ioflags, - unsigned long nr_segs, - loff_t *ppos) -{ - struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - struct kiocb kiocb; - ssize_t rval; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - rval = bhv_vop_read(vp, &kiocb, iov, nr_segs, - &kiocb.ki_pos, ioflags, NULL); - - *ppos = kiocb.ki_pos; - return rval; -} - -STATIC ssize_t -xfs_file_readv( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_readv(file, iov, 0, nr_segs, ppos); -} - -STATIC ssize_t -xfs_file_readv_invis( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_readv(file, iov, IO_INVIS, nr_segs, ppos); -} - -STATIC inline ssize_t -__xfs_file_writev( - struct file *file, - const struct iovec *iov, - int ioflags, - unsigned long nr_segs, - loff_t *ppos) -{ - struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - struct kiocb kiocb; - ssize_t rval; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - - rval = bhv_vop_write(vp, &kiocb, iov, nr_segs, - &kiocb.ki_pos, ioflags, NULL); - - *ppos = kiocb.ki_pos; - return rval; -} - -STATIC ssize_t -xfs_file_writev( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_writev(file, iov, 0, nr_segs, ppos); -} - -STATIC ssize_t -xfs_file_writev_invis( - struct file *file, - const struct iovec *iov, + const struct iovec *iov, unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_writev(file, iov, IO_INVIS, nr_segs, ppos); -} - -STATIC ssize_t -xfs_file_sendfile( - struct file *filp, - loff_t *pos, - size_t count, - read_actor_t actor, - void *target) -{ - return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode), - filp, pos, 0, count, actor, target, NULL); -} - -STATIC ssize_t -xfs_file_sendfile_invis( - struct file *filp, - loff_t *pos, - size_t count, - read_actor_t actor, - void *target) + loff_t pos) { - return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode), - filp, pos, IO_INVIS, count, actor, target, NULL); + return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos); } STATIC ssize_t @@ -245,8 +126,8 @@ xfs_file_splice_read( size_t len, unsigned int flags) { - return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode), - infilp, ppos, pipe, len, flags, 0, NULL); + return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), + infilp, ppos, pipe, len, flags, 0); } STATIC ssize_t @@ -257,9 +138,8 @@ xfs_file_splice_read_invis( size_t len, unsigned int flags) { - return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode), - infilp, ppos, pipe, len, flags, IO_INVIS, - NULL); + return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), + infilp, ppos, pipe, len, flags, IO_INVIS); } STATIC ssize_t @@ -270,8 +150,8 @@ xfs_file_splice_write( size_t len, unsigned int flags) { - return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode), - pipe, outfilp, ppos, len, flags, 0, NULL); + return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), + pipe, outfilp, ppos, len, flags, 0); } STATIC ssize_t @@ -282,9 +162,8 @@ xfs_file_splice_write_invis( size_t len, unsigned int flags) { - return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode), - pipe, outfilp, ppos, len, flags, IO_INVIS, - NULL); + return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), + pipe, outfilp, ppos, len, flags, IO_INVIS); } STATIC int @@ -294,16 +173,7 @@ xfs_file_open( { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EFBIG; - return -bhv_vop_open(vn_from_inode(inode), NULL); -} - -STATIC int -xfs_file_close( - struct file *filp, - fl_owner_t id) -{ - return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0, - file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL); + return -xfs_open(XFS_I(inode)); } STATIC int @@ -311,120 +181,59 @@ xfs_file_release( struct inode *inode, struct file *filp) { - bhv_vnode_t *vp = vn_from_inode(inode); - - if (vp) - return -bhv_vop_release(vp); - return 0; + return -xfs_release(XFS_I(inode)); } +/* + * We ignore the datasync flag here because a datasync is effectively + * identical to an fsync. That is, datasync implies that we need to write + * only the metadata needed to be able to access the data that is written + * if we crash after the call completes. Hence if we are writing beyond + * EOF we have to log the inode size change as well, which makes it a + * full fsync. If we don't write beyond EOF, the inode core will be + * clean in memory and so we don't need to log the inode, just like + * fsync. + */ STATIC int xfs_file_fsync( struct file *filp, struct dentry *dentry, int datasync) { - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - int flags = FSYNC_WAIT; - - if (datasync) - flags |= FSYNC_DATA; - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); - return -bhv_vop_fsync(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1); + xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); + return -xfs_fsync(XFS_I(dentry->d_inode)); } -#ifdef CONFIG_XFS_DMAPI -STATIC struct page * -xfs_vm_nopage( - struct vm_area_struct *area, - unsigned long address, - int *type) -{ - struct inode *inode = area->vm_file->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - - ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI); - if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0)) - return NULL; - return filemap_nopage(area, address, type); -} -#endif /* CONFIG_XFS_DMAPI */ - STATIC int xfs_file_readdir( struct file *filp, void *dirent, filldir_t filldir) { - int error = 0; - bhv_vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); - uio_t uio; - iovec_t iov; - int eof = 0; - caddr_t read_buf; - int namelen, size = 0; - size_t rlen = PAGE_CACHE_SIZE; - xfs_off_t start_offset, curr_offset; - xfs_dirent_t *dbp = NULL; - - /* Try fairly hard to get memory */ - do { - if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) - break; - rlen >>= 1; - } while (rlen >= 1024); - - if (read_buf == NULL) - return -ENOMEM; - - uio.uio_iov = &iov; - uio.uio_segflg = UIO_SYSSPACE; - curr_offset = filp->f_pos; - if (filp->f_pos != 0x7fffffff) - uio.uio_offset = filp->f_pos; - else - uio.uio_offset = 0xffffffff; - - while (!eof) { - uio.uio_resid = iov.iov_len = rlen; - iov.iov_base = read_buf; - uio.uio_iovcnt = 1; - - start_offset = uio.uio_offset; - - error = bhv_vop_readdir(vp, &uio, NULL, &eof); - if ((uio.uio_offset == start_offset) || error) { - size = 0; - break; - } - - size = rlen - uio.uio_resid; - dbp = (xfs_dirent_t *)read_buf; - while (size > 0) { - namelen = strlen(dbp->d_name); - - if (filldir(dirent, dbp->d_name, namelen, - (loff_t) curr_offset & 0x7fffffff, - (ino_t) dbp->d_ino, - DT_UNKNOWN)) { - goto done; - } - size -= dbp->d_reclen; - curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */; - dbp = (xfs_dirent_t *)((char *)dbp + dbp->d_reclen); - } - } -done: - if (!error) { - if (size == 0) - filp->f_pos = uio.uio_offset & 0x7fffffff; - else if (dbp) - filp->f_pos = curr_offset; - } - - kfree(read_buf); - return -error; + struct inode *inode = filp->f_path.dentry->d_inode; + xfs_inode_t *ip = XFS_I(inode); + int error; + size_t bufsize; + + /* + * The Linux API doesn't pass down the total size of the buffer + * we read into down to the filesystem. With the filldir concept + * it's not needed for correct information, but the XFS dir2 leaf + * code wants an estimate of the buffer size to calculate it's + * readahead window and size the buffers used for mapping to + * physical blocks. + * + * Try to give it an estimate that's good enough, maybe at some + * point we can change the ->readdir prototype to include the + * buffer size. + */ + bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size); + + error = xfs_readdir(ip, dirent, bufsize, + (xfs_off_t *)&filp->f_pos, filldir); + if (error) + return -error; + return 0; } STATIC int @@ -433,11 +242,7 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = &xfs_file_vm_ops; - -#ifdef CONFIG_XFS_DMAPI - if (vn_from_inode(filp->f_dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI) - vma->vm_ops = &xfs_dmapi_file_vm_ops; -#endif /* CONFIG_XFS_DMAPI */ + vma->vm_flags |= VM_CAN_NONLINEAR; file_accessed(filp); return 0; @@ -450,11 +255,10 @@ xfs_file_ioctl( unsigned long p) { int error; - struct inode *inode = filp->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); + struct inode *inode = filp->f_path.dentry->d_inode; - error = bhv_vop_ioctl(vp, inode, filp, 0, cmd, (void __user *)p); - VMODIFY(vp); + error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p); + xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); /* NOTE: some of the ioctl's return positive #'s as a * byte count indicating success, such as @@ -472,11 +276,10 @@ xfs_file_ioctl_invis( unsigned long p) { int error; - struct inode *inode = filp->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); + struct inode *inode = filp->f_path.dentry->d_inode; - error = bhv_vop_ioctl(vp, inode, filp, IO_INVIS, cmd, (void __user *)p); - VMODIFY(vp); + error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p); + xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); /* NOTE: some of the ioctl's return positive #'s as a * byte count indicating success, such as @@ -487,64 +290,26 @@ xfs_file_ioctl_invis( return error; } -#ifdef CONFIG_XFS_DMAPI -#ifdef HAVE_VMOP_MPROTECT -STATIC int -xfs_vm_mprotect( - struct vm_area_struct *vma, - unsigned int newflags) -{ - bhv_vnode_t *vp = vn_from_inode(vma->vm_file->f_dentry->d_inode); - int error = 0; - - if (vp->v_vfsp->vfs_flag & VFS_DMI) { - if ((vma->vm_flags & VM_MAYSHARE) && - (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) { - xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); - - error = XFS_SEND_MMAP(mp, vma, VM_WRITE); - } - } - return error; -} -#endif /* HAVE_VMOP_MPROTECT */ -#endif /* CONFIG_XFS_DMAPI */ - -#ifdef HAVE_FOP_OPEN_EXEC -/* If the user is attempting to execute a file that is offline then - * we have to trigger a DMAPI READ event before the file is marked as busy - * otherwise the invisible I/O will not be able to write to the file to bring - * it back online. +/* + * mmap()d file has taken write protection fault and is being made + * writable. We can set the page state up correctly for a writable + * page, which means we can do correct delalloc accounting (ENOSPC + * checking!) and unwritten extent mapping. */ STATIC int -xfs_file_open_exec( - struct inode *inode) +xfs_vm_page_mkwrite( + struct vm_area_struct *vma, + struct page *page) { - bhv_vnode_t *vp = vn_from_inode(inode); - - if (unlikely(vp->v_vfsp->vfs_flag & VFS_DMI)) { - xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); - xfs_inode_t *ip = xfs_vtoi(vp); - - if (!ip) - return -EINVAL; - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) - return -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, - 0, 0, 0, NULL); - } - return 0; + return block_page_mkwrite(vma, page, xfs_get_blocks); } -#endif /* HAVE_FOP_OPEN_EXEC */ const struct file_operations xfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, - .readv = xfs_file_readv, - .writev = xfs_file_writev, .aio_read = xfs_file_aio_read, .aio_write = xfs_file_aio_write, - .sendfile = xfs_file_sendfile, .splice_read = xfs_file_splice_read, .splice_write = xfs_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, @@ -553,7 +318,6 @@ const struct file_operations xfs_file_operations = { #endif .mmap = xfs_file_mmap, .open = xfs_file_open, - .flush = xfs_file_close, .release = xfs_file_release, .fsync = xfs_file_fsync, #ifdef HAVE_FOP_OPEN_EXEC @@ -565,11 +329,8 @@ const struct file_operations xfs_invis_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, - .readv = xfs_file_readv_invis, - .writev = xfs_file_writev_invis, .aio_read = xfs_file_aio_read_invis, .aio_write = xfs_file_aio_write_invis, - .sendfile = xfs_file_sendfile_invis, .splice_read = xfs_file_splice_read_invis, .splice_write = xfs_file_splice_write_invis, .unlocked_ioctl = xfs_file_ioctl_invis, @@ -578,7 +339,6 @@ const struct file_operations xfs_invis_file_operations = { #endif .mmap = xfs_file_mmap, .open = xfs_file_open, - .flush = xfs_file_close, .release = xfs_file_release, .fsync = xfs_file_fsync, }; @@ -587,6 +347,7 @@ const struct file_operations xfs_invis_file_operations = { const struct file_operations xfs_dir_file_operations = { .read = generic_read_dir, .readdir = xfs_file_readdir, + .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, @@ -595,16 +356,6 @@ const struct file_operations xfs_dir_file_operations = { }; static struct vm_operations_struct xfs_file_vm_ops = { - .nopage = filemap_nopage, - .populate = filemap_populate, -}; - -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops = { - .nopage = xfs_vm_nopage, - .populate = filemap_populate, -#ifdef HAVE_VMOP_MPROTECT - .mprotect = xfs_vm_mprotect, -#endif + .fault = filemap_fault, + .page_mkwrite = xfs_vm_page_mkwrite, }; -#endif /* CONFIG_XFS_DMAPI */ diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index dc05628..36caa6d 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -16,61 +16,66 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_vnodeops.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" int fs_noerr(void) { return 0; } int fs_nosys(void) { return ENOSYS; } void fs_noval(void) { return; } void -fs_tosspages( - bhv_desc_t *bdp, +xfs_tosspages( + xfs_inode_t *ip, xfs_off_t first, xfs_off_t last, int fiopt) { - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - struct inode *ip = vn_to_inode(vp); + struct address_space *mapping = VFS_I(ip)->i_mapping; - if (VN_CACHED(vp)) - truncate_inode_pages(ip->i_mapping, first); + if (mapping->nrpages) + truncate_inode_pages(mapping, first); } -void -fs_flushinval_pages( - bhv_desc_t *bdp, +int +xfs_flushinval_pages( + xfs_inode_t *ip, xfs_off_t first, xfs_off_t last, int fiopt) { - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - struct inode *ip = vn_to_inode(vp); + struct address_space *mapping = VFS_I(ip)->i_mapping; + int ret = 0; - if (VN_CACHED(vp)) { - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); - filemap_write_and_wait(ip->i_mapping); - truncate_inode_pages(ip->i_mapping, first); + if (mapping->nrpages) { + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = filemap_write_and_wait(mapping); + if (!ret) + truncate_inode_pages(mapping, first); } + return ret; } int -fs_flush_pages( - bhv_desc_t *bdp, +xfs_flush_pages( + xfs_inode_t *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt) { - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - struct inode *ip = vn_to_inode(vp); + struct address_space *mapping = VFS_I(ip)->i_mapping; + int ret = 0; + int ret2; - if (VN_DIRTY(vp)) { - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); - filemap_fdatawrite(ip->i_mapping); + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = filemap_fdatawrite(mapping); if (flags & XFS_B_ASYNC) - return 0; - filemap_fdatawait(ip->i_mapping); + return ret; + ret2 = filemap_fdatawait(mapping); + if (!ret) + ret = ret2; } - return 0; + return ret; } diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h index aee9ccd..82bb19b 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.h +++ b/fs/xfs/linux-2.6/xfs_fs_subr.h @@ -18,12 +18,8 @@ #ifndef __XFS_FS_SUBR_H__ #define __XFS_FS_SUBR_H__ -struct cred; extern int fs_noerr(void); extern int fs_nosys(void); extern void fs_noval(void); -extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -extern int fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int); #endif /* __XFS_FS_SUBR_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index 6c162c3..ef90e64 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -20,11 +20,6 @@ #include "xfs_sysctl.h" /* - * System memory size - used to scale certain data structures in XFS. - */ -unsigned long xfs_physmem; - -/* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. * 100ths of a second). @@ -34,7 +29,7 @@ xfs_param_t xfs_params = { .restrict_chown = { 0, 1, 1 }, .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 127 }, + .panic_mask = { 0, 0, 255 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, @@ -46,10 +41,12 @@ xfs_param_t xfs_params = { .inherit_nosym = { 0, 0, 1 }, .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, + .fstrm_timer = { 1, 30*100, 3600*100}, }; /* * Global system credential structure. */ -cred_t sys_cred_val, *sys_cred = &sys_cred_val; +static cred_t sys_cred_val; +cred_t *sys_cred = &sys_cred_val; diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h index e1a22bf..2770b00 100644 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ b/fs/xfs/linux-2.6/xfs_globals.h @@ -19,7 +19,6 @@ #define __XFS_GLOBALS_H__ extern uint64_t xfs_panic_mask; /* set to cause more panics */ -extern unsigned long xfs_physmem; extern struct cred *sys_cred; #endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 6e52a5d..d3438c7 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -41,14 +41,15 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_dfrag.h" #include "xfs_fsops.h" +#include "xfs_vnodeops.h" +#include "xfs_quota.h" +#include "xfs_inode_item.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -76,7 +77,6 @@ xfs_find_handle( xfs_handle_t handle; xfs_fsop_handlereq_t hreq; struct inode *inode; - bhv_vnode_t *vp; if (copy_from_user(&hreq, arg, sizeof(hreq))) return -XFS_ERROR(EFAULT); @@ -86,17 +86,15 @@ xfs_find_handle( switch (cmd) { case XFS_IOC_PATH_TO_FSHANDLE: case XFS_IOC_PATH_TO_HANDLE: { - struct nameidata nd; - int error; - - error = user_path_walk_link((const char __user *)hreq.path, &nd); + struct path path; + int error = user_lpath((const char __user *)hreq.path, &path); if (error) return error; - ASSERT(nd.dentry); - ASSERT(nd.dentry->d_inode); - inode = igrab(nd.dentry->d_inode); - path_release(&nd); + ASSERT(path.dentry); + ASSERT(path.dentry->d_inode); + inode = igrab(path.dentry->d_inode); + path_put(&path); break; } @@ -107,9 +105,9 @@ xfs_find_handle( if (!file) return -EBADF; - ASSERT(file->f_dentry); - ASSERT(file->f_dentry->d_inode); - inode = igrab(file->f_dentry->d_inode); + ASSERT(file->f_path.dentry); + ASSERT(file->f_path.dentry->d_inode); + inode = igrab(file->f_path.dentry->d_inode); fput(file); break; } @@ -135,28 +133,24 @@ xfs_find_handle( return -XFS_ERROR(EBADF); } - /* we need the vnode */ - vp = vn_from_inode(inode); - /* now we can grab the fsid */ - memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t)); + memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid, + sizeof(xfs_fsid_t)); hsize = sizeof(xfs_fsid_t); if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { - xfs_inode_t *ip; + xfs_inode_t *ip = XFS_I(inode); int lock_mode; /* need to get access to the xfs_inode to read the generation */ - ip = xfs_vtoi(vp); - ASSERT(ip); lock_mode = xfs_ilock_map_shared(ip); /* fill in fid section of handle from inode */ - handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) - - sizeof(handle.ha_fid.xfs_fid_len); - handle.ha_fid.xfs_fid_pad = 0; - handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen; - handle.ha_fid.xfs_fid_ino = ip->i_ino; + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_ino = ip->i_ino; xfs_iunlock_map_shared(ip, lock_mode); @@ -176,21 +170,19 @@ xfs_find_handle( /* - * Convert userspace handle data into vnode (and inode). - * We [ab]use the fact that all the fsop_handlereq ioctl calls - * have a data structure argument whose first component is always - * a xfs_fsop_handlereq_t, so we can cast to and from this type. - * This allows us to optimise the copy_from_user calls and gives - * a handy, shared routine. + * Convert userspace handle data into inode. + * + * We use the fact that all the fsop_handlereq ioctl calls have a data + * structure argument whose first component is always a xfs_fsop_handlereq_t, + * so we can pass that sub structure into this handy, shared routine. * - * If no error, caller must always VN_RELE the returned vp. + * If no error, caller must always iput the returned inode. */ STATIC int xfs_vget_fsop_handlereq( xfs_mount_t *mp, struct inode *parinode, /* parent inode pointer */ xfs_fsop_handlereq_t *hreq, - bhv_vnode_t **vp, struct inode **inode) { void __user *hanp; @@ -199,8 +191,6 @@ xfs_vget_fsop_handlereq( xfs_handle_t *handlep; xfs_handle_t handle; xfs_inode_t *ip; - struct inode *inodep; - bhv_vnode_t *vpp; xfs_ino_t ino; __u32 igen; int error; @@ -222,10 +212,10 @@ xfs_vget_fsop_handlereq( if (hlen < sizeof(*handlep)) memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); if (hlen > sizeof(handlep->ha_fsid)) { - if (handlep->ha_fid.xfs_fid_len != - (hlen - sizeof(handlep->ha_fsid) - - sizeof(handlep->ha_fid.xfs_fid_len)) - || handlep->ha_fid.xfs_fid_pad) + if (handlep->ha_fid.fid_len != + (hlen - sizeof(handlep->ha_fsid) - + sizeof(handlep->ha_fid.fid_len)) || + handlep->ha_fid.fid_pad) return XFS_ERROR(EINVAL); } @@ -233,32 +223,29 @@ xfs_vget_fsop_handlereq( * Crack the handle, obtain the inode # & generation # */ xfid = (struct xfs_fid *)&handlep->ha_fid; - if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) { - ino = xfid->xfs_fid_ino; - igen = xfid->xfs_fid_gen; + if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) { + ino = xfid->fid_ino; + igen = xfid->fid_gen; } else { return XFS_ERROR(EINVAL); } /* - * Get the XFS inode, building a vnode to go with it. + * Get the XFS inode, building a Linux inode to go with it. */ error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); if (error) return error; if (ip == NULL) return XFS_ERROR(EIO); - if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { + if (ip->i_d.di_gen != igen) { xfs_iput_new(ip, XFS_ILOCK_SHARED); return XFS_ERROR(ENOENT); } - vpp = XFS_ITOV(ip); - inodep = vn_to_inode(vpp); xfs_iunlock(ip, XFS_ILOCK_SHARED); - *vp = vpp; - *inode = inodep; + *inode = VFS_I(ip); return 0; } @@ -275,7 +262,6 @@ xfs_open_by_handle( struct file *filp; struct inode *inode; struct dentry *dentry; - bhv_vnode_t *vp; xfs_fsop_handlereq_t hreq; if (!capable(CAP_SYS_ADMIN)) @@ -283,7 +269,7 @@ xfs_open_by_handle( if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); + error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); if (error) return -error; @@ -325,134 +311,155 @@ xfs_open_by_handle( return new_fd; } - dentry = d_alloc_anon(inode); - if (dentry == NULL) { - iput(inode); + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { put_unused_fd(new_fd); - return -XFS_ERROR(ENOMEM); + return PTR_ERR(dentry); } /* Ensure umount returns EBUSY on umounts while this file is open. */ - mntget(parfilp->f_vfsmnt); + mntget(parfilp->f_path.mnt); /* Create file pointer. */ - filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags); + filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags); if (IS_ERR(filp)) { put_unused_fd(new_fd); return -XFS_ERROR(-PTR_ERR(filp)); } - if (inode->i_mode & S_IFREG) + if (inode->i_mode & S_IFREG) { + /* invisible operation should not change atime */ + filp->f_flags |= O_NOATIME; filp->f_op = &xfs_invis_file_operations; + } fd_install(new_fd, filp); return new_fd; } +/* + * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's + * unused first argument. + */ +STATIC int +do_readlink( + char __user *buffer, + int buflen, + const char *link) +{ + int len; + + len = PTR_ERR(link); + if (IS_ERR(link)) + goto out; + + len = strlen(link); + if (len > (unsigned) buflen) + len = buflen; + if (copy_to_user(buffer, link, len)) + len = -EFAULT; + out: + return len; +} + + STATIC int xfs_readlink_by_handle( xfs_mount_t *mp, void __user *arg, - struct file *parfilp, struct inode *parinode) { - int error; - struct iovec aiov; - struct uio auio; struct inode *inode; xfs_fsop_handlereq_t hreq; - bhv_vnode_t *vp; __u32 olen; + void *link; + int error; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); + error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); if (error) return -error; /* Restrict this handle operation to symlinks only. */ if (!S_ISLNK(inode->i_mode)) { - VN_RELE(vp); - return -XFS_ERROR(EINVAL); + error = -XFS_ERROR(EINVAL); + goto out_iput; } if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { - VN_RELE(vp); - return -XFS_ERROR(EFAULT); + error = -XFS_ERROR(EFAULT); + goto out_iput; } - aiov.iov_len = olen; - aiov.iov_base = hreq.ohandle; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = 0; - auio.uio_segflg = UIO_USERSPACE; - auio.uio_resid = olen; + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) + goto out_iput; - error = bhv_vop_readlink(vp, &auio, IO_INVIS, NULL); - VN_RELE(vp); + error = -xfs_readlink(XFS_I(inode), link); if (error) - return -error; + goto out_kfree; + error = do_readlink(hreq.ohandle, olen, link); + if (error) + goto out_kfree; - return (olen - auio.uio_resid); + out_kfree: + kfree(link); + out_iput: + iput(inode); + return error; } STATIC int xfs_fssetdm_by_handle( xfs_mount_t *mp, void __user *arg, - struct file *parfilp, struct inode *parinode) { int error; struct fsdmidata fsd; xfs_fsop_setdm_handlereq_t dmhreq; struct inode *inode; - bhv_desc_t *bdp; - bhv_vnode_t *vp; if (!capable(CAP_MKNOD)) return -XFS_ERROR(EPERM); if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode); + error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode); if (error) return -error; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { - VN_RELE(vp); - return -XFS_ERROR(EPERM); + error = -XFS_ERROR(EPERM); + goto out; } if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { - VN_RELE(vp); - return -XFS_ERROR(EFAULT); + error = -XFS_ERROR(EFAULT); + goto out; } - bdp = bhv_base_unlocked(VN_BHV_HEAD(vp)); - error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL); + error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); - VN_RELE(vp); - if (error) - return -error; - return 0; + out: + iput(inode); + return error; } STATIC int xfs_attrlist_by_handle( xfs_mount_t *mp, void __user *arg, - struct file *parfilp, struct inode *parinode) { int error; attrlist_cursor_kern_t *cursor; xfs_fsop_attrlist_handlereq_t al_hreq; struct inode *inode; - bhv_vnode_t *vp; char *kbuf; if (!capable(CAP_SYS_ADMIN)) @@ -462,8 +469,13 @@ xfs_attrlist_by_handle( if (al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); - error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, - &vp, &inode); + /* + * Reject flags, only allow namespaces. + */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); if (error) goto out; @@ -472,8 +484,8 @@ xfs_attrlist_by_handle( goto out_vn_rele; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = bhv_vop_attr_list(vp, kbuf, al_hreq.buflen, al_hreq.flags, - cursor, NULL); + error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); if (error) goto out_kfree; @@ -483,14 +495,14 @@ xfs_attrlist_by_handle( out_kfree: kfree(kbuf); out_vn_rele: - VN_RELE(vp); + iput(inode); out: return -error; } STATIC int xfs_attrmulti_attr_get( - bhv_vnode_t *vp, + struct inode *inode, char *name, char __user *ubuf, __uint32_t *len, @@ -498,14 +510,14 @@ xfs_attrmulti_attr_get( { char *kbuf; int error = EFAULT; - + if (*len > XATTR_SIZE_MAX) return EINVAL; kbuf = kmalloc(*len, GFP_KERNEL); if (!kbuf) return ENOMEM; - error = bhv_vop_attr_get(vp, name, kbuf, len, flags, NULL); + error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) goto out_kfree; @@ -519,7 +531,7 @@ xfs_attrmulti_attr_get( STATIC int xfs_attrmulti_attr_set( - bhv_vnode_t *vp, + struct inode *inode, char *name, const char __user *ubuf, __uint32_t len, @@ -528,9 +540,7 @@ xfs_attrmulti_attr_set( char *kbuf; int error = EFAULT; - if (IS_RDONLY(&vp->v_inode)) - return -EROFS; - if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return EPERM; if (len > XATTR_SIZE_MAX) return EINVAL; @@ -541,8 +551,8 @@ xfs_attrmulti_attr_set( if (copy_from_user(kbuf, ubuf, len)) goto out_kfree; - - error = bhv_vop_attr_set(vp, name, kbuf, len, flags, NULL); + + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); out_kfree: kfree(kbuf); @@ -551,15 +561,13 @@ xfs_attrmulti_attr_set( STATIC int xfs_attrmulti_attr_remove( - bhv_vnode_t *vp, + struct inode *inode, char *name, __uint32_t flags) { - if (IS_RDONLY(&vp->v_inode)) - return -EROFS; - if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return EPERM; - return bhv_vop_attr_remove(vp, name, flags, NULL); + return xfs_attr_remove(XFS_I(inode), name, flags); } STATIC int @@ -573,7 +581,6 @@ xfs_attrmulti_by_handle( xfs_attr_multiop_t *ops; xfs_fsop_attrmulti_handlereq_t am_hreq; struct inode *inode; - bhv_vnode_t *vp; unsigned int i, size; char *attr_name; @@ -582,12 +589,12 @@ xfs_attrmulti_by_handle( if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode); + error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode); if (error) goto out; error = E2BIG; - size = am_hreq.opcount * sizeof(attr_multiop_t); + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) goto out_vn_rele; @@ -616,18 +623,26 @@ xfs_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get(vp, + ops[i].am_error = xfs_attrmulti_attr_get(inode, attr_name, ops[i].am_attrvalue, &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = xfs_attrmulti_attr_set(vp, + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set(inode, attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; case ATTR_OP_REMOVE: - ops[i].am_error = xfs_attrmulti_attr_remove(vp, + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove(inode, attr_name, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; default: ops[i].am_error = EINVAL; @@ -641,350 +656,42 @@ xfs_attrmulti_by_handle( out_kfree_ops: kfree(ops); out_vn_rele: - VN_RELE(vp); + iput(inode); out: return -error; } -/* prototypes for a few of the stack-hungry cases that have - * their own functions. Functions are defined after their use - * so gcc doesn't get fancy and inline them with -03 */ - STATIC int xfs_ioc_space( - bhv_desc_t *bdp, - bhv_vnode_t *vp, - struct file *filp, - int flags, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_bulkstat( - xfs_mount_t *mp, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_fsgeometry_v1( - xfs_mount_t *mp, - void __user *arg); - -STATIC int -xfs_ioc_fsgeometry( - xfs_mount_t *mp, - void __user *arg); - -STATIC int -xfs_ioc_xattr( - bhv_vnode_t *vp, - xfs_inode_t *ip, - struct file *filp, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_getbmap( - bhv_desc_t *bdp, - struct file *filp, - int flags, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_getbmapx( - bhv_desc_t *bdp, - void __user *arg); - -int -xfs_ioctl( - bhv_desc_t *bdp, + struct xfs_inode *ip, struct inode *inode, struct file *filp, int ioflags, unsigned int cmd, void __user *arg) { - int error; - bhv_vnode_t *vp; - xfs_inode_t *ip; - xfs_mount_t *mp; - - vp = vn_from_inode(inode); - - vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - switch (cmd) { - - case XFS_IOC_ALLOCSP: - case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - /* - * Only allow the sys admin to reserve space unless - * unwritten extents are enabled. - */ - if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg); - - case XFS_IOC_DIOINFO: { - struct dioattr da; - xfs_buftarg_t *target = - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; - da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - - if (copy_to_user(arg, &da, sizeof(da))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_FSBULKSTAT_SINGLE: - case XFS_IOC_FSBULKSTAT: - case XFS_IOC_FSINUMBERS: - return xfs_ioc_bulkstat(mp, cmd, arg); - - case XFS_IOC_FSGEOMETRY_V1: - return xfs_ioc_fsgeometry_v1(mp, arg); - - case XFS_IOC_FSGEOMETRY: - return xfs_ioc_fsgeometry(mp, arg); - - case XFS_IOC_GETVERSION: - case XFS_IOC_GETXFLAGS: - case XFS_IOC_SETXFLAGS: - case XFS_IOC_FSGETXATTR: - case XFS_IOC_FSSETXATTR: - case XFS_IOC_FSGETXATTRA: - return xfs_ioc_xattr(vp, ip, filp, cmd, arg); - - case XFS_IOC_FSSETDM: { - struct fsdmidata dmi; - - if (copy_from_user(&dmi, arg, sizeof(dmi))) - return -XFS_ERROR(EFAULT); - - error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate, - NULL); - return -error; - } - - case XFS_IOC_GETBMAP: - case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg); - - case XFS_IOC_GETBMAPX: - return xfs_ioc_getbmapx(bdp, arg); - - case XFS_IOC_FD_TO_HANDLE: - case XFS_IOC_PATH_TO_HANDLE: - case XFS_IOC_PATH_TO_FSHANDLE: - return xfs_find_handle(cmd, arg); - - case XFS_IOC_OPEN_BY_HANDLE: - return xfs_open_by_handle(mp, arg, filp, inode); - - case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(mp, arg, filp, inode); - - case XFS_IOC_READLINK_BY_HANDLE: - return xfs_readlink_by_handle(mp, arg, filp, inode); - - case XFS_IOC_ATTRLIST_BY_HANDLE: - return xfs_attrlist_by_handle(mp, arg, filp, inode); - - case XFS_IOC_ATTRMULTI_BY_HANDLE: - return xfs_attrmulti_by_handle(mp, arg, filp, inode); - - case XFS_IOC_SWAPEXT: { - error = xfs_swapext((struct xfs_swapext __user *)arg); - return -error; - } - - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; - - error = xfs_fs_counts(mp, &out); - if (error) - return -error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - __uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -XFS_ERROR(EFAULT); - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - if (error) - return -error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return -error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); - - return 0; - } - - case XFS_IOC_FSGROWFSDATA: { - xfs_growfs_data_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_data(mp, &in); - return -error; - } - - case XFS_IOC_FSGROWFSLOG: { - xfs_growfs_log_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_log(mp, &in); - return -error; - } - - case XFS_IOC_FSGROWFSRT: { - xfs_growfs_rt_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_rt(mp, &in); - return -error; - } - - case XFS_IOC_FREEZE: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (inode->i_sb->s_frozen == SB_UNFROZEN) - freeze_bdev(inode->i_sb->s_bdev); - return 0; - - case XFS_IOC_THAW: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (inode->i_sb->s_frozen != SB_UNFROZEN) - thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); - return 0; - - case XFS_IOC_GOINGDOWN: { - __uint32_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(in, (__uint32_t __user *)arg)) - return -XFS_ERROR(EFAULT); - - error = xfs_fs_goingdown(mp, in); - return -error; - } - - case XFS_IOC_ERROR_INJECTION: { - xfs_error_injection_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_errortag_add(in.errtag, mp); - return -error; - } - - case XFS_IOC_ERROR_CLEARALL: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_errortag_clearall(mp); - return -error; - - default: - return -ENOTTY; - } -} - -STATIC int -xfs_ioc_space( - bhv_desc_t *bdp, - bhv_vnode_t *vp, - struct file *filp, - int ioflags, - unsigned int cmd, - void __user *arg) -{ xfs_flock64_t bf; int attr_flags = 0; int error; - if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) return -XFS_ERROR(EPERM); if (!(filp->f_mode & FMODE_WRITE)) return -XFS_ERROR(EBADF); - if (!VN_ISREG(vp)) + if (!S_ISREG(inode->i_mode)) return -XFS_ERROR(EINVAL); if (copy_from_user(&bf, arg, sizeof(bf))) return -XFS_ERROR(EFAULT); if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; + attr_flags |= XFS_ATTR_NONBLOCK; if (ioflags & IO_INVIS) - attr_flags |= ATTR_DMI; + attr_flags |= XFS_ATTR_DMI; - error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos, + error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, NULL, attr_flags); return -error; } @@ -1019,24 +726,20 @@ xfs_ioc_bulkstat( if ((count = bulkreq.icount) <= 0) return -XFS_ERROR(EINVAL); + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + if (cmd == XFS_IOC_FSINUMBERS) error = xfs_inumbers(mp, &inlast, &count, - bulkreq.ubuffer); + bulkreq.ubuffer, xfs_inumbers_fmt); else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) error = xfs_bulkstat_single(mp, &inlast, bulkreq.ubuffer, &done); - else { /* XFS_IOC_FSBULKSTAT */ - if (count == 1 && inlast != 0) { - inlast++; - error = xfs_bulkstat_single(mp, &inlast, - bulkreq.ubuffer, &done); - } else { - error = xfs_bulkstat(mp, &inlast, &count, - (bulkstat_one_pf)xfs_bulkstat_one, NULL, - sizeof(xfs_bstat_t), bulkreq.ubuffer, - BULKSTAT_FG_QUICK, &done); - } - } + else /* XFS_IOC_FSBULKSTAT */ + error = xfs_bulkstat(mp, &inlast, &count, + (bulkstat_one_pf)xfs_bulkstat_one, NULL, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + BULKSTAT_FG_QUICK, &done); if (error) return -error; @@ -1090,11 +793,6 @@ xfs_ioc_fsgeometry( /* * Linux extended inode flags interface. */ -#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */ -#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */ -#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */ -#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */ -#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */ STATIC unsigned int xfs_merge_ioc_xflags( @@ -1103,23 +801,23 @@ xfs_merge_ioc_xflags( { unsigned int xflags = start; - if (flags & LINUX_XFLAG_IMMUTABLE) + if (flags & FS_IMMUTABLE_FL) xflags |= XFS_XFLAG_IMMUTABLE; else xflags &= ~XFS_XFLAG_IMMUTABLE; - if (flags & LINUX_XFLAG_APPEND) + if (flags & FS_APPEND_FL) xflags |= XFS_XFLAG_APPEND; else xflags &= ~XFS_XFLAG_APPEND; - if (flags & LINUX_XFLAG_SYNC) + if (flags & FS_SYNC_FL) xflags |= XFS_XFLAG_SYNC; else xflags &= ~XFS_XFLAG_SYNC; - if (flags & LINUX_XFLAG_NOATIME) + if (flags & FS_NOATIME_FL) xflags |= XFS_XFLAG_NOATIME; else xflags &= ~XFS_XFLAG_NOATIME; - if (flags & LINUX_XFLAG_NODUMP) + if (flags & FS_NODUMP_FL) xflags |= XFS_XFLAG_NODUMP; else xflags &= ~XFS_XFLAG_NODUMP; @@ -1134,156 +832,431 @@ xfs_di2lxflags( unsigned int flags = 0; if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= LINUX_XFLAG_IMMUTABLE; + flags |= FS_IMMUTABLE_FL; if (di_flags & XFS_DIFLAG_APPEND) - flags |= LINUX_XFLAG_APPEND; + flags |= FS_APPEND_FL; if (di_flags & XFS_DIFLAG_SYNC) - flags |= LINUX_XFLAG_SYNC; + flags |= FS_SYNC_FL; if (di_flags & XFS_DIFLAG_NOATIME) - flags |= LINUX_XFLAG_NOATIME; + flags |= FS_NOATIME_FL; if (di_flags & XFS_DIFLAG_NODUMP) - flags |= LINUX_XFLAG_NODUMP; + flags |= FS_NODUMP_FL; return flags; } STATIC int -xfs_ioc_xattr( - bhv_vnode_t *vp, +xfs_ioc_fsgetxattr( xfs_inode_t *ip, - struct file *filp, - unsigned int cmd, + int attr, void __user *arg) { struct fsxattr fa; - struct bhv_vattr *vattr; - int error = 0; - int attr_flags; - unsigned int flags; - - vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); - if (unlikely(!vattr)) - return -ENOMEM; - switch (cmd) { - case XFS_IOC_FSGETXATTR: { - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \ - XFS_AT_NEXTENTS | XFS_AT_PROJID; - error = bhv_vop_getattr(vp, vattr, 0, NULL); - if (unlikely(error)) { - error = -error; - break; - } + xfs_ilock(ip, XFS_ILOCK_SHARED); + fa.fsx_xflags = xfs_ip2xflags(ip); + fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_projid = ip->i_d.di_projid; + + if (attr) { + if (ip->i_afp) { + if (ip->i_afp->if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_afp->if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_anextents; + } else + fa.fsx_nextents = 0; + } else { + if (ip->i_df.if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_df.if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_nextents; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); - fa.fsx_xflags = vattr->va_xflags; - fa.fsx_extsize = vattr->va_extsize; - fa.fsx_nextents = vattr->va_nextents; - fa.fsx_projid = vattr->va_projid; + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} - if (copy_to_user(arg, &fa, sizeof(fa))) { - error = -EFAULT; - break; - } - break; +STATIC void +xfs_set_diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + unsigned int di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + if (xflags & XFS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (xflags & XFS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & XFS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; } - case XFS_IOC_FSSETXATTR: { - if (copy_from_user(&fa, arg, sizeof(fa))) { - error = -EFAULT; - break; - } + ip->i_d.di_flags = di_flags; +} + +STATIC void +xfs_diflags_to_linux( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); - attr_flags = 0; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; + if (xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; - vattr->va_xflags = fa.fsx_xflags; - vattr->va_extsize = fa.fsx_extsize; - vattr->va_projid = fa.fsx_projid; +#define FSX_PROJID 1 +#define FSX_EXTSIZE 2 +#define FSX_XFLAGS 4 +#define FSX_NONBLOCK 8 - error = bhv_vop_setattr(vp, vattr, attr_flags, NULL); - if (likely(!error)) - __vn_revalidate(vp, vattr); /* update flags */ - error = -error; - break; +STATIC int +xfs_ioctl_setattr( + xfs_inode_t *ip, + struct fsxattr *fa, + int mask) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int lock_flags = 0; + struct xfs_dquot *udqp = NULL, *gdqp = NULL; + struct xfs_dquot *olddquot = NULL; + int code; + + xfs_itrace_entry(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { + code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid, + ip->i_d.di_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, &gdqp); + if (code) + return code; } - case XFS_IOC_FSGETXATTRA: { - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \ - XFS_AT_ANEXTENTS | XFS_AT_PROJID; - error = bhv_vop_getattr(vp, vattr, 0, NULL); - if (unlikely(error)) { - error = -error; - break; - } + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (code) + goto error_return; - fa.fsx_xflags = vattr->va_xflags; - fa.fsx_extsize = vattr->va_extsize; - fa.fsx_nextents = vattr->va_anextents; - fa.fsx_projid = vattr->va_projid; + lock_flags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_flags); - if (copy_to_user(arg, &fa, sizeof(fa))) { - error = -EFAULT; - break; + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal + * to the file owner ID, except in cases where the + * CAP_FSETID capability is applicable. + */ + if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Do a quota reservation only if projid is actually going to change. + */ + if (mask & FSX_PROJID) { + if (XFS_IS_PQUOTA_ON(mp) && + ip->i_d.di_projid != fa->fsx_projid) { + ASSERT(tp); + code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; } - break; } - case XFS_IOC_GETXFLAGS: { - flags = xfs_di2lxflags(ip->i_d.di_flags); - if (copy_to_user(arg, &flags, sizeof(flags))) - error = -EFAULT; - break; + if (mask & FSX_EXTSIZE) { + /* + * Can't change extent size if any extents are allocated. + */ + if (ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != + fa->fsx_extsize)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * Extent size must be a multiple of the appropriate block + * size, if set at all. + */ + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + + if (XFS_IS_REALTIME_INODE(ip) || + ((mask & FSX_XFLAGS) && + (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { + size = mp->m_sb.sb_rextsize << + mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + } + + if (fa->fsx_extsize % size) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } } - case XFS_IOC_SETXFLAGS: { - if (copy_from_user(&flags, arg, sizeof(flags))) { - error = -EFAULT; - break; + + if (mask & FSX_XFLAGS) { + /* + * Can't change realtime flag if any extents are allocated. + */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + (XFS_IS_REALTIME_INODE(ip)) != + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; } - if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND | \ - LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP | \ - LINUX_XFLAG_SYNC)) { - error = -EOPNOTSUPP; - break; + /* + * If realtime flag is set then must have realtime data. + */ + if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + if ((mp->m_sb.sb_rblocks == 0) || + (mp->m_sb.sb_rextsize == 0) || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } } - attr_flags = 0; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if ((ip->i_d.di_flags & + (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || + (fa->fsx_xflags & + (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + } - vattr->va_mask = XFS_AT_XFLAGS; - vattr->va_xflags = xfs_merge_ioc_xflags(flags, - xfs_ip2xflags(ip)); + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ihold(tp, ip); + + /* + * Change file ownership. Must be the owner or privileged. + * If the system was configured with the "restricted_chown" + * option, the owner is not permitted to give away the file, + * and can change the group id only to a group of which he + * or she is a member. + */ + if (mask & FSX_PROJID) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (ip->i_d.di_projid != fa->fsx_projid) { + if (XFS_IS_PQUOTA_ON(mp)) { + olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_projid = fa->fsx_projid; + + /* + * We may have to rev the inode as well as + * the superblock version number since projids didn't + * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. + */ + if (ip->i_d.di_version == XFS_DINODE_VERSION_1) + xfs_bump_ino_vers2(tp, ip); + } - error = bhv_vop_setattr(vp, vattr, attr_flags, NULL); - if (likely(!error)) - __vn_revalidate(vp, vattr); /* update flags */ - error = -error; - break; } - case XFS_IOC_GETVERSION: { - flags = vn_to_inode(vp)->i_generation; - if (copy_to_user(arg, &flags, sizeof(flags))) - error = -EFAULT; - break; + if (mask & FSX_EXTSIZE) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + if (mask & FSX_XFLAGS) { + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); } - default: - error = -ENOTTY; - break; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesystems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + code = xfs_trans_commit(tp, 0); + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + XFS_QM_DQRELE(mp, olddquot); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + if (code) + return code; + + if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) { + XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, + (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0); } - kfree(vattr); - return error; + return 0; + + error_return: + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + xfs_trans_cancel(tp, 0); + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return code; } STATIC int -xfs_ioc_getbmap( - bhv_desc_t *bdp, +xfs_ioc_fssetxattr( + xfs_inode_t *ip, struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int mask; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + + return -xfs_ioctl_setattr(ip, &fa, mask); +} + +STATIC int +xfs_ioc_getxflags( + xfs_inode_t *ip, + void __user *arg) +{ + unsigned int flags; + + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_setxflags( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int flags; + unsigned int mask; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) + return -EOPNOTSUPP; + + mask = FSX_XFLAGS; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + + return -xfs_ioctl_setattr(ip, &fa, mask); +} + +STATIC int +xfs_ioc_getbmap( + struct xfs_inode *ip, int ioflags, unsigned int cmd, void __user *arg) @@ -1302,7 +1275,7 @@ xfs_ioc_getbmap( if (ioflags & IO_INVIS) iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags); + error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags); if (error) return -error; @@ -1313,7 +1286,7 @@ xfs_ioc_getbmap( STATIC int xfs_ioc_getbmapx( - bhv_desc_t *bdp, + struct xfs_inode *ip, void __user *arg) { struct getbmapx bmx; @@ -1340,7 +1313,7 @@ xfs_ioc_getbmapx( iflags |= BMV_IF_EXTENDED; - error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags); + error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags); if (error) return -error; @@ -1351,3 +1324,259 @@ xfs_ioc_getbmapx( return 0; } + +int +xfs_ioctl( + xfs_inode_t *ip, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + xfs_mount_t *mp = ip->i_mount; + int error; + + xfs_itrace_entry(XFS_I(inode)); + switch (cmd) { + + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&mp->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg); + + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); + + if (copy_to_user(arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + + case XFS_IOC_FSGETXATTR: + return xfs_ioc_fsgetxattr(ip, 0, arg); + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, + dmi.fsd_dmstate); + return -error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(ip, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: + return xfs_find_handle(cmd, arg); + + case XFS_IOC_OPEN_BY_HANDLE: + return xfs_open_by_handle(mp, arg, filp, inode); + + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(mp, arg, inode); + + case XFS_IOC_READLINK_BY_HANDLE: + return xfs_readlink_by_handle(mp, arg, inode); + + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(mp, arg, inode); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(mp, arg, filp, inode); + + case XFS_IOC_SWAPEXT: { + error = xfs_swapext((struct xfs_swapext __user *)arg); + return -error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + if (error) + return -error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_data(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_log(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_rt(mp, &in); + return -error; + } + + case XFS_IOC_FREEZE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (inode->i_sb->s_frozen == SB_UNFROZEN) + freeze_bdev(inode->i_sb->s_bdev); + return 0; + + case XFS_IOC_THAW: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (inode->i_sb->s_frozen != SB_UNFROZEN) + thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); + return 0; + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_goingdown(mp, in); + return -error; + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + return -error; + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_errortag_clearall(mp, 1); + return -error; + + default: + return -ENOTTY; + } +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 270db0f..a4b254e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -23,17 +23,35 @@ #include <linux/fs.h> #include <asm/uaccess.h> #include "xfs.h" -#include "xfs_types.h" #include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir2_sf.h" #include "xfs_vfs.h" #include "xfs_vnode.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_error.h" #include "xfs_dfrag.h" +#include "xfs_vnodeops.h" +#include "xfs_ioctl32.h" #define _NATIVE_IOC(cmd, type) \ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) #define BROKEN_X86_ALIGNMENT +#define _PACKED __attribute__((packed)) /* on ia32 l_start is on a 32-bit boundary */ typedef struct xfs_flock64_32 { __s16 l_type; @@ -75,35 +93,279 @@ xfs_ioctl32_flock( return (unsigned long)p; } +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR ('X', 100, struct compat_xfs_fsop_geom_v1) + +STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg) +{ + compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg; + xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p)); + + if (copy_in_user(p, p32, sizeof(*p32))) + return -EFAULT; + return (unsigned long)p; +} + +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +STATIC int xfs_inumbers_fmt_compat( + void __user *ubuffer, + const xfs_inogrp_t *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t __user *p32 = ubuffer; + long i; + + for (i = 0; i < count; i++) { + if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || + put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || + put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) + return -EFAULT; + } + *written = count * sizeof(*p32); + return 0; +} + #else -typedef struct xfs_fsop_bulkreq32 { +#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#define _PACKED + +#endif + +/* XFS_IOC_FSBULKSTAT and friends */ + +typedef struct compat_xfs_bstime { + __s32 tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +STATIC int xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) +{ + __s32 sec32; + + sec32 = p->tv_sec; + if (put_user(sec32, &p32->tv_sec) || + put_user(p->tv_nsec, &p32->tv_nsec)) + return -EFAULT; + return 0; +} + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid; /* project id */ + unsigned char bs_pad[14]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} _PACKED compat_xfs_bstat_t; + +STATIC int xfs_bulkstat_one_fmt_compat( + void __user *ubuffer, + const xfs_bstat_t *buffer) +{ + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || + xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || + xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || + xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_aextents, &p32->bs_aextents)) + return -EFAULT; + return sizeof(*p32); +} + + + +typedef struct compat_xfs_fsop_bulkreq { compat_uptr_t lastip; /* last inode # pointer */ __s32 icount; /* count of entries in buffer */ compat_uptr_t ubuffer; /* user buffer for inode desc. */ - __s32 ocount; /* output count pointer */ -} xfs_fsop_bulkreq32_t; + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; -STATIC unsigned long -xfs_ioctl32_bulkstat( - unsigned long arg) +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +/* copied from xfs_ioctl.c */ +STATIC int +xfs_ioc_bulkstat_compat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) { - xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg; - xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p)); + compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg; u32 addr; + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); - if (get_user(addr, &p32->lastip) || - put_user(compat_ptr(addr), &p->lastip) || - copy_in_user(&p->icount, &p32->icount, sizeof(s32)) || - get_user(addr, &p32->ubuffer) || - put_user(compat_ptr(addr), &p->ubuffer) || - get_user(addr, &p32->ocount) || - put_user(compat_ptr(addr), &p->ocount)) + if (get_user(addr, &p32->lastip)) + return -EFAULT; + bulkreq.lastip = compat_ptr(addr); + if (get_user(bulkreq.icount, &p32->icount) || + get_user(addr, &p32->ubuffer)) + return -EFAULT; + bulkreq.ubuffer = compat_ptr(addr); + if (get_user(addr, &p32->ocount)) + return -EFAULT; + bulkreq.ocount = compat_ptr(addr); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt_compat); + else { + /* declare a var to get a warning in case the type changes */ + bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat; + error = xfs_bulkstat(mp, &inlast, &count, + xfs_bulkstat_one, formatter, + sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, + BULKSTAT_FG_QUICK, &done); + } + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + + + +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg) +{ + compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg; + xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p)); + u32 addr; + + if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) || + get_user(addr, &p32->path) || + put_user(compat_ptr(addr), &p->path) || + copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) || + get_user(addr, &p32->ihandle) || + put_user(compat_ptr(addr), &p->ihandle) || + copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) || + get_user(addr, &p32->ohandle) || + put_user(compat_ptr(addr), &p->ohandle) || + get_user(addr, &p32->ohandlen) || + put_user(compat_ptr(addr), &p->ohandlen)) return -EFAULT; return (unsigned long)p; } -#endif + STATIC long xfs_compat_ioctl( @@ -112,17 +374,12 @@ xfs_compat_ioctl( unsigned cmd, unsigned long arg) { - struct inode *inode = file->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); + struct inode *inode = file->f_path.dentry->d_inode; int error; switch (cmd) { case XFS_IOC_DIOINFO: - case XFS_IOC_FSGEOMETRY_V1: case XFS_IOC_FSGEOMETRY: - case XFS_IOC_GETVERSION: - case XFS_IOC_GETXFLAGS: - case XFS_IOC_SETXFLAGS: case XFS_IOC_FSGETXATTR: case XFS_IOC_FSSETXATTR: case XFS_IOC_FSGETXATTRA: @@ -131,12 +388,7 @@ xfs_compat_ioctl( case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: /* not handled - case XFS_IOC_FD_TO_HANDLE: - case XFS_IOC_PATH_TO_HANDLE: - case XFS_IOC_PATH_TO_FSHANDLE: - case XFS_IOC_OPEN_BY_HANDLE: case XFS_IOC_FSSETDM_BY_HANDLE: - case XFS_IOC_READLINK_BY_HANDLE: case XFS_IOC_ATTRLIST_BY_HANDLE: case XFS_IOC_ATTRMULTI_BY_HANDLE: */ @@ -153,6 +405,11 @@ xfs_compat_ioctl( case XFS_IOC_ERROR_CLEARALL: break; + case XFS_IOC32_GETXFLAGS: + case XFS_IOC32_SETXFLAGS: + case XFS_IOC32_GETVERSION: + cmd = _NATIVE_IOC(cmd, long); + break; #ifdef BROKEN_X86_ALIGNMENT /* xfs_flock_t has wrong u32 vs u64 alignment */ case XFS_IOC_ALLOCSP_32: @@ -166,6 +423,10 @@ xfs_compat_ioctl( arg = xfs_ioctl32_flock(arg); cmd = _NATIVE_IOC(cmd, struct xfs_flock64); break; + case XFS_IOC_FSGEOMETRY_V1_32: + arg = xfs_ioctl32_geom_v1(arg); + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1); + break; #else /* These are handled fine if no alignment issues */ case XFS_IOC_ALLOCSP: @@ -176,24 +437,34 @@ xfs_compat_ioctl( case XFS_IOC_FREESP64: case XFS_IOC_RESVSP64: case XFS_IOC_UNRESVSP64: + case XFS_IOC_FSGEOMETRY_V1: break; /* xfs_bstat_t still has wrong u32 vs u64 alignment */ case XFS_IOC_SWAPEXT: break; - case XFS_IOC_FSBULKSTAT_SINGLE: - case XFS_IOC_FSBULKSTAT: - case XFS_IOC_FSINUMBERS: - arg = xfs_ioctl32_bulkstat(arg); - break; #endif + case XFS_IOC_FSBULKSTAT_32: + case XFS_IOC_FSBULKSTAT_SINGLE_32: + case XFS_IOC_FSINUMBERS_32: + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq); + return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount, + cmd, (void __user*)arg); + case XFS_IOC_FD_TO_HANDLE_32: + case XFS_IOC_PATH_TO_HANDLE_32: + case XFS_IOC_PATH_TO_FSHANDLE_32: + case XFS_IOC_OPEN_BY_HANDLE_32: + case XFS_IOC_READLINK_BY_HANDLE_32: + arg = xfs_ioctl32_fshandle(arg); + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); + break; default: return -ENOIOCTLCMD; } - error = bhv_vop_ioctl(vp, inode, file, mode, cmd, (void __user *)arg); - VMODIFY(vp); + error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg); + xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); return error; } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index d918002..095d271 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -43,32 +43,16 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" +#include "xfs_vnodeops.h" #include <linux/capability.h> #include <linux/xattr.h> #include <linux/namei.h> #include <linux/security.h> - -/* - * Get a XFS inode from a given vnode. - */ -xfs_inode_t * -xfs_vtoi( - bhv_vnode_t *vp) -{ - bhv_desc_t *bdp; - - bdp = bhv_lookup_range(VN_BHV_HEAD(vp), - VNODE_POSITION_XFS, VNODE_POSITION_XFS); - if (unlikely(bdp == NULL)) - return NULL; - return XFS_BHVTOI(bdp); -} +#include <linux/falloc.h> /* * Bring the atime in the XFS inode uptodate. @@ -78,50 +62,58 @@ void xfs_synchronize_atime( xfs_inode_t *ip) { - bhv_vnode_t *vp; + struct inode *inode = VFS_I(ip); - vp = XFS_ITOV_NULL(ip); - if (vp) { - struct inode *inode = &vp->v_inode; + if (inode) { ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; } } /* + * If the linux inode exists, mark it dirty. + * Used when commiting a dirty inode into a transaction so that + * the inode will get written back by the linux code + */ +void +xfs_mark_inode_dirty_sync( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (inode) + mark_inode_dirty_sync(inode); +} + +/* * Change the requested timestamp in the given inode. * We don't lock across timestamp updates, and we don't log them but * we do record the fact that there is dirty information in core. - * - * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG - * with XFS_ICHGTIME_ACC to be sure that access time - * update will take. Calling first with XFS_ICHGTIME_ACC - * and then XFS_ICHGTIME_MOD may fail to modify the access - * timestamp if the filesystem is mounted noacctm. */ void xfs_ichgtime( xfs_inode_t *ip, int flags) { - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); + struct inode *inode = VFS_I(ip); timespec_t tv; + int sync_it = 0; - nanotime(&tv); - if (flags & XFS_ICHGTIME_MOD) { + tv = current_fs_time(inode->i_sb); + + if ((flags & XFS_ICHGTIME_MOD) && + !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + sync_it = 1; } - if (flags & XFS_ICHGTIME_ACC) { - inode->i_atime = tv; - ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec; - } - if (flags & XFS_ICHGTIME_CHG) { + if ((flags & XFS_ICHGTIME_CHG) && + !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; + sync_it = 1; } /* @@ -133,81 +125,10 @@ xfs_ichgtime( * ensure that the compiler does not reorder the update * of i_update_core above the timestamp updates above. */ - SYNCHRONIZE(); - ip->i_update_core = 1; - if (!(inode->i_state & I_LOCK)) + if (sync_it) { + SYNCHRONIZE(); + ip->i_update_core = 1; mark_inode_dirty_sync(inode); -} - -/* - * Variant on the above which avoids querying the system clock - * in situations where we know the Linux inode timestamps have - * just been updated (and so we can update our inode cheaply). - */ -void -xfs_ichgtime_fast( - xfs_inode_t *ip, - struct inode *inode, - int flags) -{ - timespec_t *tvp; - - /* - * Atime updates for read() & friends are handled lazily now, and - * explicit updates must go through xfs_ichgtime() - */ - ASSERT((flags & XFS_ICHGTIME_ACC) == 0); - - /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. - */ - if (unlikely(IS_RDONLY(inode))) - return; - - if (flags & XFS_ICHGTIME_MOD) { - tvp = &inode->i_mtime; - ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec; - } - if (flags & XFS_ICHGTIME_CHG) { - tvp = &inode->i_ctime; - ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec; - } - - /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. - */ - SYNCHRONIZE(); - ip->i_update_core = 1; - if (!(inode->i_state & I_LOCK)) - mark_inode_dirty_sync(inode); -} - - -/* - * Pull the link count and size up from the xfs inode to the linux inode - */ -STATIC void -xfs_validate_fields( - struct inode *ip, - bhv_vattr_t *vattr) -{ - vattr->va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS; - if (!bhv_vop_getattr(vn_from_inode(ip), vattr, ATTR_LAZY, NULL)) { - ip->i_nlink = vattr->va_nlink; - ip->i_blocks = vattr->va_nblocks; - - /* we're under i_sem so i_size can't change under us */ - if (i_size_read(ip) != vattr->va_size) - i_size_write(ip, vattr->va_size); } } @@ -219,65 +140,58 @@ xfs_validate_fields( */ STATIC int xfs_init_security( - bhv_vnode_t *vp, + struct inode *inode, struct inode *dir) { - struct inode *ip = vn_to_inode(vp); + struct xfs_inode *ip = XFS_I(inode); size_t length; void *value; char *name; int error; - error = security_inode_init_security(ip, dir, &name, &value, &length); + error = security_inode_init_security(inode, dir, &name, + &value, &length); if (error) { if (error == -EOPNOTSUPP) return 0; return -error; } - error = bhv_vop_attr_set(vp, name, value, length, ATTR_SECURE, NULL); + error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); if (!error) - VMODIFY(vp); + xfs_iflags_set(ip, XFS_IMODIFIED); kfree(name); kfree(value); return error; } -/* - * Determine whether a process has a valid fs_struct (kernel daemons - * like knfsd don't have an fs_struct). - * - * XXX(hch): nfsd is broken, better fix it instead. - */ -STATIC inline int -xfs_has_fs_struct(struct task_struct *task) +static void +xfs_dentry_to_name( + struct xfs_name *namep, + struct dentry *dentry) { - return (task->fs != init_task.fs); + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; } -STATIC inline void +STATIC void xfs_cleanup_inode( - bhv_vnode_t *dvp, - bhv_vnode_t *vp, - struct dentry *dentry, - int mode) + struct inode *dir, + struct inode *inode, + struct dentry *dentry) { - struct dentry teardown = {}; + struct xfs_name teardown; /* Oh, the horror. * If we can't add the ACL or we fail in * xfs_init_security we must back out. * ENOSPC can hit here, among other things. */ - teardown.d_inode = vn_to_inode(vp); - teardown.d_name = dentry->d_name; + xfs_dentry_to_name(&teardown, dentry); - if (S_ISDIR(mode)) - bhv_vop_rmdir(dvp, &teardown, NULL); - else - bhv_vop_remove(dvp, &teardown, NULL); - VN_RELE(vp); + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); + iput(inode); } STATIC int @@ -287,11 +201,11 @@ xfs_vn_mknod( int mode, dev_t rdev) { - struct inode *ip; - bhv_vattr_t vattr = { 0 }; - bhv_vnode_t *vp = NULL, *dvp = vn_from_inode(dir); + struct inode *inode; + struct xfs_inode *ip = NULL; xfs_acl_t *default_acl = NULL; - attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; + struct xfs_name name; + int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS; int error; /* @@ -301,66 +215,64 @@ xfs_vn_mknod( if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; - if (unlikely(test_default_acl && test_default_acl(dvp))) { + if (test_default_acl && test_default_acl(dir)) { if (!_ACL_ALLOC(default_acl)) { return -ENOMEM; } - if (!_ACL_GET_DEFAULT(dvp, default_acl)) { + if (!_ACL_GET_DEFAULT(dir, default_acl)) { _ACL_FREE(default_acl); default_acl = NULL; } } - if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current)) - mode &= ~current->fs->umask; + xfs_dentry_to_name(&name, dentry); - vattr.va_mask = XFS_AT_TYPE|XFS_AT_MODE; - vattr.va_mode = mode; + if (IS_POSIXACL(dir) && !default_acl) + mode &= ~current->fs->umask; switch (mode & S_IFMT) { - case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - vattr.va_rdev = sysv_encode_dev(rdev); - vattr.va_mask |= XFS_AT_RDEV; - /*FALLTHROUGH*/ + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + rdev = sysv_encode_dev(rdev); case S_IFREG: - error = bhv_vop_create(dvp, dentry, &vattr, &vp, NULL); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); break; case S_IFDIR: - error = bhv_vop_mkdir(dvp, dentry, &vattr, &vp, NULL); + error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL); break; default: error = EINVAL; break; } - if (unlikely(!error)) { - error = xfs_init_security(vp, dir); - if (error) - xfs_cleanup_inode(dvp, vp, dentry, mode); - } + if (unlikely(error)) + goto out_free_acl; - if (unlikely(default_acl)) { - if (!error) { - error = _ACL_INHERIT(vp, &vattr, default_acl); - if (!error) - VMODIFY(vp); - else - xfs_cleanup_inode(dvp, vp, dentry, mode); - } + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir); + if (unlikely(error)) + goto out_cleanup_inode; + + if (default_acl) { + error = _ACL_INHERIT(inode, mode, default_acl); + if (unlikely(error)) + goto out_cleanup_inode; + xfs_iflags_set(ip, XFS_IMODIFIED); _ACL_FREE(default_acl); } - if (likely(!error)) { - ASSERT(vp); - ip = vn_to_inode(vp); - if (S_ISCHR(mode) || S_ISBLK(mode)) - ip->i_rdev = rdev; - else if (S_ISDIR(mode)) - xfs_validate_fields(ip, &vattr); - d_instantiate(dentry, ip); - xfs_validate_fields(dir, &vattr); - } + d_instantiate(dentry, inode); + return -error; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry); + out_free_acl: + if (default_acl) + _ACL_FREE(default_acl); return -error; } @@ -389,13 +301,15 @@ xfs_vn_lookup( struct dentry *dentry, struct nameidata *nd) { - bhv_vnode_t *vp = vn_from_inode(dir), *cvp; + struct xfs_inode *cip; + struct xfs_name name; int error; if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - error = bhv_vop_lookup(vp, dentry, &cvp, 0, NULL, NULL); + xfs_dentry_to_name(&name, dentry); + error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { if (unlikely(error != ENOENT)) return ERR_PTR(-error); @@ -403,7 +317,47 @@ xfs_vn_lookup( return NULL; } - return d_splice_alias(vn_to_inode(cvp), dentry); + return d_splice_alias(VFS_I(cip), dentry); +} + +STATIC struct dentry * +xfs_vn_ci_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct xfs_inode *ip; + struct xfs_name xname; + struct xfs_name ci_name; + struct qstr dname; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&xname, dentry); + error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + /* + * call d_add(dentry, NULL) here when d_drop_negative_children + * is called in xfs_vn_mknod (ie. allow negative dentries + * with CI filesystems). + */ + return NULL; + } + + /* if exact match, just splice and exit */ + if (!ci_name.name) + return d_splice_alias(VFS_I(ip), dentry); + + /* else case-insensitive match... */ + dname.name = ci_name.name; + dname.len = ci_name.len; + dentry = d_add_ci(dentry, VFS_I(ip), &dname); + kmem_free(ci_name.name); + return dentry; } STATIC int @@ -412,26 +366,23 @@ xfs_vn_link( struct inode *dir, struct dentry *dentry) { - struct inode *ip; /* inode of guy being linked to */ - bhv_vnode_t *tdvp; /* target directory for new name/link */ - bhv_vnode_t *vp; /* vp of name being linked */ - bhv_vattr_t vattr; + struct inode *inode; /* inode of guy being linked to */ + struct xfs_name name; int error; - ip = old_dentry->d_inode; /* inode being linked to */ - tdvp = vn_from_inode(dir); - vp = vn_from_inode(ip); + inode = old_dentry->d_inode; + xfs_dentry_to_name(&name, dentry); - VN_HOLD(vp); - error = bhv_vop_link(tdvp, vp, dentry, NULL); + igrab(inode); + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) { - VN_RELE(vp); - } else { - VMODIFY(tdvp); - xfs_validate_fields(ip, &vattr); - d_instantiate(dentry, ip); + iput(inode); + return -error; } - return -error; + + xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); + d_instantiate(dentry, inode); + return 0; } STATIC int @@ -439,20 +390,23 @@ xfs_vn_unlink( struct inode *dir, struct dentry *dentry) { - struct inode *inode; - bhv_vnode_t *dvp; /* directory containing name to remove */ - bhv_vattr_t vattr; + struct xfs_name name; int error; - inode = dentry->d_inode; - dvp = vn_from_inode(dir); + xfs_dentry_to_name(&name, dentry); - error = bhv_vop_remove(dvp, dentry, NULL); - if (likely(!error)) { - xfs_validate_fields(dir, &vattr); /* size needs update */ - xfs_validate_fields(inode, &vattr); - } - return -error; + error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + if (error) + return error; + + /* + * With unlink, the VFS makes the dentry "negative": no inode, + * but still hashed. This is incompatible with case-insensitive + * mode, so invalidate (unhash) the dentry in CI-mode. + */ + if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + d_invalidate(dentry); + return 0; } STATIC int @@ -461,49 +415,32 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *ip; - bhv_vattr_t va = { 0 }; - bhv_vnode_t *dvp; /* directory containing name of symlink */ - bhv_vnode_t *cvp; /* used to lookup symlink to put in dentry */ + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; int error; + mode_t mode; - dvp = vn_from_inode(dir); - cvp = NULL; - - va.va_mode = S_IFLNK | + mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); - va.va_mask = XFS_AT_TYPE|XFS_AT_MODE; - - error = bhv_vop_symlink(dvp, dentry, &va, (char *)symname, &cvp, NULL); - if (likely(!error && cvp)) { - error = xfs_init_security(cvp, dir); - if (likely(!error)) { - ip = vn_to_inode(cvp); - d_instantiate(dentry, ip); - xfs_validate_fields(dir, &va); - xfs_validate_fields(ip, &va); - } else { - xfs_cleanup_inode(dvp, cvp, dentry, 0); - } - } - return -error; -} + xfs_dentry_to_name(&name, dentry); -STATIC int -xfs_vn_rmdir( - struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - bhv_vnode_t *dvp = vn_from_inode(dir); - bhv_vattr_t vattr; - int error; + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); + if (unlikely(error)) + goto out; - error = bhv_vop_rmdir(dvp, dentry, NULL); - if (likely(!error)) { - xfs_validate_fields(inode, &vattr); - xfs_validate_fields(dir, &vattr); - } + inode = VFS_I(cip); + + error = xfs_init_security(inode, dir); + if (unlikely(error)) + goto out_cleanup_inode; + + d_instantiate(dentry, inode); + return 0; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry); + out: return -error; } @@ -515,23 +452,15 @@ xfs_vn_rename( struct dentry *ndentry) { struct inode *new_inode = ndentry->d_inode; - bhv_vnode_t *fvp; /* from directory */ - bhv_vnode_t *tvp; /* target directory */ - bhv_vattr_t vattr; - int error; + struct xfs_name oname; + struct xfs_name nname; - fvp = vn_from_inode(odir); - tvp = vn_from_inode(ndir); + xfs_dentry_to_name(&oname, odentry); + xfs_dentry_to_name(&nname, ndentry); - error = bhv_vop_rename(fvp, odentry, tvp, ndentry, NULL); - if (likely(!error)) { - if (new_inode) - xfs_validate_fields(new_inode, &vattr); - xfs_validate_fields(odir, &vattr); - if (ndir != odir) - xfs_validate_fields(ndir, &vattr); - } - return -error; + return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + XFS_I(ndir), &nname, new_inode ? + XFS_I(new_inode) : NULL); } /* @@ -544,50 +473,25 @@ xfs_vn_follow_link( struct dentry *dentry, struct nameidata *nd) { - bhv_vnode_t *vp; - uio_t *uio; - iovec_t iov; - int error; char *link; + int error = -ENOMEM; - ASSERT(dentry); - ASSERT(nd); + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) + goto out_err; - link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) { - nd_set_link(nd, ERR_PTR(-ENOMEM)); - return NULL; - } - - uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL); - if (!uio) { - kfree(link); - nd_set_link(nd, ERR_PTR(-ENOMEM)); - return NULL; - } - - vp = vn_from_inode(dentry->d_inode); - - iov.iov_base = link; - iov.iov_len = MAXPATHLEN; - - uio->uio_iov = &iov; - uio->uio_offset = 0; - uio->uio_segflg = UIO_SYSSPACE; - uio->uio_resid = MAXPATHLEN; - uio->uio_iovcnt = 1; - - error = bhv_vop_readlink(vp, uio, 0, NULL); - if (unlikely(error)) { - kfree(link); - link = ERR_PTR(-error); - } else { - link[MAXPATHLEN - uio->uio_resid] = '\0'; - } - kfree(uio); + error = -xfs_readlink(XFS_I(dentry->d_inode), link); + if (unlikely(error)) + goto out_kfree; nd_set_link(nd, link); return NULL; + + out_kfree: + kfree(link); + out_err: + nd_set_link(nd, ERR_PTR(error)); + return NULL; } STATIC void @@ -604,12 +508,30 @@ xfs_vn_put_link( #ifdef CONFIG_XFS_POSIX_ACL STATIC int +xfs_check_acl( + struct inode *inode, + int mask) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + xfs_itrace_entry(ip); + + if (XFS_IFORK_Q(ip)) { + error = xfs_acl_iaccess(ip, mask, NULL); + if (error != -1) + return -error; + } + + return -EAGAIN; +} + +STATIC int xfs_vn_permission( - struct inode *inode, - int mode, - struct nameidata *nd) + struct inode *inode, + int mask) { - return -bhv_vop_access(vn_from_inode(inode), mode << 6, NULL); + return generic_permission(inode, mask, xfs_check_acl); } #else #define xfs_vn_permission NULL @@ -617,228 +539,294 @@ xfs_vn_permission( STATIC int xfs_vn_getattr( - struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *stat) + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) { - struct inode *inode = dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - int error = 0; + struct inode *inode = dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + xfs_itrace_entry(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + stat->size = XFS_ISIZE(ip); + stat->dev = inode->i_sb->s_dev; + stat->mode = ip->i_d.di_mode; + stat->nlink = ip->i_d.di_nlink; + stat->uid = ip->i_d.di_uid; + stat->gid = ip->i_d.di_gid; + stat->ino = ip->i_ino; +#if XFS_BIG_INUMS + stat->ino += mp->m_inoadd; +#endif + stat->atime = inode->i_atime; + stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; + stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec; + stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + stat->blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + stat->blksize = BLKDEV_IOSIZE; + stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * If the file blocks are being allocated from a + * realtime volume, then return the inode's realtime + * extent size or the realtime volume's extent size. + */ + stat->blksize = + xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + } else + stat->blksize = xfs_preferred_iosize(mp); + stat->rdev = 0; + break; + } - if (unlikely(vp->v_flag & VMODIFIED)) - error = vn_revalidate(vp); - if (!error) - generic_fillattr(inode, stat); - return -error; + return 0; } STATIC int xfs_vn_setattr( struct dentry *dentry, - struct iattr *attr) + struct iattr *iattr) { - struct inode *inode = dentry->d_inode; - unsigned int ia_valid = attr->ia_valid; - bhv_vnode_t *vp = vn_from_inode(inode); - bhv_vattr_t vattr = { 0 }; - int flags = 0; - int error; - - if (ia_valid & ATTR_UID) { - vattr.va_mask |= XFS_AT_UID; - vattr.va_uid = attr->ia_uid; - } - if (ia_valid & ATTR_GID) { - vattr.va_mask |= XFS_AT_GID; - vattr.va_gid = attr->ia_gid; - } - if (ia_valid & ATTR_SIZE) { - vattr.va_mask |= XFS_AT_SIZE; - vattr.va_size = attr->ia_size; - } - if (ia_valid & ATTR_ATIME) { - vattr.va_mask |= XFS_AT_ATIME; - vattr.va_atime = attr->ia_atime; - inode->i_atime = attr->ia_atime; - } - if (ia_valid & ATTR_MTIME) { - vattr.va_mask |= XFS_AT_MTIME; - vattr.va_mtime = attr->ia_mtime; - } - if (ia_valid & ATTR_CTIME) { - vattr.va_mask |= XFS_AT_CTIME; - vattr.va_ctime = attr->ia_ctime; - } - if (ia_valid & ATTR_MODE) { - vattr.va_mask |= XFS_AT_MODE; - vattr.va_mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - inode->i_mode &= ~S_ISGID; - } - - if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) - flags |= ATTR_UTIME; -#ifdef ATTR_NO_BLOCK - if ((ia_valid & ATTR_NO_BLOCK)) - flags |= ATTR_NONBLOCK; -#endif - - error = bhv_vop_setattr(vp, &vattr, flags, NULL); - if (likely(!error)) - __vn_revalidate(vp, &vattr); - return -error; + return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); } +/* + * block_truncate_page can return an error, but we can't propagate it + * at all here. Leave a complaint + stack trace in the syslog because + * this could be bad. If it is bad, we need to propagate the error further. + */ STATIC void xfs_vn_truncate( struct inode *inode) { - block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks); -} - -STATIC int -xfs_vn_setxattr( - struct dentry *dentry, - const char *name, - const void *data, - size_t size, - int flags) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - int error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (flags & XATTR_CREATE) - xflags |= ATTR_CREATE; - if (flags & XATTR_REPLACE) - xflags |= ATTR_REPLACE; - xflags |= namesp->attr_flag; - return namesp->attr_set(vp, attr, (void *)data, size, xflags); + int error; + error = block_truncate_page(inode->i_mapping, inode->i_size, + xfs_get_blocks); + WARN_ON(error); } -STATIC ssize_t -xfs_vn_getxattr( - struct dentry *dentry, - const char *name, - void *data, - size_t size) +STATIC long +xfs_vn_fallocate( + struct inode *inode, + int mode, + loff_t offset, + loff_t len) { - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - ssize_t error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (!size) { - xflags |= ATTR_KERNOVAL; - data = NULL; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + + /* preallocation on directories not yet supported */ + error = -ENODEV; + if (S_ISDIR(inode->i_mode)) + goto out_error; + + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, + 0, NULL, XFS_ATTR_NOLOCK); + if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) + new_size = offset + len; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); } - xflags |= namesp->attr_flag; - return namesp->attr_get(vp, attr, (void *)data, size, xflags); -} - -STATIC ssize_t -xfs_vn_listxattr( - struct dentry *dentry, - char *data, - size_t size) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - int error, xflags = ATTR_KERNAMELS; - ssize_t result; - - if (!size) - xflags |= ATTR_KERNOVAL; - xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS; - - error = attr_generic_list(vp, data, size, xflags, &result); - if (error < 0) - return error; - return result; -} -STATIC int -xfs_vn_removexattr( - struct dentry *dentry, - const char *name) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - int error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - xflags |= namesp->attr_flag; - return namesp->attr_remove(vp, attr, xflags); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); +out_error: + return error; } - -struct inode_operations xfs_inode_operations = { +static const struct inode_operations xfs_inode_operations = { .permission = xfs_vn_permission, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, + .fallocate = xfs_vn_fallocate, }; -struct inode_operations xfs_dir_inode_operations = { +static const struct inode_operations xfs_dir_inode_operations = { .create = xfs_vn_create, .lookup = xfs_vn_lookup, .link = xfs_vn_link, .unlink = xfs_vn_unlink, .symlink = xfs_vn_symlink, .mkdir = xfs_vn_mkdir, - .rmdir = xfs_vn_rmdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .permission = xfs_vn_permission, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +static const struct inode_operations xfs_dir_ci_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_ci_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .permission = xfs_vn_permission, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, }; -struct inode_operations xfs_symlink_inode_operations = { +static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, .permission = xfs_vn_permission, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, }; + +STATIC void +xfs_diflags_to_iflags( + struct inode *inode, + struct xfs_inode *ip) +{ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Initialize the Linux inode, set up the operation vectors and + * unlock the inode. + * + * When reading existing inodes from disk this is called directly + * from xfs_iget, when creating a new inode it is called from + * xfs_ialloc after setting up the inode. + */ +void +xfs_setup_inode( + struct xfs_inode *ip) +{ + struct inode *inode = ip->i_vnode; + + inode->i_mode = ip->i_d.di_mode; + inode->i_nlink = ip->i_d.di_nlink; + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = + MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + inode->i_rdev = 0; + break; + } + + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + xfs_diflags_to_iflags(inode, ip); + xfs_iflags_clear(ip, XFS_IMODIFIED); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + inode->i_op = &xfs_symlink_inode_operations; + if (!(ip->i_df.if_flags & XFS_IFINLINE)) + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } + + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + + unlock_new_inode(inode); +} diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h index ad6173d..8b1a1e3 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -18,19 +18,14 @@ #ifndef __XFS_IOPS_H__ #define __XFS_IOPS_H__ -extern struct inode_operations xfs_inode_operations; -extern struct inode_operations xfs_dir_inode_operations; -extern struct inode_operations xfs_symlink_inode_operations; +struct xfs_inode; extern const struct file_operations xfs_file_operations; extern const struct file_operations xfs_dir_file_operations; extern const struct file_operations xfs_invis_file_operations; -extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *, - int, unsigned int, void __user *); +extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -struct xfs_inode; -extern void xfs_ichgtime(struct xfs_inode *, int); -extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int); +extern void xfs_setup_inode(struct xfs_inode *); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 484dba1..cc0f7b3 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -43,17 +43,15 @@ #include <kmem.h> #include <mrlock.h> -#include <spin.h> #include <sv.h> #include <mutex.h> -#include <sema.h> #include <time.h> #include <support/ktrace.h> #include <support/debug.h> -#include <support/move.h> #include <support/uuid.h> +#include <linux/semaphore.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/blkdev.h> @@ -75,6 +73,10 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/delay.h> +#include <linux/log2.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/ctype.h> #include <asm/page.h> #include <asm/div64.h> @@ -83,7 +85,6 @@ #include <asm/byteorder.h> #include <asm/unaligned.h> -#include <xfs_behavior.h> #include <xfs_vfs.h> #include <xfs_cred.h> #include <xfs_vnode.h> @@ -100,9 +101,6 @@ /* * Feature macros (disable/enable) */ -#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ -#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ -#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else @@ -124,11 +122,10 @@ #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val +#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) -#define current_fsuid(cred) (current->fsuid) -#define current_fsgid(cred) (current->fsgid) #define current_test_flags(f) (current->flags & (f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) @@ -137,50 +134,19 @@ #define current_restore_flags_nested(sp, f) \ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) -#define NBPP PAGE_SIZE -#define DPPSHFT (PAGE_SHIFT - 9) -#define NDPP (1 << (PAGE_SHIFT - 9)) -#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) -#define dtopt(DD) ((DD) >> DPPSHFT) -#define dpoff(DD) ((DD) & (NDPP-1)) +#define spinlock_destroy(lock) #define NBBY 8 /* number of bits per byte */ -#define NBPC PAGE_SIZE /* Number of bytes per click */ -#define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */ /* * Size of block device i/o is parameterized here. * Currently the system supports page-sized i/o. */ -#define BLKDEV_IOSHIFT BPCSHIFT +#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT) /* number of BB's per block device block */ #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE) -/* bytes to clicks */ -#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) -#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) -#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) -#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) -#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) -#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) - -/* off_t bytes to clicks */ -#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) -#define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT) - -/* clicks to off_t bytes */ -#define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT) - -/* clicks to bytes */ -#define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT) -#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) -#define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT) -#define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT) - -/* bytes to clicks */ -#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) - #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ @@ -212,11 +178,7 @@ #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() #define xfs_itruncate_data(ip, off) \ - (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off))) -#define xfs_statvfs_fsid(statp, mp) \ - ({ u64 id = huge_encode_dev((mp)->m_ddev_targp->bt_dev); \ - __kernel_fsid_t *fsid = &(statp)->f_fsid; \ - (fsid->val[0] = (u32)id, fsid->val[1] = (u32)(id >> 32)); }) + (-vmtruncate(VFS_I(ip), (off))) /* Move the kernel do_div definition off to one side */ @@ -329,4 +291,18 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) return(x * y); } +static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x; +} + +/* ARM old ABI has some weird alignment/padding */ +#if defined(__arm__) && !defined(__ARM_EABI__) +#define __arch_pack __attribute__((packed)) +#else +#define __arch_pack +#endif + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index ee788b1..1957e53 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -43,15 +43,15 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_inode_item.h" #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_iomap.h" +#include "xfs_vnodeops.h" #include <linux/capability.h> +#include <linux/mount.h> #include <linux/writeback.h> @@ -59,14 +59,12 @@ void xfs_rw_enter_trace( int tag, - xfs_iocore_t *io, + xfs_inode_t *ip, void *data, size_t segs, loff_t offset, int ioflags) { - xfs_inode_t *ip = XFS_IO_INODE(io); - if (ip->i_rwtrace == NULL) return; ktrace_enter(ip->i_rwtrace, @@ -79,8 +77,8 @@ xfs_rw_enter_trace( (void *)((unsigned long)((offset >> 32) & 0xffffffff)), (void *)((unsigned long)(offset & 0xffffffff)), (void *)((unsigned long)ioflags), - (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(io->io_new_size & 0xffffffff)), + (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_new_size & 0xffffffff)), (void *)((unsigned long)current_pid()), (void *)NULL, (void *)NULL, @@ -90,13 +88,12 @@ xfs_rw_enter_trace( void xfs_inval_cached_trace( - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_off_t offset, xfs_off_t len, xfs_off_t first, xfs_off_t last) { - xfs_inode_t *ip = XFS_IO_INODE(io); if (ip->i_rwtrace == NULL) return; @@ -132,56 +129,38 @@ xfs_inval_cached_trace( */ STATIC int xfs_iozero( - struct inode *ip, /* inode */ + struct xfs_inode *ip, /* inode */ loff_t pos, /* offset in file */ - size_t count, /* size of data to zero */ - loff_t end_size) /* max file size to set */ + size_t count) /* size of data to zero */ { - unsigned bytes; struct page *page; struct address_space *mapping; - char *kaddr; int status; - mapping = ip->i_mapping; + mapping = VFS_I(ip)->i_mapping; do { - unsigned long index, offset; + unsigned offset, bytes; + void *fsdata; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; - status = -ENOMEM; - page = grab_cache_page(mapping, index); - if (!page) + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) break; - kaddr = kmap(page); - status = mapping->a_ops->prepare_write(NULL, page, offset, - offset + bytes); - if (status) { - goto unlock; - } - - memset((void *) (kaddr + offset), 0, bytes); - flush_dcache_page(page); - status = mapping->a_ops->commit_write(NULL, page, offset, - offset + bytes); - if (!status) { - pos += bytes; - count -= bytes; - if (pos > i_size_read(ip)) - i_size_write(ip, pos < end_size ? pos : end_size); - } + zero_user(page, offset, bytes); -unlock: - kunmap(page); - unlock_page(page); - page_cache_release(page); - if (status) - break; + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; } while (count); return (-status); @@ -189,27 +168,21 @@ unlock: ssize_t /* bytes read, or (-) error */ xfs_read( - bhv_desc_t *bdp, + xfs_inode_t *ip, struct kiocb *iocb, const struct iovec *iovp, unsigned int segs, loff_t *offset, - int ioflags, - cred_t *credp) + int ioflags) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + xfs_mount_t *mp = ip->i_mount; size_t size = 0; - ssize_t ret; + ssize_t ret = 0; xfs_fsize_t n; - xfs_inode_t *ip; - xfs_mount_t *mp; - bhv_vnode_t *vp; unsigned long seg; - ip = XFS_BHVTOI(bdp); - vp = BHV_TO_VNODE(bdp); - mp = ip->i_mount; XFS_STATS_INC(xs_read_calls); @@ -229,11 +202,11 @@ xfs_read( if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((*offset & target->bt_smask) || (size & target->bt_smask)) { - if (*offset == ip->i_d.di_size) { + if (*offset == ip->i_size) { return (0); } return -XFS_ERROR(EINVAL); @@ -254,14 +227,12 @@ xfs_read( mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && - !(ioflags & IO_INVIS)) { - bhv_vrwlock_t locktype = VRWLOCK_READ; + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); + int iolock = XFS_IOLOCK_SHARED; - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, - BHV_TO_VNODE(bdp), *offset, size, - dmflags, &locktype); + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, + dmflags, &iolock); if (ret) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); if (unlikely(ioflags & IO_ISDIRECT)) @@ -270,16 +241,22 @@ xfs_read( } } - if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp))) - bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), - -1, FI_REMAPF_LOCKED); - - if (unlikely(ioflags & IO_ISDIRECT)) + if (unlikely(ioflags & IO_ISDIRECT)) { + if (inode->i_mapping->nrpages) + ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); mutex_unlock(&inode->i_mutex); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; + } + } - xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, + xfs_rw_enter_trace(XFS_READ_ENTER, ip, (void *)iovp, segs, *offset, ioflags); - ret = __generic_file_aio_read(iocb, iovp, segs, offset); + + iocb->ki_pos = *offset; + ret = generic_file_aio_read(iocb, iovp, segs, *offset); if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) ret = wait_on_sync_kiocb(iocb); if (ret > 0) @@ -290,61 +267,15 @@ xfs_read( } ssize_t -xfs_sendfile( - bhv_desc_t *bdp, - struct file *filp, - loff_t *offset, - int ioflags, - size_t count, - read_actor_t actor, - void *target, - cred_t *credp) -{ - xfs_inode_t *ip = XFS_BHVTOI(bdp); - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - - XFS_STATS_INC(xs_read_calls); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && - (!(ioflags & IO_INVIS))) { - bhv_vrwlock_t locktype = VRWLOCK_READ; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - *offset, count, - FILP_DELAY_FLAG(filp), &locktype); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, - (void *)(unsigned long)target, count, *offset, ioflags); - ret = generic_file_sendfile(filp, offset, count, actor, target); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t xfs_splice_read( - bhv_desc_t *bdp, + xfs_inode_t *ip, struct file *infilp, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, int flags, - int ioflags, - cred_t *credp) + int ioflags) { - xfs_inode_t *ip = XFS_BHVTOI(bdp); xfs_mount_t *mp = ip->i_mount; ssize_t ret; @@ -354,20 +285,18 @@ xfs_splice_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && - (!(ioflags & IO_INVIS))) { - bhv_vrwlock_t locktype = VRWLOCK_READ; + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_SHARED; int error; - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - *ppos, count, - FILP_DELAY_FLAG(infilp), &locktype); + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, + FILP_DELAY_FLAG(infilp), &iolock); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); return -error; } } - xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, + xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip, pipe, count, *ppos, ioflags); ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) @@ -379,20 +308,18 @@ xfs_splice_read( ssize_t xfs_splice_write( - bhv_desc_t *bdp, + xfs_inode_t *ip, struct pipe_inode_info *pipe, struct file *outfilp, loff_t *ppos, size_t count, int flags, - int ioflags, - cred_t *credp) + int ioflags) { - xfs_inode_t *ip = XFS_BHVTOI(bdp); xfs_mount_t *mp = ip->i_mount; ssize_t ret; struct inode *inode = outfilp->f_mapping->host; - xfs_fsize_t isize; + xfs_fsize_t isize, new_size; XFS_STATS_INC(xs_write_calls); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -400,20 +327,26 @@ xfs_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); - if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) && - (!(ioflags & IO_INVIS))) { - bhv_vrwlock_t locktype = VRWLOCK_WRITE; + if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_EXCL; int error; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), - *ppos, count, - FILP_DELAY_FLAG(outfilp), &locktype); + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, + FILP_DELAY_FLAG(outfilp), &iolock); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); return -error; } } - xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip, pipe, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); if (ret > 0) @@ -423,14 +356,18 @@ xfs_splice_write( if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) *ppos = isize; - if (*ppos > ip->i_d.di_size) { + if (*ppos > ip->i_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_d.di_size) { - ip->i_d.di_size = *ppos; - i_size_write(inode, *ppos); - ip->i_update_core = 1; - ip->i_update_size = 1; - } + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; xfs_iunlock(ip, XFS_ILOCK_EXCL); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -445,21 +382,19 @@ xfs_splice_write( */ STATIC int /* error (positive) */ xfs_zero_last_block( - struct inode *ip, - xfs_iocore_t *io, - xfs_fsize_t isize, - xfs_fsize_t end_size) + xfs_inode_t *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) { xfs_fileoff_t last_fsb; - xfs_mount_t *mp = io->io_mount; + xfs_mount_t *mp = ip->i_mount; int nimaps; int zero_offset; int zero_len; int error = 0; xfs_bmbt_irec_t imap; - loff_t loff; - ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); zero_offset = XFS_B_FSB_OFFSET(mp, isize); if (zero_offset == 0) { @@ -472,7 +407,7 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; - error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, + error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, &nimaps, NULL, NULL); if (error) { return error; @@ -490,13 +425,14 @@ xfs_zero_last_block( * out sync. We need to drop the ilock while we do this so we * don't deadlock when the buffer cache calls back to us. */ - XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); + xfs_iunlock(ip, XFS_ILOCK_EXCL); - loff = XFS_FSB_TO_B(mp, last_fsb); zero_len = mp->m_sb.sb_blocksize - zero_offset; - error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); + if (isize + zero_len > offset) + zero_len = offset - isize; + error = xfs_iozero(ip, isize, zero_len); - XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); ASSERT(error >= 0); return error; } @@ -514,34 +450,31 @@ xfs_zero_last_block( int /* error (positive) */ xfs_zero_eof( - bhv_vnode_t *vp, - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize, /* current inode size */ - xfs_fsize_t end_size) /* terminal inode size */ + xfs_fsize_t isize) /* current inode size */ { - struct inode *ip = vn_to_inode(vp); + xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t start_zero_fsb; xfs_fileoff_t end_zero_fsb; xfs_fileoff_t zero_count_fsb; xfs_fileoff_t last_fsb; - xfs_mount_t *mp = io->io_mount; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; int nimaps; int error = 0; xfs_bmbt_irec_t imap; - ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); - ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT(offset > isize); /* * First handle zeroing the block on which isize resides. * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, io, isize, end_size); + error = xfs_zero_last_block(ip, offset, isize); if (error) { - ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); - ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; } @@ -569,11 +502,10 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, + error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 0, NULL, 0, &imap, &nimaps, NULL, NULL); if (error) { - ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); - ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; } ASSERT(nimaps > 0); @@ -597,12 +529,15 @@ xfs_zero_eof( * Drop the inode lock while we're doing the I/O. * We'll still have the iolock to protect us. */ - XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; - error = xfs_iozero(ip, - XFS_FSB_TO_B(mp, start_zero_fsb), - XFS_FSB_TO_B(mp, imap.br_blockcount), - end_size); + error = xfs_iozero(ip, zero_off, zero_len); if (error) { goto out_lock; } @@ -610,69 +545,44 @@ xfs_zero_eof( start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); } return 0; out_lock: - - XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); ASSERT(error >= 0); return error; } ssize_t /* bytes written, or (-) error */ xfs_write( - bhv_desc_t *bdp, + struct xfs_inode *xip, struct kiocb *iocb, const struct iovec *iovp, unsigned int nsegs, loff_t *offset, - int ioflags, - cred_t *credp) + int ioflags) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long segs = nsegs; - xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret = 0, error = 0; xfs_fsize_t isize, new_size; - xfs_iocore_t *io; - bhv_vnode_t *vp; - unsigned long seg; int iolock; int eventsent = 0; - bhv_vrwlock_t locktype; size_t ocount = 0, count; loff_t pos; - int need_i_mutex = 1, need_flush = 0; + int need_i_mutex; XFS_STATS_INC(xs_write_calls); - vp = BHV_TO_VNODE(bdp); - xip = XFS_BHVTOI(bdp); - - for (seg = 0; seg < segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - ocount += iv->iov_len; - if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - segs = seg; - ocount -= iv->iov_len; /* This segment is no good */ - break; - } + error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); + if (error) + return error; count = ocount; pos = *offset; @@ -680,47 +590,25 @@ xfs_write( if (count == 0) return 0; - io = &xip->i_iocore; - mp = io->io_mount; + mp = xip->i_mount; - vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE); + xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) - return XFS_ERROR(-EINVAL); - - if (!VN_CACHED(vp) && pos < i_size_read(inode)) - need_i_mutex = 0; - - if (VN_CACHED(vp)) - need_flush = 1; - } - relock: - if (need_i_mutex) { + if (ioflags & IO_ISDIRECT) { + iolock = XFS_IOLOCK_SHARED; + need_i_mutex = 0; + } else { iolock = XFS_IOLOCK_EXCL; - locktype = VRWLOCK_WRITE; - + need_i_mutex = 1; mutex_lock(&inode->i_mutex); - } else { - iolock = XFS_IOLOCK_SHARED; - locktype = VRWLOCK_WRITE_DIRECT; } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - isize = i_size_read(inode); - - if (file->f_flags & O_APPEND) - *offset = isize; - start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); @@ -729,25 +617,18 @@ start: goto out_unlock_mutex; } - new_size = pos + count; - if (new_size > isize) - io->io_new_size = new_size; - - if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && + if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { - loff_t savedsize = pos; int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) dmflags |= DM_FLAGS_IMUX; xfs_iunlock(xip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, - pos, count, - dmflags, &locktype); + error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, + pos, count, dmflags, &iolock); if (error) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; + goto out_unlock_internal; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; @@ -759,16 +640,42 @@ start: * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ - if ((file->f_flags & O_APPEND) && savedsize != isize) { - pos = isize = xip->i_d.di_size; + if ((file->f_flags & O_APPEND) && pos != xip->i_size) + goto start; + } + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(xip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->bt_smask) || (count & target->bt_smask)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + return XFS_ERROR(-EINVAL); + } + + if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); goto start; } } - if (likely(!(ioflags & IO_INVIS))) { - file_update_time(file); - xfs_ichgtime_fast(xip, inode, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + new_size = pos + count; + if (new_size > xip->i_size) + xip->i_new_size = new_size; + + /* + * We're not supposed to change timestamps in readonly-mounted + * filesystems. Throw it away if anyone asks us. + */ + if (likely(!(ioflags & IO_INVIS) && + !mnt_want_write(file->f_path.mnt))) { + xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + mnt_drop_write(file->f_path.mnt); } /* @@ -780,12 +687,11 @@ start: * to zero it out up to the new size. */ - if (pos > isize) { - error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, - isize, pos + count); + if (pos > xip->i_size) { + error = xfs_zero_eof(xip, pos, xip->i_size); if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + goto out_unlock_internal; } } xfs_iunlock(xip, XFS_ILOCK_EXCL); @@ -803,10 +709,9 @@ start: !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) - error = -remove_suid(file->f_dentry); + error = -file_remove_suid(file); if (unlikely(error)) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; + goto out_unlock_internal; } } @@ -815,24 +720,27 @@ retry: current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { - if (need_flush) { - xfs_inval_cached_trace(io, pos, -1, - ctooff(offtoct(pos)), -1); - bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)), + if (mapping->nrpages) { + WARN_ON(need_i_mutex == 0); + xfs_inval_cached_trace(xip, pos, -1, + (pos & PAGE_CACHE_MASK), -1); + error = xfs_flushinval_pages(xip, + (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_internal; } if (need_i_mutex) { /* demote the lock now the cached pages are gone */ - XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); + xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; - locktype = VRWLOCK_WRITE_DIRECT; need_i_mutex = 0; } - xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs, + xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs, *offset, ioflags); ret = generic_file_direct_write(iocb, iovp, &segs, pos, offset, count, ocount); @@ -847,13 +755,12 @@ retry: pos += ret; count -= ret; - need_i_mutex = 1; ioflags &= ~IO_ISDIRECT; xfs_iunlock(xip, iolock); goto relock; } } else { - xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs, + xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, *offset, ioflags); ret = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); @@ -864,22 +771,20 @@ retry: if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) ret = wait_on_sync_kiocb(iocb); - if ((ret == -ENOSPC) && - DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && - !(ioflags & IO_INVIS)) { - - xfs_rwunlock(bdp, locktype); + if (ret == -ENOSPC && + DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { + xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, - DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, + error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, + DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ - if (error) - goto out_nounlocks; if (need_i_mutex) mutex_lock(&inode->i_mutex); - xfs_rwlock(bdp, locktype); - pos = xip->i_d.di_size; + xfs_ilock(xip, iolock); + if (error) + goto out_unlock_internal; + pos = xip->i_size; ret = 0; goto retry; } @@ -888,14 +793,10 @@ retry: if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; - if (*offset > xip->i_d.di_size) { + if (*offset > xip->i_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_d.di_size) { - xip->i_d.di_size = *offset; - i_size_write(inode, *offset); - xip->i_update_core = 1; - xip->i_update_size = 1; - } + if (*offset > xip->i_size) + xip->i_size = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); } @@ -907,26 +808,41 @@ retry: /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { - error = xfs_write_sync_logforce(mp, xip); - if (error) - goto out_unlock_internal; + int error2; - xfs_rwunlock(bdp, locktype); + xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - - error = sync_page_range(inode, mapping, pos, ret); + error2 = sync_page_range(inode, mapping, pos, ret); if (!error) - error = ret; - return error; + error = error2; + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(xip, iolock); + error2 = xfs_write_sync_logforce(mp, xip); + if (!error) + error = error2; } out_unlock_internal: - xfs_rwunlock(bdp, locktype); + if (xip->i_new_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + xip->i_new_size = 0; + /* + * If this was a direct or synchronous I/O that failed (such + * as ENOSPC) then part of the I/O may have been written to + * disk before the error occured. In this case the on-disk + * file size may have been adjusted beyond the in-memory file + * size and now needs to be truncated back. + */ + if (xip->i_d.di_size > xip->i_size) + xip->i_d.di_size = xip->i_size; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + xfs_iunlock(xip, iolock); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); - out_nounlocks: return -error; } @@ -960,48 +876,24 @@ xfs_bdstrat_cb(struct xfs_buf *bp) } } - -int -xfs_bmap(bhv_desc_t *bdp, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) -{ - xfs_inode_t *ip = XFS_BHVTOI(bdp); - xfs_iocore_t *io = &ip->i_iocore; - - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == - ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); - - return xfs_iomap(io, offset, count, flags, iomapp, niomaps); -} - /* - * Wrapper around bdstrat so that we can stop data - * from going to disk in case we are shutting down the filesystem. - * Typically user data goes thru this path; one of the exceptions - * is the superblock. + * Wrapper around bdstrat so that we can stop data from going to disk in case + * we are shutting down the filesystem. Typically user data goes thru this + * path; one of the exceptions is the superblock. */ -int +void xfsbdstrat( struct xfs_mount *mp, struct xfs_buf *bp) { ASSERT(mp); if (!XFS_FORCED_SHUTDOWN(mp)) { - /* Grio redirection would go here - * if (XFS_BUF_IS_GRIO(bp)) { - */ - xfs_buf_iorequest(bp); - return 0; + return; } xfs_buftrace("XFSBDSTRAT IOERROR", bp); - return (xfs_bioerror_relse(bp)); + xfs_bioerror_relse(bp); } /* diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index c77e62e..e6be37d 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -18,10 +18,7 @@ #ifndef __XFS_LRW_H__ #define __XFS_LRW_H__ -struct bhv_desc; -struct bhv_vnode; struct xfs_mount; -struct xfs_iocore; struct xfs_inode; struct xfs_bmbt_irec; struct xfs_buf; @@ -53,7 +50,6 @@ struct xfs_iomap; #define XFS_INVAL_CACHED 18 #define XFS_DIORD_ENTER 19 #define XFS_DIOWR_ENTER 20 -#define XFS_SENDFILE_ENTER 21 #define XFS_WRITEPAGE_ENTER 22 #define XFS_RELEASEPAGE_ENTER 23 #define XFS_INVALIDPAGE_ENTER 24 @@ -62,42 +58,20 @@ struct xfs_iomap; #define XFS_IOMAP_UNWRITTEN 27 #define XFS_SPLICE_READ_ENTER 28 #define XFS_SPLICE_WRITE_ENTER 29 -extern void xfs_rw_enter_trace(int, struct xfs_iocore *, - void *, size_t, loff_t, int); -extern void xfs_inval_cached_trace(struct xfs_iocore *, - xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t); +extern void xfs_rw_enter_trace(int, struct xfs_inode *, + void *, size_t, loff_t, int); +extern void xfs_inval_cached_trace(struct xfs_inode *, + xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t); #else -#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags) -#define xfs_inval_cached_trace(io, offset, len, first, last) +#define xfs_rw_enter_trace(tag, ip, data, size, offset, ioflags) +#define xfs_inval_cached_trace(ip, offset, len, first, last) #endif -/* - * Maximum count of bmaps used by read and write paths. - */ -#define XFS_MAX_RW_NBMAPS 4 - -extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +/* errors from xfsbdstrat() must be extracted from the buffer */ +extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); -extern int xfs_zero_eof(struct bhv_vnode *, struct xfs_iocore *, xfs_off_t, - xfs_fsize_t, xfs_fsize_t); -extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, - loff_t *, int, size_t, read_actor_t, - void *, struct cred *); -extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, - struct pipe_inode_info *, size_t, int, int, - struct cred *); -extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, - struct file *, loff_t *, size_t, int, int, - struct cred *); +extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index e480b61..3d5b67c 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -98,12 +98,21 @@ xfs_read_xfsstats( return len; } -void +int xfs_init_procfs(void) { if (!proc_mkdir("fs/xfs", NULL)) - return; - create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL); + goto out; + + if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, + xfs_read_xfsstats, NULL)) + goto out_remove_entry; + return 0; + + out_remove_entry: + remove_proc_entry("fs/xfs", NULL); + out: + return -ENOMEM; } void diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index 8ba7a2f..e83820f 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -134,7 +134,7 @@ DECLARE_PER_CPU(struct xfsstats, xfsstats); #define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) #define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) -extern void xfs_init_procfs(void); +extern int xfs_init_procfs(void); extern void xfs_cleanup_procfs(void); @@ -144,8 +144,14 @@ extern void xfs_cleanup_procfs(void); # define XFS_STATS_DEC(count) # define XFS_STATS_ADD(count, inc) -static __inline void xfs_init_procfs(void) { }; -static __inline void xfs_cleanup_procfs(void) { }; +static inline int xfs_init_procfs(void) +{ + return 0; +} + +static inline void xfs_cleanup_procfs(void) +{ +} #endif /* !CONFIG_PROC_FS */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9df9ed3..37ebe36 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -41,14 +41,23 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" +#include "xfs_fsops.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" +#include "xfs_vnodeops.h" +#include "xfs_vfsops.h" #include "xfs_version.h" +#include "xfs_log_priv.h" +#include "xfs_trans_priv.h" +#include "xfs_filestream.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_trace.h" +#include "xfs_extfree_item.h" +#include "xfs_mru_cache.h" +#include "xfs_inode_item.h" #include <linux/namei.h> #include <linux/init.h> @@ -56,11 +65,13 @@ #include <linux/mempool.h> #include <linux/writeback.h> #include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/parser.h> -STATIC struct quotactl_ops xfs_quotactl_operations; -STATIC struct super_operations xfs_super_operations; -STATIC kmem_zone_t *xfs_vnode_zone; -STATIC kmem_zone_t *xfs_ioend_zone; +static struct quotactl_ops xfs_quotactl_operations; +static struct super_operations xfs_super_operations; +static kmem_zone_t *xfs_vnode_zone; +static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; STATIC struct xfs_mount_args * @@ -70,7 +81,10 @@ xfs_args_allocate( { struct xfs_mount_args *args; - args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP); + args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL); + if (!args) + return NULL; + args->logbufs = args->logbufsize = -1; strncpy(args->fsname, sb->s_id, MAXNAMELEN); @@ -86,6 +100,453 @@ xfs_args_allocate( return args; } +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ +#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ +#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ +#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */ +#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ +#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ +#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ +#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ +#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ +#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ +#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ +#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ +#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ +#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ +#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ +#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ +#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and + * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ +#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ +#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ +#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ +#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ +#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes + * in stat(). */ +#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ +#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ +#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ +#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ +#define MNTOPT_NOQUOTA "noquota" /* no quotas */ +#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ +#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ +#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ +#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ +#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ +#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ +#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ +#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ + +/* + * Table driven mount option parser. + * + * Currently only used for remount, but it will be used for mount + * in the future, too. + */ +enum { + Opt_barrier, Opt_nobarrier, Opt_err +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_err, NULL} +}; + + +STATIC unsigned long +suffix_strtoul(char *s, char **endp, unsigned int base) +{ + int last, shift_left_factor = 0; + char *value = s; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + return simple_strtoul((const char *)s, endp, base) << shift_left_factor; +} + +STATIC int +xfs_parseargs( + struct xfs_mount *mp, + char *options, + struct xfs_mount_args *args, + int update) +{ + char *this_char, *value, *eov; + int dsunit, dswidth, vol_dsunit, vol_dswidth; + int iosize; + int dmapi_implies_ikeep = 1; + + args->flags |= XFSMNT_BARRIER; + args->flags2 |= XFSMNT2_COMPAT_IOSIZE; + + if (!options) + goto done; + + iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, MNTOPT_LOGBUFS)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + args->logbufs = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + args->logbufsize = suffix_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + strncpy(args->logname, value, MAXNAMELEN); + } else if (!strcmp(this_char, MNTOPT_MTPT)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + strncpy(args->mtpt, value, MAXNAMELEN); + } else if (!strcmp(this_char, MNTOPT_RTDEV)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + strncpy(args->rtname, value, MAXNAMELEN); + } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + iosize = simple_strtoul(value, &eov, 10); + args->flags |= XFSMNT_IOSIZE; + args->iosizelog = (uint8_t) iosize; + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + iosize = suffix_strtoul(value, &eov, 10); + args->flags |= XFSMNT_IOSIZE; + args->iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_GRPID) || + !strcmp(this_char, MNTOPT_BSDGROUPS)) { + mp->m_flags |= XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_NOGRPID) || + !strcmp(this_char, MNTOPT_SYSVGROUPS)) { + mp->m_flags &= ~XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + args->flags |= XFSMNT_WSYNC; + } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { + args->flags |= XFSMNT_OSYNCISOSYNC; + } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + args->flags |= XFSMNT_NORECOVERY; + } else if (!strcmp(this_char, MNTOPT_INO64)) { + args->flags |= XFSMNT_INO64; +#if !XFS_BIG_INUMS + cmn_err(CE_WARN, + "XFS: %s option not allowed on this system", + this_char); + return EINVAL; +#endif + } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + args->flags |= XFSMNT_NOALIGN; + } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + args->flags |= XFSMNT_SWALLOC; + } else if (!strcmp(this_char, MNTOPT_SUNIT)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + dsunit = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { + if (!value || !*value) { + cmn_err(CE_WARN, + "XFS: %s option requires an argument", + this_char); + return EINVAL; + } + dswidth = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + args->flags &= ~XFSMNT_32BITINODES; +#if !XFS_BIG_INUMS + cmn_err(CE_WARN, + "XFS: %s option not allowed on this system", + this_char); + return EINVAL; +#endif + } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + args->flags |= XFSMNT_NOUUID; + } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + args->flags |= XFSMNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + args->flags &= ~XFSMNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + args->flags |= XFSMNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + dmapi_implies_ikeep = 0; + args->flags &= ~XFSMNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + args->flags2 |= XFSMNT2_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + args->flags |= XFSMNT_ATTR2; + } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + args->flags &= ~XFSMNT_ATTR2; + args->flags |= XFSMNT_NOATTR2; + } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + args->flags2 |= XFSMNT2_FILESTREAMS; + } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); + args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); + } else if (!strcmp(this_char, MNTOPT_QUOTA) || + !strcmp(this_char, MNTOPT_UQUOTA) || + !strcmp(this_char, MNTOPT_USRQUOTA)) { + args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; + } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || + !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + args->flags |= XFSMNT_UQUOTA; + args->flags &= ~XFSMNT_UQUOTAENF; + } else if (!strcmp(this_char, MNTOPT_PQUOTA) || + !strcmp(this_char, MNTOPT_PRJQUOTA)) { + args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; + } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + args->flags |= XFSMNT_PQUOTA; + args->flags &= ~XFSMNT_PQUOTAENF; + } else if (!strcmp(this_char, MNTOPT_GQUOTA) || + !strcmp(this_char, MNTOPT_GRPQUOTA)) { + args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; + } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + args->flags |= XFSMNT_GQUOTA; + args->flags &= ~XFSMNT_GQUOTAENF; + } else if (!strcmp(this_char, MNTOPT_DMAPI)) { + args->flags |= XFSMNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_XDSM)) { + args->flags |= XFSMNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_DMI)) { + args->flags |= XFSMNT_DMAPI; + } else if (!strcmp(this_char, "ihashsize")) { + cmn_err(CE_WARN, + "XFS: ihashsize no longer used, option is deprecated."); + } else if (!strcmp(this_char, "osyncisdsync")) { + /* no-op, this is now the default */ + cmn_err(CE_WARN, + "XFS: osyncisdsync is now the default, option is deprecated."); + } else if (!strcmp(this_char, "irixsgid")) { + cmn_err(CE_WARN, + "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); + } else { + cmn_err(CE_WARN, + "XFS: unknown mount option [%s].", this_char); + return EINVAL; + } + } + + if (args->flags & XFSMNT_NORECOVERY) { + if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) { + cmn_err(CE_WARN, + "XFS: no-recovery mounts must be read-only."); + return EINVAL; + } + } + + if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { + cmn_err(CE_WARN, + "XFS: sunit and swidth options incompatible with the noalign option"); + return EINVAL; + } + + if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { + cmn_err(CE_WARN, + "XFS: cannot mount with both project and group quota"); + return EINVAL; + } + + if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') { + printk("XFS: %s option needs the mount point option as well\n", + MNTOPT_DMAPI); + return EINVAL; + } + + if ((dsunit && !dswidth) || (!dsunit && dswidth)) { + cmn_err(CE_WARN, + "XFS: sunit and swidth must be specified together"); + return EINVAL; + } + + if (dsunit && (dswidth % dsunit != 0)) { + cmn_err(CE_WARN, + "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", + dswidth, dsunit); + return EINVAL; + } + + /* + * Applications using DMI filesystems often expect the + * inode generation number to be monotonically increasing. + * If we delete inode chunks we break this assumption, so + * keep unused inode chunks on disk for DMI filesystems + * until we come up with a better solution. + * Note that if "ikeep" or "noikeep" mount options are + * supplied, then they are honored. + */ + if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep) + args->flags |= XFSMNT_IKEEP; + + if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { + if (dsunit) { + args->sunit = dsunit; + args->flags |= XFSMNT_RETERR; + } else { + args->sunit = vol_dsunit; + } + dswidth ? (args->swidth = dswidth) : + (args->swidth = vol_dswidth); + } else { + args->sunit = args->swidth = 0; + } + +done: + if (args->flags & XFSMNT_32BITINODES) + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + if (args->flags2) + args->flags |= XFSMNT_FLAGS2; + return 0; +} + +struct proc_xfs_info { + int flag; + char *str; +}; + +STATIC int +xfs_showargs( + struct xfs_mount *mp, + struct seq_file *m) +{ + static struct proc_xfs_info xfs_info_set[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, + { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, + { XFS_MOUNT_INO64, "," MNTOPT_INO64 }, + { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, + { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, + { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, + { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, + { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, + { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, + { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, + { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, + { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { 0, NULL } + }; + static struct proc_xfs_info xfs_info_unset[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, + { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, + { 0, NULL } + }; + struct proc_xfs_info *xfs_infop; + + for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { + if (mp->m_flags & xfs_infop->flag) + seq_puts(m, xfs_infop->str); + } + for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { + if (!(mp->m_flags & xfs_infop->flag)) + seq_puts(m, xfs_infop->str); + } + + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", + (int)(1 << mp->m_writeio_log) >> 10); + + if (mp->m_logbufs > 0) + seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + if (mp->m_logbsize > 0) + seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); + + if (mp->m_logname) + seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + if (mp->m_rtname) + seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + + if (mp->m_dalign > 0) + seq_printf(m, "," MNTOPT_SUNIT "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); + if (mp->m_swidth > 0) + seq_printf(m, "," MNTOPT_SWIDTH "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); + + if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) + seq_puts(m, "," MNTOPT_USRQUOTA); + else if (mp->m_qflags & XFS_UQUOTA_ACCT) + seq_puts(m, "," MNTOPT_UQUOTANOENF); + + if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) + seq_puts(m, "," MNTOPT_PRJQUOTA); + else if (mp->m_qflags & XFS_PQUOTA_ACCT) + seq_puts(m, "," MNTOPT_PQUOTANOENF); + + if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD)) + seq_puts(m, "," MNTOPT_GRPQUOTA); + else if (mp->m_qflags & XFS_GQUOTA_ACCT) + seq_puts(m, "," MNTOPT_GQUOTANOENF); + + if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) + seq_puts(m, "," MNTOPT_NOQUOTA); + + return 0; +} __uint64_t xfs_max_file_offset( unsigned int blockshift) @@ -120,120 +581,6 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } -STATIC __inline__ void -xfs_set_inodeops( - struct inode *inode) -{ - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &xfs_inode_operations; - inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - case S_IFDIR: - inode->i_op = &xfs_dir_inode_operations; - inode->i_fop = &xfs_dir_file_operations; - break; - case S_IFLNK: - inode->i_op = &xfs_symlink_inode_operations; - if (inode->i_blocks) - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - default: - inode->i_op = &xfs_inode_operations; - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; - } -} - -STATIC __inline__ void -xfs_revalidate_inode( - xfs_mount_t *mp, - bhv_vnode_t *vp, - xfs_inode_t *ip) -{ - struct inode *inode = vn_to_inode(vp); - - inode->i_mode = ip->i_d.di_mode; - inode->i_nlink = ip->i_d.di_nlink; - inode->i_uid = ip->i_d.di_uid; - inode->i_gid = ip->i_d.di_gid; - - switch (inode->i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - inode->i_rdev = - MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); - break; - default: - inode->i_rdev = 0; - break; - } - - inode->i_generation = ip->i_d.di_gen; - i_size_write(inode, ip->i_d.di_size); - inode->i_blocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; - inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; - inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; - vp->v_flag &= ~VMODIFIED; -} - -void -xfs_initialize_vnode( - bhv_desc_t *bdp, - bhv_vnode_t *vp, - bhv_desc_t *inode_bhv, - int unlock) -{ - xfs_inode_t *ip = XFS_BHVTOI(inode_bhv); - struct inode *inode = vn_to_inode(vp); - - if (!inode_bhv->bd_vobj) { - vp->v_vfsp = bhvtovfs(bdp); - bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops); - bhv_insert(VN_BHV_HEAD(vp), inode_bhv); - } - - /* - * We need to set the ops vectors, and unlock the inode, but if - * we have been called during the new inode create process, it is - * too early to fill in the Linux inode. We will get called a - * second time once the inode is properly set up, and then we can - * finish our work. - */ - if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) { - xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip); - xfs_set_inodeops(inode); - - ip->i_flags &= ~XFS_INEW; - barrier(); - - unlock_new_inode(inode); - } -} - int xfs_blkdev_get( xfs_mount_t *mp, @@ -242,7 +589,7 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_excl(name, 0, mp); + *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid device [%s], error=%d\n", name, error); @@ -256,7 +603,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_excl(bdev); + close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); } /* @@ -305,14 +652,6 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) return; } - if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered == - QUEUE_ORDERED_NONE) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, not supported by the underlying device"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - if (xfs_readonly_buftarg(mp->m_ddev_targp)) { xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, underlying device is readonly"); @@ -336,70 +675,216 @@ xfs_blkdev_issue_flush( blkdev_issue_flush(buftarg->bt_bdev, NULL); } -STATIC struct inode * -xfs_fs_alloc_inode( - struct super_block *sb) +STATIC void +xfs_close_devices( + struct xfs_mount *mp) { - bhv_vnode_t *vp; + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + xfs_free_buftarg(mp->m_logdev_targp); + xfs_blkdev_put(logdev); + } + if (mp->m_rtdev_targp) { + struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + xfs_free_buftarg(mp->m_rtdev_targp); + xfs_blkdev_put(rtdev); + } + xfs_free_buftarg(mp->m_ddev_targp); +} - vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); - if (unlikely(!vp)) - return NULL; - return vn_to_inode(vp); +/* + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in sb->s_bdev. + */ +STATIC int +xfs_open_devices( + struct xfs_mount *mp, + struct xfs_mount_args *args) +{ + struct block_device *ddev = mp->m_super->s_bdev; + struct block_device *logdev = NULL, *rtdev = NULL; + int error; + + /* + * Open real time and log devices - order is important. + */ + if (args->logname[0]) { + error = xfs_blkdev_get(mp, args->logname, &logdev); + if (error) + goto out; + } + + if (args->rtname[0]) { + error = xfs_blkdev_get(mp, args->rtname, &rtdev); + if (error) + goto out_close_logdev; + + if (rtdev == ddev || rtdev == logdev) { + cmn_err(CE_WARN, + "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); + error = EINVAL; + goto out_close_rtdev; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); + if (!mp->m_ddev_targp) + goto out_close_rtdev; + + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); + if (!mp->m_rtdev_targp) + goto out_free_ddev_targ; + } + + if (logdev && logdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); + if (!mp->m_logdev_targp) + goto out_free_rtdev_targ; + } else { + mp->m_logdev_targp = mp->m_ddev_targp; + } + + return 0; + + out_free_rtdev_targ: + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + out_free_ddev_targ: + xfs_free_buftarg(mp->m_ddev_targp); + out_close_rtdev: + if (rtdev) + xfs_blkdev_put(rtdev); + out_close_logdev: + if (logdev && logdev != ddev) + xfs_blkdev_put(logdev); + out: + return error; } -STATIC void -xfs_fs_destroy_inode( - struct inode *inode) +/* + * Setup xfs_mount buffer target pointers based on superblock + */ +STATIC int +xfs_setup_devices( + struct xfs_mount *mp) { - kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode)); + int error; + + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + unsigned int log_sector_size = BBSIZE; + + if (xfs_sb_version_hassector(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + mp->m_sb.sb_blocksize, + log_sector_size); + if (error) + return error; + } + if (mp->m_rtdev_targp) { + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + } + + return 0; } -STATIC void -xfs_fs_inode_init_once( - void *vnode, - kmem_zone_t *zonep, - unsigned long flags) +/* + * XFS AIL push thread support + */ +void +xfsaild_wakeup( + xfs_mount_t *mp, + xfs_lsn_t threshold_lsn) { - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); + mp->m_ail.xa_target = threshold_lsn; + wake_up_process(mp->m_ail.xa_task); } -STATIC int -xfs_init_zones(void) +int +xfsaild( + void *data) { - xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, - xfs_fs_inode_init_once); - if (!xfs_vnode_zone) - goto out; + xfs_mount_t *mp = (xfs_mount_t *)data; + xfs_lsn_t last_pushed_lsn = 0; + long tout = 0; - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); - if (!xfs_ioend_zone) - goto out_destroy_vnode_zone; + while (!kthread_should_stop()) { + if (tout) + schedule_timeout_interruptible(msecs_to_jiffies(tout)); + tout = 1000; + + /* swsusp */ + try_to_freeze(); + + ASSERT(mp->m_log); + if (XFS_FORCED_SHUTDOWN(mp)) + continue; + + tout = xfsaild_push(mp, &last_pushed_lsn); + } - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, - xfs_ioend_zone); - if (!xfs_ioend_pool) - goto out_free_ioend_zone; return 0; +} /* xfsaild */ - out_free_ioend_zone: - kmem_zone_destroy(xfs_ioend_zone); - out_destroy_vnode_zone: - kmem_zone_destroy(xfs_vnode_zone); - out: - return -ENOMEM; +int +xfsaild_start( + xfs_mount_t *mp) +{ + mp->m_ail.xa_target = 0; + mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild"); + if (IS_ERR(mp->m_ail.xa_task)) + return -PTR_ERR(mp->m_ail.xa_task); + return 0; +} + +void +xfsaild_stop( + xfs_mount_t *mp) +{ + kthread_stop(mp->m_ail.xa_task); +} + + + +STATIC struct inode * +xfs_fs_alloc_inode( + struct super_block *sb) +{ + return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); } STATIC void -xfs_destroy_zones(void) +xfs_fs_destroy_inode( + struct inode *inode) { - mempool_destroy(xfs_ioend_pool); - kmem_zone_destroy(xfs_vnode_zone); - kmem_zone_destroy(xfs_ioend_zone); + kmem_zone_free(xfs_vnode_zone, inode); +} + +STATIC void +xfs_fs_inode_init_once( + void *vnode) +{ + inode_init_once((struct inode *)vnode); } /* @@ -413,17 +898,22 @@ xfs_fs_write_inode( struct inode *inode, int sync) { - bhv_vnode_t *vp = vn_from_inode(inode); - int error = 0, flags = FLUSH_INODE; + int error = 0; + int flags = 0; - if (vp) { - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - if (sync) - flags |= FLUSH_SYNC; - error = bhv_vop_iflush(vp, flags); - if (error == EAGAIN) - error = sync? bhv_vop_iflush(vp, flags | FLUSH_LOG) : 0; + xfs_itrace_entry(XFS_I(inode)); + if (sync) { + filemap_fdatawait(inode->i_mapping); + flags |= FLUSH_SYNC; } + error = xfs_inode_flush(XFS_I(inode), flags); + /* + * if we failed to write out the inode then mark + * it dirty again so we'll try again later. + */ + if (error) + mark_inode_dirty_sync(inode); + return -error; } @@ -431,35 +921,26 @@ STATIC void xfs_fs_clear_inode( struct inode *inode) { - bhv_vnode_t *vp = vn_from_inode(inode); - - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - XFS_STATS_INC(vn_rele); - XFS_STATS_INC(vn_remove); - XFS_STATS_INC(vn_reclaim); - XFS_STATS_DEC(vn_active); + xfs_inode_t *ip = XFS_I(inode); /* - * This can happen because xfs_iget_core calls xfs_idestroy if we + * ip can be null when xfs_iget_core calls xfs_idestroy if we * find an inode with di_mode == 0 but without IGET_CREATE set. */ - if (VNHEAD(vp)) - bhv_vop_inactive(vp, NULL); - - VN_LOCK(vp); - vp->v_flag &= ~VMODIFIED; - VN_UNLOCK(vp, 0); - - if (VNHEAD(vp)) - if (bhv_vop_reclaim(vp)) - panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, vp); - - ASSERT(VNHEAD(vp) == NULL); + if (ip) { + xfs_itrace_entry(ip); + XFS_STATS_INC(vn_rele); + XFS_STATS_INC(vn_remove); + XFS_STATS_INC(vn_reclaim); + XFS_STATS_DEC(vn_active); + + xfs_inactive(ip); + xfs_iflags_clear(ip, XFS_IMODIFIED); + if (xfs_reclaim(ip)) + panic("%s: cannot reclaim 0x%p\n", __func__, inode); + } -#ifdef XFS_VNODE_TRACE - ktrace_free(vp->v_trace); -#endif + ASSERT(XFS_I(inode) == NULL); } /* @@ -471,9 +952,9 @@ xfs_fs_clear_inode( */ STATIC void xfs_syncd_queue_work( - struct bhv_vfs *vfs, + struct xfs_mount *mp, void *data, - void (*syncer)(bhv_vfs_t *, void *)) + void (*syncer)(struct xfs_mount *, void *)) { struct bhv_vfs_sync_work *work; @@ -481,11 +962,11 @@ xfs_syncd_queue_work( INIT_LIST_HEAD(&work->w_list); work->w_syncer = syncer; work->w_data = data; - work->w_vfs = vfs; - spin_lock(&vfs->vfs_sync_lock); - list_add_tail(&work->w_list, &vfs->vfs_sync_list); - spin_unlock(&vfs->vfs_sync_lock); - wake_up_process(vfs->vfs_sync_task); + work->w_mount = mp; + spin_lock(&mp->m_sync_lock); + list_add_tail(&work->w_list, &mp->m_sync_list); + spin_unlock(&mp->m_sync_lock); + wake_up_process(mp->m_sync_task); } /* @@ -496,22 +977,22 @@ xfs_syncd_queue_work( */ STATIC void xfs_flush_inode_work( - bhv_vfs_t *vfs, - void *inode) + struct xfs_mount *mp, + void *arg) { - filemap_flush(((struct inode *)inode)->i_mapping); - iput((struct inode *)inode); + struct inode *inode = arg; + filemap_flush(inode->i_mapping); + iput(inode); } void xfs_flush_inode( xfs_inode_t *ip) { - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); - struct bhv_vfs *vfs = XFS_MTOVFS(ip->i_mount); + struct inode *inode = VFS_I(ip); igrab(inode); - xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); delay(msecs_to_jiffies(500)); } @@ -521,123 +1002,158 @@ xfs_flush_inode( */ STATIC void xfs_flush_device_work( - bhv_vfs_t *vfs, - void *inode) + struct xfs_mount *mp, + void *arg) { - sync_blockdev(vfs->vfs_super->s_bdev); - iput((struct inode *)inode); + struct inode *inode = arg; + sync_blockdev(mp->m_super->s_bdev); + iput(inode); } void xfs_flush_device( xfs_inode_t *ip) { - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); - struct bhv_vfs *vfs = XFS_MTOVFS(ip->i_mount); + struct inode *inode = VFS_I(ip); igrab(inode); - xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); delay(msecs_to_jiffies(500)); xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); } STATIC void -vfs_sync_worker( - bhv_vfs_t *vfsp, +xfs_sync_worker( + struct xfs_mount *mp, void *unused) { int error; - if (!(vfsp->vfs_flag & VFS_RDONLY)) - error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \ - SYNC_ATTR | SYNC_REFCACHE, NULL); - vfsp->vfs_sync_seq++; - wmb(); - wake_up(&vfsp->vfs_wait_single_sync_task); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) + error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR); + mp->m_sync_seq++; + wake_up(&mp->m_wait_single_sync_task); } STATIC int xfssyncd( void *arg) { + struct xfs_mount *mp = arg; long timeleft; - bhv_vfs_t *vfsp = (bhv_vfs_t *) arg; bhv_vfs_sync_work_t *work, *n; LIST_HEAD (tmp); + set_freezable(); timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); for (;;) { timeleft = schedule_timeout_interruptible(timeleft); /* swsusp */ try_to_freeze(); - if (kthread_should_stop() && list_empty(&vfsp->vfs_sync_list)) + if (kthread_should_stop() && list_empty(&mp->m_sync_list)) break; - spin_lock(&vfsp->vfs_sync_lock); + spin_lock(&mp->m_sync_lock); /* * We can get woken by laptop mode, to do a sync - * that's the (only!) case where the list would be * empty with time remaining. */ - if (!timeleft || list_empty(&vfsp->vfs_sync_list)) { + if (!timeleft || list_empty(&mp->m_sync_list)) { if (!timeleft) timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); - INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list); - list_add_tail(&vfsp->vfs_sync_work.w_list, - &vfsp->vfs_sync_list); + INIT_LIST_HEAD(&mp->m_sync_work.w_list); + list_add_tail(&mp->m_sync_work.w_list, + &mp->m_sync_list); } - list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list) + list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) list_move(&work->w_list, &tmp); - spin_unlock(&vfsp->vfs_sync_lock); + spin_unlock(&mp->m_sync_lock); list_for_each_entry_safe(work, n, &tmp, w_list) { - (*work->w_syncer)(vfsp, work->w_data); + (*work->w_syncer)(mp, work->w_data); list_del(&work->w_list); - if (work == &vfsp->vfs_sync_work) + if (work == &mp->m_sync_work) continue; - kmem_free(work, sizeof(struct bhv_vfs_sync_work)); + kmem_free(work); } } return 0; } -STATIC int -xfs_fs_start_syncd( - bhv_vfs_t *vfsp) -{ - vfsp->vfs_sync_work.w_syncer = vfs_sync_worker; - vfsp->vfs_sync_work.w_vfs = vfsp; - vfsp->vfs_sync_task = kthread_run(xfssyncd, vfsp, "xfssyncd"); - if (IS_ERR(vfsp->vfs_sync_task)) - return -PTR_ERR(vfsp->vfs_sync_task); - return 0; -} - STATIC void -xfs_fs_stop_syncd( - bhv_vfs_t *vfsp) +xfs_free_fsname( + struct xfs_mount *mp) { - kthread_stop(vfsp->vfs_sync_task); + kfree(mp->m_fsname); + kfree(mp->m_rtname); + kfree(mp->m_logname); } STATIC void xfs_fs_put_super( struct super_block *sb) { - bhv_vfs_t *vfsp = vfs_from_sb(sb); + struct xfs_mount *mp = XFS_M(sb); + struct xfs_inode *rip = mp->m_rootip; + int unmount_event_flags = 0; int error; - xfs_fs_stop_syncd(vfsp); - bhv_vfs_sync(vfsp, SYNC_ATTR | SYNC_DELWRI, NULL); - error = bhv_vfs_unmount(vfsp, 0, NULL); - if (error) { - printk("XFS: unmount got error=%d\n", error); - printk("%s: vfs=0x%p left dangling!\n", __FUNCTION__, vfsp); - } else { - vfs_deallocate(vfsp); + kthread_stop(mp->m_sync_task); + + xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI); + +#ifdef HAVE_DMAPI + if (mp->m_flags & XFS_MOUNT_DMAPI) { + unmount_event_flags = + (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? + 0 : DM_FLAGS_UNWANTED; + /* + * Ignore error from dmapi here, first unmount is not allowed + * to fail anyway, and second we wouldn't want to fail a + * unmount because of dmapi. + */ + XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, + NULL, NULL, 0, 0, unmount_event_flags); } +#endif + + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); + + XFS_bflush(mp->m_ddev_targp); + error = xfs_unmount_flush(mp, 0); + WARN_ON(error); + + /* + * If we're forcing a shutdown, typically because of a media error, + * we want to make sure we invalidate dirty pages that belong to + * referenced vnodes as well. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE); + ASSERT(error != EFSCORRUPTED); + } + + if (mp->m_flags & XFS_MOUNT_DMAPI) { + XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, + unmount_event_flags); + } + + xfs_unmountfs(mp); + xfs_freesb(mp); + xfs_icsb_destroy_counters(mp); + xfs_close_devices(mp); + xfs_qmops_put(mp); + xfs_dmops_put(mp); + xfs_free_fsname(mp); + kfree(mp); } STATIC void @@ -645,7 +1161,7 @@ xfs_fs_write_super( struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) - bhv_vfs_sync(vfs_from_sb(sb), SYNC_FSDATA, NULL); + xfs_sync(XFS_M(sb), SYNC_FSDATA); sb->s_dirt = 0; } @@ -654,35 +1170,55 @@ xfs_fs_sync_super( struct super_block *sb, int wait) { - bhv_vfs_t *vfsp = vfs_from_sb(sb); + struct xfs_mount *mp = XFS_M(sb); int error; int flags; - if (unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - flags = SYNC_QUIESCE; - else - flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0); + /* + * Treat a sync operation like a freeze. This is to work + * around a race in sync_inodes() which works in two phases + * - an asynchronous flush, which can write out an inode + * without waiting for file size updates to complete, and a + * synchronous flush, which wont do anything because the + * async flush removed the inode's dirty flag. Also + * sync_inodes() will not see any files that just have + * outstanding transactions to be flushed because we don't + * dirty the Linux inode until after the transaction I/O + * completes. + */ + if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) { + /* + * First stage of freeze - no more writers will make progress + * now we are here, so we flush delwri and delalloc buffers + * here, then wait for all I/O to complete. Data is frozen at + * that point. Metadata is not frozen, transactions can still + * occur here so don't bother flushing the buftarg (i.e + * SYNC_QUIESCE) because it'll just get dirty again. + */ + flags = SYNC_DATA_QUIESCE; + } else + flags = SYNC_FSDATA; - error = bhv_vfs_sync(vfsp, flags, NULL); + error = xfs_sync(mp, flags); sb->s_dirt = 0; if (unlikely(laptop_mode)) { - int prev_sync_seq = vfsp->vfs_sync_seq; + int prev_sync_seq = mp->m_sync_seq; /* * The disk must be active because we're syncing. * We schedule xfssyncd now (now that the disk is * active) instead of later (when it might not be). */ - wake_up_process(vfsp->vfs_sync_task); + wake_up_process(mp->m_sync_task); /* * We have to wait for the sync iteration to complete. * If we don't, the disk activity caused by the sync * will come after the sync is completed, and that * triggers another sync from laptop mode. */ - wait_event(vfsp->vfs_wait_single_sync_task, - vfsp->vfs_sync_seq != prev_sync_seq); + wait_event(mp->m_wait_single_sync_task, + mp->m_sync_seq != prev_sync_seq); } return -error; @@ -693,8 +1229,44 @@ xfs_fs_statfs( struct dentry *dentry, struct kstatfs *statp) { - return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, - vn_from_inode(dentry->d_inode)); + struct xfs_mount *mp = XFS_M(dentry->d_sb); + xfs_sb_t *sbp = &mp->m_sb; + __uint64_t fakeinos, id; + xfs_extlen_t lsize; + + statp->f_type = XFS_SB_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; + + id = huge_encode_dev(mp->m_ddev_targp->bt_dev); + statp->f_fsid.val[0] = (u32)id; + statp->f_fsid.val[1] = (u32)(id >> 32); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + + spin_lock(&mp->m_sb_lock); + statp->f_bsize = sbp->sb_blocksize; + lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; + statp->f_blocks = sbp->sb_dblocks - lsize; + statp->f_bfree = statp->f_bavail = + sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + fakeinos = statp->f_bfree << sbp->sb_inopblog; +#if XFS_BIG_INUMS + fakeinos += mp->m_inoadd; +#endif + statp->f_files = + MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + if (mp->m_maxicount) +#if XFS_BIG_INUMS + if (!mp->m_inoadd) +#endif + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); + statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + spin_unlock(&mp->m_sb_lock); + + XFS_QM_DQSTATVFS(XFS_I(dentry->d_inode), statp); + return 0; } STATIC int @@ -703,22 +1275,90 @@ xfs_fs_remount( int *flags, char *options) { - bhv_vfs_t *vfsp = vfs_from_sb(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb, 0); - int error; + struct xfs_mount *mp = XFS_M(sb); + substring_t args[MAX_OPT_ARGS]; + char *p; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + mp->m_flags |= XFS_MOUNT_BARRIER; + + /* + * Test if barriers are actually working if we can, + * else delay this check until the filesystem is + * marked writeable. + */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) + xfs_mountfs_check_barriers(mp); + break; + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; + default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 + printk(KERN_INFO + "XFS: mount option \"%s\" not supported for remount\n", p); + return -EINVAL; +#else + break; +#endif + } + } - error = bhv_vfs_parseargs(vfsp, options, args, 1); - if (!error) - error = bhv_vfs_mntupdate(vfsp, flags, args); - kmem_free(args, sizeof(*args)); - return -error; + /* rw/ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + mp->m_flags &= ~XFS_MOUNT_RDONLY; + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_mountfs_check_barriers(mp); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + xfs_filestream_flush(mp); + xfs_sync(mp, SYNC_DATA_QUIESCE); + xfs_attr_quiesce(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } + + return 0; } +/* + * Second stage of a freeze. The data is already frozen so we only + * need to take care of themetadata. Once that's done write a dummy + * record to dirty the log in case of a crash while frozen. + */ STATIC void xfs_fs_lockfs( struct super_block *sb) { - bhv_vfs_freeze(vfs_from_sb(sb)); + struct xfs_mount *mp = XFS_M(sb); + + xfs_attr_quiesce(mp); + xfs_fs_log_dummy(mp); } STATIC int @@ -726,7 +1366,7 @@ xfs_fs_show_options( struct seq_file *m, struct vfsmount *mnt) { - return -bhv_vfs_showargs(vfs_from_sb(mnt->mnt_sb), m); + return -xfs_showargs(XFS_M(mnt->mnt_sb), m); } STATIC int @@ -734,7 +1374,7 @@ xfs_fs_quotasync( struct super_block *sb, int type) { - return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XQUOTASYNC, 0, NULL); + return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL); } STATIC int @@ -742,7 +1382,7 @@ xfs_fs_getxstate( struct super_block *sb, struct fs_quota_stat *fqs) { - return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XGETQSTAT, 0, (caddr_t)fqs); + return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs); } STATIC int @@ -751,7 +1391,7 @@ xfs_fs_setxstate( unsigned int flags, int op) { - return -bhv_vfs_quotactl(vfs_from_sb(sb), op, 0, (caddr_t)&flags); + return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags); } STATIC int @@ -761,7 +1401,7 @@ xfs_fs_getxquota( qid_t id, struct fs_disk_quota *fdq) { - return -bhv_vfs_quotactl(vfs_from_sb(sb), + return -XFS_QM_QUOTACTL(XFS_M(sb), (type == USRQUOTA) ? Q_XGETQUOTA : ((type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETPQUOTA), id, (caddr_t)fdq); @@ -774,90 +1414,417 @@ xfs_fs_setxquota( qid_t id, struct fs_disk_quota *fdq) { - return -bhv_vfs_quotactl(vfs_from_sb(sb), + return -XFS_QM_QUOTACTL(XFS_M(sb), (type == USRQUOTA) ? Q_XSETQLIM : ((type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETPQLIM), id, (caddr_t)fdq); } +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + */ +STATIC int +xfs_start_flags( + struct xfs_mount_args *ap, + struct xfs_mount *mp) +{ + int error; + + /* Values are in BBs */ + if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + mp->m_dalign = ap->sunit; + mp->m_swidth = ap->swidth; + } + + if (ap->logbufs != -1 && + ap->logbufs != 0 && + (ap->logbufs < XLOG_MIN_ICLOGS || + ap->logbufs > XLOG_MAX_ICLOGS)) { + cmn_err(CE_WARN, + "XFS: invalid logbufs value: %d [not %d-%d]", + ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return XFS_ERROR(EINVAL); + } + mp->m_logbufs = ap->logbufs; + if (ap->logbufsize != -1 && + ap->logbufsize != 0 && + (ap->logbufsize < XLOG_MIN_RECORD_BSIZE || + ap->logbufsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(ap->logbufsize))) { + cmn_err(CE_WARN, + "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + ap->logbufsize); + return XFS_ERROR(EINVAL); + } + + error = ENOMEM; + + mp->m_logbsize = ap->logbufsize; + mp->m_fsname_len = strlen(ap->fsname) + 1; + + mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL); + if (!mp->m_fsname) + goto out; + + if (ap->rtname[0]) { + mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL); + if (!mp->m_rtname) + goto out_free_fsname; + + } + + if (ap->logname[0]) { + mp->m_logname = kstrdup(ap->logname, GFP_KERNEL); + if (!mp->m_logname) + goto out_free_rtname; + } + + if (ap->flags & XFSMNT_WSYNC) + mp->m_flags |= XFS_MOUNT_WSYNC; +#if XFS_BIG_INUMS + if (ap->flags & XFSMNT_INO64) { + mp->m_flags |= XFS_MOUNT_INO64; + mp->m_inoadd = XFS_INO64_OFFSET; + } +#endif + if (ap->flags & XFSMNT_RETERR) + mp->m_flags |= XFS_MOUNT_RETERR; + if (ap->flags & XFSMNT_NOALIGN) + mp->m_flags |= XFS_MOUNT_NOALIGN; + if (ap->flags & XFSMNT_SWALLOC) + mp->m_flags |= XFS_MOUNT_SWALLOC; + if (ap->flags & XFSMNT_OSYNCISOSYNC) + mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; + if (ap->flags & XFSMNT_32BITINODES) + mp->m_flags |= XFS_MOUNT_32BITINODES; + + if (ap->flags & XFSMNT_IOSIZE) { + if (ap->iosizelog > XFS_MAX_IO_LOG || + ap->iosizelog < XFS_MIN_IO_LOG) { + cmn_err(CE_WARN, + "XFS: invalid log iosize: %d [not %d-%d]", + ap->iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return XFS_ERROR(EINVAL); + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; + } + + if (ap->flags & XFSMNT_IKEEP) + mp->m_flags |= XFS_MOUNT_IKEEP; + if (ap->flags & XFSMNT_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (ap->flags & XFSMNT_ATTR2) + mp->m_flags |= XFS_MOUNT_ATTR2; + if (ap->flags & XFSMNT_NOATTR2) + mp->m_flags |= XFS_MOUNT_NOATTR2; + + if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + + /* + * no recovery flag requires a read-only mount + */ + if (ap->flags & XFSMNT_NORECOVERY) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + cmn_err(CE_WARN, + "XFS: tried to mount a FS read-write without recovery!"); + return XFS_ERROR(EINVAL); + } + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } + + if (ap->flags & XFSMNT_NOUUID) + mp->m_flags |= XFS_MOUNT_NOUUID; + if (ap->flags & XFSMNT_BARRIER) + mp->m_flags |= XFS_MOUNT_BARRIER; + else + mp->m_flags &= ~XFS_MOUNT_BARRIER; + + if (ap->flags2 & XFSMNT2_FILESTREAMS) + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + + if (ap->flags & XFSMNT_DMAPI) + mp->m_flags |= XFS_MOUNT_DMAPI; + return 0; + + + out_free_rtname: + kfree(mp->m_rtname); + out_free_fsname: + kfree(mp->m_fsname); + out: + return error; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. + */ +STATIC int +xfs_finish_flags( + struct xfs_mount_args *ap, + struct xfs_mount *mp) +{ + int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); + + /* Fail a mount where the logbuf is smaller then the log stripe */ + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if ((ap->logbufsize <= 0) && + (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (ap->logbufsize > 0 && + ap->logbufsize < mp->m_sb.sb_logsunit) { + cmn_err(CE_WARN, + "XFS: logbuf size must be greater than or equal to log stripe size"); + return XFS_ERROR(EINVAL); + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { + cmn_err(CE_WARN, + "XFS: logbuf size for version 1 logs must be 16K or 32K"); + return XFS_ERROR(EINVAL); + } + } + + /* + * mkfs'ed attr2 will turn on attr2 mount unless explicitly + * told by noattr2 to turn it off + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(ap->flags & XFSMNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + cmn_err(CE_WARN, + "XFS: cannot mount a read-only filesystem as read-write"); + return XFS_ERROR(EROFS); + } + + /* + * check for shared mount. + */ + if (ap->flags & XFSMNT_SHARED) { + if (!xfs_sb_version_hasshared(&mp->m_sb)) + return XFS_ERROR(EINVAL); + + /* + * For IRIX 6.5, shared mounts must have the shared + * version bit set, have the persistent readonly + * field set, must be version 0 and can only be mounted + * read-only. + */ + if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || + (mp->m_sb.sb_shared_vn != 0)) + return XFS_ERROR(EINVAL); + + mp->m_flags |= XFS_MOUNT_SHARED; + + /* + * Shared XFS V0 can't deal with DMI. Return EINVAL. + */ + if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) + return XFS_ERROR(EINVAL); + } + + if (ap->flags & XFSMNT_UQUOTA) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + if (ap->flags & XFSMNT_UQUOTAENF) + mp->m_qflags |= XFS_UQUOTA_ENFD; + } + + if (ap->flags & XFSMNT_GQUOTA) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + if (ap->flags & XFSMNT_GQUOTAENF) + mp->m_qflags |= XFS_OQUOTA_ENFD; + } else if (ap->flags & XFSMNT_PQUOTA) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + if (ap->flags & XFSMNT_PQUOTAENF) + mp->m_qflags |= XFS_OQUOTA_ENFD; + } + + return 0; +} + STATIC int xfs_fs_fill_super( struct super_block *sb, void *data, int silent) { - struct bhv_vnode *rootvp; - struct bhv_vfs *vfsp = vfs_allocate(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb, silent); - struct kstatfs statvfs; - int error; + struct inode *root; + struct xfs_mount *mp = NULL; + struct xfs_mount_args *args; + int flags = 0, error = ENOMEM; - bhv_insert_all_vfsops(vfsp); + args = xfs_args_allocate(sb, silent); + if (!args) + return -ENOMEM; - error = bhv_vfs_parseargs(vfsp, (char *)data, args, 0); - if (error) { - bhv_remove_all_vfsops(vfsp, 1); - goto fail_vfsop; - } + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); + if (!mp) + goto out_free_args; + + spin_lock_init(&mp->m_sb_lock); + mutex_init(&mp->m_ilock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + INIT_LIST_HEAD(&mp->m_sync_list); + spin_lock_init(&mp->m_sync_lock); + init_waitqueue_head(&mp->m_wait_single_sync_task); + + mp->m_super = sb; + sb->s_fs_info = mp; + + if (sb->s_flags & MS_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + + error = xfs_parseargs(mp, (char *)data, args, 0); + if (error) + goto out_free_mp; sb_min_blocksize(sb, BBSIZE); + sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; sb->s_qcop = &xfs_quotactl_operations; sb->s_op = &xfs_super_operations; - error = bhv_vfs_mount(vfsp, args, NULL); - if (error) { - bhv_remove_all_vfsops(vfsp, 1); - goto fail_vfsop; - } + error = xfs_dmops_get(mp, args); + if (error) + goto out_free_mp; + error = xfs_qmops_get(mp, args); + if (error) + goto out_put_dmops; - error = bhv_vfs_statvfs(vfsp, &statvfs, NULL); + if (args->flags & XFSMNT_QUIET) + flags |= XFS_MFSI_QUIET; + + error = xfs_open_devices(mp, args); if (error) - goto fail_unmount; + goto out_put_qmops; + + if (xfs_icsb_init_counters(mp)) + mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; + + /* + * Setup flags based on mount(2) options and then the superblock + */ + error = xfs_start_flags(args, mp); + if (error) + goto out_free_fsname; + error = xfs_readsb(mp, flags); + if (error) + goto out_free_fsname; + error = xfs_finish_flags(args, mp); + if (error) + goto out_free_sb; + + error = xfs_setup_devices(mp); + if (error) + goto out_free_sb; + + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_mountfs_check_barriers(mp); + + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; + + error = xfs_mountfs(mp); + if (error) + goto out_filestream_unmount; + + XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); sb->s_dirt = 1; - sb->s_magic = statvfs.f_type; - sb->s_blocksize = statvfs.f_bsize; - sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1; + sb->s_magic = XFS_SB_MAGIC; + sb->s_blocksize = mp->m_sb.sb_blocksize; + sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = bhv_vfs_root(vfsp, &rootvp); - if (error) + root = igrab(VFS_I(mp->m_rootip)); + if (!root) { + error = ENOENT; goto fail_unmount; - - sb->s_root = d_alloc_root(vn_to_inode(rootvp)); + } + if (is_bad_inode(root)) { + error = EINVAL; + goto fail_vnrele; + } + sb->s_root = d_alloc_root(root); if (!sb->s_root) { error = ENOMEM; goto fail_vnrele; } - if (is_bad_inode(sb->s_root->d_inode)) { - error = EINVAL; + + mp->m_sync_work.w_syncer = xfs_sync_worker; + mp->m_sync_work.w_mount = mp; + mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); + if (IS_ERR(mp->m_sync_task)) { + error = -PTR_ERR(mp->m_sync_task); goto fail_vnrele; } - if ((error = xfs_fs_start_syncd(vfsp))) - goto fail_vnrele; - vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address); - kmem_free(args, sizeof(*args)); + xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); + + kfree(args); return 0; -fail_vnrele: + out_filestream_unmount: + xfs_filestream_unmount(mp); + out_free_sb: + xfs_freesb(mp); + out_free_fsname: + xfs_free_fsname(mp); + xfs_icsb_destroy_counters(mp); + xfs_close_devices(mp); + out_put_qmops: + xfs_qmops_put(mp); + out_put_dmops: + xfs_dmops_put(mp); + out_free_mp: + kfree(mp); + out_free_args: + kfree(args); + return -error; + + fail_vnrele: if (sb->s_root) { dput(sb->s_root); sb->s_root = NULL; } else { - VN_RELE(rootvp); + iput(root); } -fail_unmount: - bhv_vfs_unmount(vfsp, 0, NULL); + fail_unmount: + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); -fail_vfsop: - vfs_deallocate(vfsp); - kmem_free(args, sizeof(*args)); - return -error; + XFS_bflush(mp->m_ddev_targp); + error = xfs_unmount_flush(mp, 0); + WARN_ON(error); + + xfs_unmountfs(mp); + goto out_free_sb; } STATIC int @@ -872,7 +1839,7 @@ xfs_fs_get_sb( mnt); } -STATIC struct super_operations xfs_super_operations = { +static struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .write_inode = xfs_fs_write_inode, @@ -886,7 +1853,7 @@ STATIC struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, }; -STATIC struct quotactl_ops xfs_quotactl_operations = { +static struct quotactl_ops xfs_quotactl_operations = { .quota_sync = xfs_fs_quotasync, .get_xstate = xfs_fs_getxstate, .set_xstate = xfs_fs_setxstate, @@ -894,7 +1861,7 @@ STATIC struct quotactl_ops xfs_quotactl_operations = { .set_xquota = xfs_fs_setxquota, }; -STATIC struct file_system_type xfs_fs_type = { +static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", .get_sb = xfs_fs_get_sb, @@ -902,57 +1869,310 @@ STATIC struct file_system_type xfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +STATIC int __init +xfs_alloc_trace_bufs(void) +{ +#ifdef XFS_ALLOC_TRACE + xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_alloc_trace_buf) + goto out; +#endif +#ifdef XFS_BMAP_TRACE + xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_bmap_trace_buf) + goto out_free_alloc_trace; +#endif +#ifdef XFS_BMBT_TRACE + xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_bmbt_trace_buf) + goto out_free_bmap_trace; +#endif +#ifdef XFS_ATTR_TRACE + xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_attr_trace_buf) + goto out_free_bmbt_trace; +#endif +#ifdef XFS_DIR2_TRACE + xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL); + if (!xfs_dir2_trace_buf) + goto out_free_attr_trace; +#endif + + return 0; + +#ifdef XFS_DIR2_TRACE + out_free_attr_trace: +#endif +#ifdef XFS_ATTR_TRACE + ktrace_free(xfs_attr_trace_buf); + out_free_bmbt_trace: +#endif +#ifdef XFS_BMBT_TRACE + ktrace_free(xfs_bmbt_trace_buf); + out_free_bmap_trace: +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(xfs_bmap_trace_buf); + out_free_alloc_trace: +#endif +#ifdef XFS_ALLOC_TRACE + ktrace_free(xfs_alloc_trace_buf); + out: +#endif + return -ENOMEM; +} + +STATIC void +xfs_free_trace_bufs(void) +{ +#ifdef XFS_DIR2_TRACE + ktrace_free(xfs_dir2_trace_buf); +#endif +#ifdef XFS_ATTR_TRACE + ktrace_free(xfs_attr_trace_buf); +#endif +#ifdef XFS_BMBT_TRACE + ktrace_free(xfs_bmbt_trace_buf); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(xfs_bmap_trace_buf); +#endif +#ifdef XFS_ALLOC_TRACE + ktrace_free(xfs_alloc_trace_buf); +#endif +} + +STATIC int __init +xfs_init_zones(void) +{ + xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | + KM_ZONE_SPREAD, + xfs_fs_inode_init_once); + if (!xfs_vnode_zone) + goto out; + + xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); + if (!xfs_ioend_zone) + goto out_destroy_vnode_zone; + + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); + if (!xfs_ioend_pool) + goto out_destroy_ioend_zone; + + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); + if (!xfs_log_ticket_zone) + goto out_destroy_ioend_pool; + + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + if (!xfs_bmap_free_item_zone) + goto out_destroy_log_ticket_zone; + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + if (!xfs_btree_cur_zone) + goto out_destroy_bmap_free_item_zone; + + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); + if (!xfs_da_state_zone) + goto out_destroy_btree_cur_zone; + + xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); + if (!xfs_dabuf_zone) + goto out_destroy_da_state_zone; + + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); + if (!xfs_ifork_zone) + goto out_destroy_dabuf_zone; + + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + if (!xfs_trans_zone) + goto out_destroy_ifork_zone; + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + + (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + NBWORD) * sizeof(int))), "xfs_buf_item"); + if (!xfs_buf_item_zone) + goto out_destroy_trans_zone; + + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) + goto out_destroy_efd_zone; + + xfs_inode_zone = + kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | + KM_ZONE_SPREAD, NULL); + if (!xfs_inode_zone) + goto out_destroy_efi_zone; + + xfs_ili_zone = + kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", + KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + +#ifdef CONFIG_XFS_POSIX_ACL + xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl"); + if (!xfs_acl_zone) + goto out_destroy_ili_zone; +#endif + + return 0; + +#ifdef CONFIG_XFS_POSIX_ACL + out_destroy_ili_zone: +#endif + kmem_zone_destroy(xfs_ili_zone); + out_destroy_inode_zone: + kmem_zone_destroy(xfs_inode_zone); + out_destroy_efi_zone: + kmem_zone_destroy(xfs_efi_zone); + out_destroy_efd_zone: + kmem_zone_destroy(xfs_efd_zone); + out_destroy_buf_item_zone: + kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_trans_zone: + kmem_zone_destroy(xfs_trans_zone); + out_destroy_ifork_zone: + kmem_zone_destroy(xfs_ifork_zone); + out_destroy_dabuf_zone: + kmem_zone_destroy(xfs_dabuf_zone); + out_destroy_da_state_zone: + kmem_zone_destroy(xfs_da_state_zone); + out_destroy_btree_cur_zone: + kmem_zone_destroy(xfs_btree_cur_zone); + out_destroy_bmap_free_item_zone: + kmem_zone_destroy(xfs_bmap_free_item_zone); + out_destroy_log_ticket_zone: + kmem_zone_destroy(xfs_log_ticket_zone); + out_destroy_ioend_pool: + mempool_destroy(xfs_ioend_pool); + out_destroy_ioend_zone: + kmem_zone_destroy(xfs_ioend_zone); + out_destroy_vnode_zone: + kmem_zone_destroy(xfs_vnode_zone); + out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_zones(void) +{ +#ifdef CONFIG_XFS_POSIX_ACL + kmem_zone_destroy(xfs_acl_zone); +#endif + kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_inode_zone); + kmem_zone_destroy(xfs_efi_zone); + kmem_zone_destroy(xfs_efd_zone); + kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_trans_zone); + kmem_zone_destroy(xfs_ifork_zone); + kmem_zone_destroy(xfs_dabuf_zone); + kmem_zone_destroy(xfs_da_state_zone); + kmem_zone_destroy(xfs_btree_cur_zone); + kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_zone_destroy(xfs_log_ticket_zone); + mempool_destroy(xfs_ioend_pool); + kmem_zone_destroy(xfs_ioend_zone); + kmem_zone_destroy(xfs_vnode_zone); + +} STATIC int __init -init_xfs_fs( void ) +init_xfs_fs(void) { int error; - struct sysinfo si; static char message[] __initdata = KERN_INFO \ XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"; printk(message); - si_meminfo(&si); - xfs_physmem = si.totalram; - ktrace_init(64); + vn_init(); + xfs_dir_startup(); error = xfs_init_zones(); - if (error < 0) - goto undo_zones; + if (error) + goto out; + + error = xfs_alloc_trace_bufs(); + if (error) + goto out_destroy_zones; + + error = xfs_mru_cache_init(); + if (error) + goto out_free_trace_buffers; + + error = xfs_filestream_init(); + if (error) + goto out_mru_cache_uninit; error = xfs_buf_init(); - if (error < 0) - goto undo_buffers; + if (error) + goto out_filestream_uninit; + + error = xfs_init_procfs(); + if (error) + goto out_buf_terminate; + + error = xfs_sysctl_register(); + if (error) + goto out_cleanup_procfs; - vn_init(); - xfs_init(); - uuid_init(); vfs_initquota(); error = register_filesystem(&xfs_fs_type); if (error) - goto undo_register; + goto out_sysctl_unregister; return 0; -undo_register: + out_sysctl_unregister: + xfs_sysctl_unregister(); + out_cleanup_procfs: + xfs_cleanup_procfs(); + out_buf_terminate: xfs_buf_terminate(); - -undo_buffers: + out_filestream_uninit: + xfs_filestream_uninit(); + out_mru_cache_uninit: + xfs_mru_cache_uninit(); + out_free_trace_buffers: + xfs_free_trace_bufs(); + out_destroy_zones: xfs_destroy_zones(); - -undo_zones: + out: return error; } STATIC void __exit -exit_xfs_fs( void ) +exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); - xfs_cleanup(); + xfs_sysctl_unregister(); + xfs_cleanup_procfs(); xfs_buf_terminate(); + xfs_filestream_uninit(); + xfs_mru_cache_uninit(); + xfs_free_trace_bufs(); xfs_destroy_zones(); ktrace_uninit(); } diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 33dd1ca..fe2ef4e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -18,6 +18,8 @@ #ifndef __XFS_SUPER_H__ #define __XFS_SUPER_H__ +#include <linux/exportfs.h> + #ifdef CONFIG_XFS_DMAPI # define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops) # define vfs_initdmapi() dmapi_init() @@ -48,13 +50,7 @@ extern void xfs_qm_exit(void); # define set_posix_acl_flag(sb) do { } while (0) #endif -#ifdef CONFIG_XFS_SECURITY -# define XFS_SECURITY_STRING "security attributes, " -# define ENOSECURITY 0 -#else -# define XFS_SECURITY_STRING -# define ENOSECURITY EOPNOTSUPP -#endif +#define XFS_SECURITY_STRING "security attributes, " #ifdef CONFIG_XFS_RT # define XFS_REALTIME_STRING "realtime, " @@ -105,16 +101,14 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); -extern void xfs_initialize_vnode(bhv_desc_t *, bhv_vnode_t *, bhv_desc_t *, int); - extern void xfs_flush_inode(struct xfs_inode *); extern void xfs_flush_device(struct xfs_inode *); -extern int xfs_blkdev_get(struct xfs_mount *, const char *, - struct block_device **); -extern void xfs_blkdev_put(struct block_device *); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); -extern struct export_operations xfs_export_operations; +extern const struct export_operations xfs_export_operations; +extern struct xattr_handler *xfs_xattr_handlers[]; + +#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index af24653..7dacb5b 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -54,107 +54,222 @@ xfs_stats_clear_proc_handler( } #endif /* CONFIG_PROC_FS */ -STATIC ctl_table xfs_table[] = { - {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max}, - - {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max}, - - {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max}, - - {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.panic_mask.min, &xfs_params.panic_mask.max}, - - {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.error_level.min, &xfs_params.error_level.max}, - - {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max}, - - {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max}, - - {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max}, - - {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max}, - - {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max}, - - {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max}, - - {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max}, - - {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.rotorstep.min, &xfs_params.rotorstep.max}, - - {XFS_INHERIT_NODFRG, "inherit_nodefrag", &xfs_params.inherit_nodfrg.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nodfrg.min, &xfs_params.inherit_nodfrg.max}, +static ctl_table xfs_table[] = { + { + .ctl_name = XFS_RESTRICT_CHOWN, + .procname = "restrict_chown", + .data = &xfs_params.restrict_chown.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.restrict_chown.min, + .extra2 = &xfs_params.restrict_chown.max + }, + { + .ctl_name = XFS_SGID_INHERIT, + .procname = "irix_sgid_inherit", + .data = &xfs_params.sgid_inherit.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.sgid_inherit.min, + .extra2 = &xfs_params.sgid_inherit.max + }, + { + .ctl_name = XFS_SYMLINK_MODE, + .procname = "irix_symlink_mode", + .data = &xfs_params.symlink_mode.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.symlink_mode.min, + .extra2 = &xfs_params.symlink_mode.max + }, + { + .ctl_name = XFS_PANIC_MASK, + .procname = "panic_mask", + .data = &xfs_params.panic_mask.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.panic_mask.min, + .extra2 = &xfs_params.panic_mask.max + }, + { + .ctl_name = XFS_ERRLEVEL, + .procname = "error_level", + .data = &xfs_params.error_level.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.error_level.min, + .extra2 = &xfs_params.error_level.max + }, + { + .ctl_name = XFS_SYNCD_TIMER, + .procname = "xfssyncd_centisecs", + .data = &xfs_params.syncd_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.syncd_timer.min, + .extra2 = &xfs_params.syncd_timer.max + }, + { + .ctl_name = XFS_INHERIT_SYNC, + .procname = "inherit_sync", + .data = &xfs_params.inherit_sync.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_sync.min, + .extra2 = &xfs_params.inherit_sync.max + }, + { + .ctl_name = XFS_INHERIT_NODUMP, + .procname = "inherit_nodump", + .data = &xfs_params.inherit_nodump.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_nodump.min, + .extra2 = &xfs_params.inherit_nodump.max + }, + { + .ctl_name = XFS_INHERIT_NOATIME, + .procname = "inherit_noatime", + .data = &xfs_params.inherit_noatim.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_noatim.min, + .extra2 = &xfs_params.inherit_noatim.max + }, + { + .ctl_name = XFS_BUF_TIMER, + .procname = "xfsbufd_centisecs", + .data = &xfs_params.xfs_buf_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.xfs_buf_timer.min, + .extra2 = &xfs_params.xfs_buf_timer.max + }, + { + .ctl_name = XFS_BUF_AGE, + .procname = "age_buffer_centisecs", + .data = &xfs_params.xfs_buf_age.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.xfs_buf_age.min, + .extra2 = &xfs_params.xfs_buf_age.max + }, + { + .ctl_name = XFS_INHERIT_NOSYM, + .procname = "inherit_nosymlinks", + .data = &xfs_params.inherit_nosym.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_nosym.min, + .extra2 = &xfs_params.inherit_nosym.max + }, + { + .ctl_name = XFS_ROTORSTEP, + .procname = "rotorstep", + .data = &xfs_params.rotorstep.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.rotorstep.min, + .extra2 = &xfs_params.rotorstep.max + }, + { + .ctl_name = XFS_INHERIT_NODFRG, + .procname = "inherit_nodefrag", + .data = &xfs_params.inherit_nodfrg.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_nodfrg.min, + .extra2 = &xfs_params.inherit_nodfrg.max + }, + { + .ctl_name = XFS_FILESTREAM_TIMER, + .procname = "filestream_centisecs", + .data = &xfs_params.fstrm_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.fstrm_timer.min, + .extra2 = &xfs_params.fstrm_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS - {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val, - sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler, - &sysctl_intvec, NULL, - &xfs_params.stats_clear.min, &xfs_params.stats_clear.max}, + { + .ctl_name = XFS_STATS_CLEAR, + .procname = "stats_clear", + .data = &xfs_params.stats_clear.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &xfs_stats_clear_proc_handler, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.stats_clear.min, + .extra2 = &xfs_params.stats_clear.max + }, #endif /* CONFIG_PROC_FS */ - {0} + {} }; -STATIC ctl_table xfs_dir_table[] = { - {FS_XFS, "xfs", NULL, 0, 0555, xfs_table}, - {0} +static ctl_table xfs_dir_table[] = { + { + .ctl_name = FS_XFS, + .procname = "xfs", + .mode = 0555, + .child = xfs_table + }, + {} }; -STATIC ctl_table xfs_root_table[] = { - {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table}, - {0} +static ctl_table xfs_root_table[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = xfs_dir_table + }, + {} }; -void +int xfs_sysctl_register(void) { - xfs_table_header = register_sysctl_table(xfs_root_table, 1); + xfs_table_header = register_sysctl_table(xfs_root_table); + if (!xfs_table_header) + return -ENOMEM; + return 0; } void xfs_sysctl_unregister(void) { - if (xfs_table_header) - unregister_sysctl_table(xfs_table_header); + unregister_sysctl_table(xfs_table_header); } diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h index a631fb8..4aadb80 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -47,6 +47,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ + xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ } xfs_param_t; /* @@ -86,15 +87,16 @@ enum { XFS_INHERIT_NOSYM = 19, XFS_ROTORSTEP = 20, XFS_INHERIT_NODFRG = 21, + XFS_FILESTREAM_TIMER = 22, }; extern xfs_param_t xfs_params; #ifdef CONFIG_SYSCTL -extern void xfs_sysctl_register(void); +extern int xfs_sysctl_register(void); extern void xfs_sysctl_unregister(void); #else -# define xfs_sysctl_register() do { } while (0) +# define xfs_sysctl_register() (0) # define xfs_sysctl_unregister() do { } while (0) #endif /* CONFIG_SYSCTL */ diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c deleted file mode 100644 index 6145e8b..0000000 --- a/fs/xfs/linux-2.6/xfs_vfs.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_inum.h" -#include "xfs_log.h" -#include "xfs_clnt.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_imap.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_quota.h" - -int -vfs_mount( - struct bhv_desc *bdp, - struct xfs_mount_args *args, - struct cred *cr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_mount) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr)); -} - -int -vfs_parseargs( - struct bhv_desc *bdp, - char *s, - struct xfs_mount_args *args, - int f) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_parseargs) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f)); -} - -int -vfs_showargs( - struct bhv_desc *bdp, - struct seq_file *m) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_showargs) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_showargs)(next, m)); -} - -int -vfs_unmount( - struct bhv_desc *bdp, - int fl, - struct cred *cr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_unmount) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr)); -} - -int -vfs_mntupdate( - struct bhv_desc *bdp, - int *fl, - struct xfs_mount_args *args) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_mntupdate) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args)); -} - -int -vfs_root( - struct bhv_desc *bdp, - struct bhv_vnode **vpp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_root) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_root)(next, vpp)); -} - -int -vfs_statvfs( - struct bhv_desc *bdp, - bhv_statvfs_t *statp, - struct bhv_vnode *vp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_statvfs) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_statvfs)(next, statp, vp)); -} - -int -vfs_sync( - struct bhv_desc *bdp, - int fl, - struct cred *cr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_sync) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr)); -} - -int -vfs_vget( - struct bhv_desc *bdp, - struct bhv_vnode **vpp, - struct fid *fidp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_vget) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp)); -} - -int -vfs_dmapiops( - struct bhv_desc *bdp, - caddr_t addr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_dmapiops) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr)); -} - -int -vfs_quotactl( - struct bhv_desc *bdp, - int cmd, - int id, - caddr_t addr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_quotactl) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr)); -} - -void -vfs_init_vnode( - struct bhv_desc *bdp, - struct bhv_vnode *vp, - struct bhv_desc *bp, - int unlock) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_init_vnode) - next = BHV_NEXT(next); - ((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock)); -} - -void -vfs_force_shutdown( - struct bhv_desc *bdp, - int fl, - char *file, - int line) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_force_shutdown) - next = BHV_NEXT(next); - ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line)); -} - -void -vfs_freeze( - struct bhv_desc *bdp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_freeze) - next = BHV_NEXT(next); - ((*bhvtovfsops(next)->vfs_freeze)(next)); -} - -bhv_vfs_t * -vfs_allocate( - struct super_block *sb) -{ - struct bhv_vfs *vfsp; - - vfsp = kmem_zalloc(sizeof(bhv_vfs_t), KM_SLEEP); - bhv_head_init(VFS_BHVHEAD(vfsp), "vfs"); - INIT_LIST_HEAD(&vfsp->vfs_sync_list); - spin_lock_init(&vfsp->vfs_sync_lock); - init_waitqueue_head(&vfsp->vfs_wait_single_sync_task); - - vfsp->vfs_super = sb; - sb->s_fs_info = vfsp; - - if (sb->s_flags & MS_RDONLY) - vfsp->vfs_flag |= VFS_RDONLY; - - return vfsp; -} - -bhv_vfs_t * -vfs_from_sb( - struct super_block *sb) -{ - return (bhv_vfs_t *)sb->s_fs_info; -} - -void -vfs_deallocate( - struct bhv_vfs *vfsp) -{ - bhv_head_destroy(VFS_BHVHEAD(vfsp)); - kmem_free(vfsp, sizeof(bhv_vfs_t)); -} - -void -vfs_insertops( - struct bhv_vfs *vfsp, - struct bhv_module_vfsops *vfsops) -{ - struct bhv_desc *bdp; - - bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP); - bhv_desc_init(bdp, NULL, vfsp, vfsops); - bhv_insert(&vfsp->vfs_bh, bdp); -} - -void -vfs_insertbhv( - struct bhv_vfs *vfsp, - struct bhv_desc *bdp, - struct bhv_vfsops *vfsops, - void *mount) -{ - bhv_desc_init(bdp, mount, vfsp, vfsops); - bhv_insert_initial(&vfsp->vfs_bh, bdp); -} - -void -bhv_remove_vfsops( - struct bhv_vfs *vfsp, - int pos) -{ - struct bhv_desc *bhv; - - bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos); - if (!bhv) - return; - bhv_remove(&vfsp->vfs_bh, bhv); - kmem_free(bhv, sizeof(*bhv)); -} - -void -bhv_remove_all_vfsops( - struct bhv_vfs *vfsp, - int freebase) -{ - struct xfs_mount *mp; - - bhv_remove_vfsops(vfsp, VFS_POSITION_QM); - bhv_remove_vfsops(vfsp, VFS_POSITION_DM); - if (!freebase) - return; - mp = XFS_VFSTOM(vfsp); - VFS_REMOVEBHV(vfsp, &mp->m_bhv); - xfs_mount_free(mp, 0); -} - -void -bhv_insert_all_vfsops( - struct bhv_vfs *vfsp) -{ - struct xfs_mount *mp; - - mp = xfs_mount_init(); - vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp); - vfs_insertdmapi(vfsp); - vfs_insertquota(vfsp); -} diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index 91fc2c4..7e60c77 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -21,68 +21,25 @@ #include <linux/vfs.h> #include "xfs_fs.h" -struct bhv_vfs; -struct bhv_vnode; +struct inode; struct fid; struct cred; struct seq_file; struct super_block; +struct xfs_inode; +struct xfs_mount; struct xfs_mount_args; typedef struct kstatfs bhv_statvfs_t; typedef struct bhv_vfs_sync_work { struct list_head w_list; - struct bhv_vfs *w_vfs; + struct xfs_mount *w_mount; void *w_data; /* syncer routine argument */ - void (*w_syncer)(struct bhv_vfs *, void *); + void (*w_syncer)(struct xfs_mount *, void *); } bhv_vfs_sync_work_t; -typedef struct bhv_vfs { - u_int vfs_flag; /* flags */ - xfs_fsid_t vfs_fsid; /* file system ID */ - xfs_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */ - bhv_head_t vfs_bh; /* head of vfs behavior chain */ - struct super_block *vfs_super; /* generic superblock pointer */ - struct task_struct *vfs_sync_task; /* generalised sync thread */ - bhv_vfs_sync_work_t vfs_sync_work; /* work item for VFS_SYNC */ - struct list_head vfs_sync_list; /* sync thread work item list */ - spinlock_t vfs_sync_lock; /* work item list lock */ - int vfs_sync_seq; /* sync thread generation no. */ - wait_queue_head_t vfs_wait_single_sync_task; -} bhv_vfs_t; - -#define bhvtovfs(bdp) ( (struct bhv_vfs *)BHV_VOBJ(bdp) ) -#define bhvtovfsops(bdp) ( (struct bhv_vfsops *)BHV_OPS(bdp) ) -#define VFS_BHVHEAD(vfs) ( &(vfs)->vfs_bh ) -#define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) ) - -#define VFS_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ -#define VFS_POSITION_TOP BHV_POSITION_TOP /* chain top */ -#define VFS_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ - -typedef enum { - VFS_BHV_UNKNOWN, /* not specified */ - VFS_BHV_XFS, /* xfs */ - VFS_BHV_DM, /* data migration */ - VFS_BHV_QM, /* quota manager */ - VFS_BHV_IO, /* IO path */ - VFS_BHV_END /* housekeeping end-of-range */ -} bhv_vfs_type_t; - -#define VFS_POSITION_XFS (BHV_POSITION_BASE) -#define VFS_POSITION_DM (VFS_POSITION_BASE+10) -#define VFS_POSITION_QM (VFS_POSITION_BASE+20) -#define VFS_POSITION_IO (VFS_POSITION_BASE+30) - -#define VFS_RDONLY 0x0001 /* read-only vfs */ -#define VFS_GRPID 0x0002 /* group-ID assigned from directory */ -#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */ -#define VFS_UMOUNT 0x0008 /* unmount in progress */ -#define VFS_32BITINODES 0x0010 /* do not use inums above 32 bits */ -#define VFS_END 0x0010 /* max flag */ - #define SYNC_ATTR 0x0001 /* sync attributes */ #define SYNC_CLOSE 0x0002 /* close file system down */ #define SYNC_DELWRI 0x0004 /* look at delayed writes */ @@ -91,7 +48,21 @@ typedef enum { #define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */ #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ -#define SYNC_QUIESCE 0x0100 /* quiesce fileystem for a snapshot */ +#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ + +/* + * When remounting a filesystem read-only or freezing the filesystem, + * we have two phases to execute. This first phase is syncing the data + * before we quiesce the fielsystem, and the second is flushing all the + * inodes out after we've waited for all the transactions created by + * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE + * to ensure that the inodes are written to their location on disk + * rather than just existing in transactions in the log. This means + * after a quiesce there is no log replay required to write the inodes + * to disk (this is the main difference between a sync and a quiesce). + */ +#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT) +#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT) #define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ @@ -100,118 +71,7 @@ typedef enum { #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ -typedef int (*vfs_mount_t)(bhv_desc_t *, - struct xfs_mount_args *, struct cred *); -typedef int (*vfs_parseargs_t)(bhv_desc_t *, char *, - struct xfs_mount_args *, int); -typedef int (*vfs_showargs_t)(bhv_desc_t *, struct seq_file *); -typedef int (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *); -typedef int (*vfs_mntupdate_t)(bhv_desc_t *, int *, - struct xfs_mount_args *); -typedef int (*vfs_root_t)(bhv_desc_t *, struct bhv_vnode **); -typedef int (*vfs_statvfs_t)(bhv_desc_t *, bhv_statvfs_t *, - struct bhv_vnode *); -typedef int (*vfs_sync_t)(bhv_desc_t *, int, struct cred *); -typedef int (*vfs_vget_t)(bhv_desc_t *, struct bhv_vnode **, struct fid *); -typedef int (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t); -typedef int (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t); -typedef void (*vfs_init_vnode_t)(bhv_desc_t *, - struct bhv_vnode *, bhv_desc_t *, int); -typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int); -typedef void (*vfs_freeze_t)(bhv_desc_t *); - -typedef struct bhv_vfsops { - bhv_position_t vf_position; /* behavior chain position */ - vfs_mount_t vfs_mount; /* mount file system */ - vfs_parseargs_t vfs_parseargs; /* parse mount options */ - vfs_showargs_t vfs_showargs; /* unparse mount options */ - vfs_unmount_t vfs_unmount; /* unmount file system */ - vfs_mntupdate_t vfs_mntupdate; /* update file system options */ - vfs_root_t vfs_root; /* get root vnode */ - vfs_statvfs_t vfs_statvfs; /* file system statistics */ - vfs_sync_t vfs_sync; /* flush files */ - vfs_vget_t vfs_vget; /* get vnode from fid */ - vfs_dmapiops_t vfs_dmapiops; /* data migration */ - vfs_quotactl_t vfs_quotactl; /* disk quota */ - vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */ - vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */ - vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */ -} bhv_vfsops_t; - -/* - * Virtual filesystem operations, operating from head bhv. - */ -#define VFSHEAD(v) ((v)->vfs_bh.bh_first) -#define bhv_vfs_mount(v, ma,cr) vfs_mount(VFSHEAD(v), ma,cr) -#define bhv_vfs_parseargs(v, o,ma,f) vfs_parseargs(VFSHEAD(v), o,ma,f) -#define bhv_vfs_showargs(v, m) vfs_showargs(VFSHEAD(v), m) -#define bhv_vfs_unmount(v, f,cr) vfs_unmount(VFSHEAD(v), f,cr) -#define bhv_vfs_mntupdate(v, fl,args) vfs_mntupdate(VFSHEAD(v), fl,args) -#define bhv_vfs_root(v, vpp) vfs_root(VFSHEAD(v), vpp) -#define bhv_vfs_statvfs(v, sp,vp) vfs_statvfs(VFSHEAD(v), sp,vp) -#define bhv_vfs_sync(v, flag,cr) vfs_sync(VFSHEAD(v), flag,cr) -#define bhv_vfs_vget(v, vpp,fidp) vfs_vget(VFSHEAD(v), vpp,fidp) -#define bhv_vfs_dmapiops(v, p) vfs_dmapiops(VFSHEAD(v), p) -#define bhv_vfs_quotactl(v, c,id,p) vfs_quotactl(VFSHEAD(v), c,id,p) -#define bhv_vfs_init_vnode(v, vp,b,ul) vfs_init_vnode(VFSHEAD(v), vp,b,ul) -#define bhv_vfs_force_shutdown(v,u,f,l) vfs_force_shutdown(VFSHEAD(v), u,f,l) -#define bhv_vfs_freeze(v) vfs_freeze(VFSHEAD(v)) - -/* - * Virtual filesystem operations, operating from next bhv. - */ -#define bhv_next_vfs_mount(b, ma,cr) vfs_mount(b, ma,cr) -#define bhv_next_vfs_parseargs(b, o,ma,f) vfs_parseargs(b, o,ma,f) -#define bhv_next_vfs_showargs(b, m) vfs_showargs(b, m) -#define bhv_next_vfs_unmount(b, f,cr) vfs_unmount(b, f,cr) -#define bhv_next_vfs_mntupdate(b, fl,args) vfs_mntupdate(b, fl, args) -#define bhv_next_vfs_root(b, vpp) vfs_root(b, vpp) -#define bhv_next_vfs_statvfs(b, sp,vp) vfs_statvfs(b, sp,vp) -#define bhv_next_vfs_sync(b, flag,cr) vfs_sync(b, flag,cr) -#define bhv_next_vfs_vget(b, vpp,fidp) vfs_vget(b, vpp,fidp) -#define bhv_next_vfs_dmapiops(b, p) vfs_dmapiops(b, p) -#define bhv_next_vfs_quotactl(b, c,id,p) vfs_quotactl(b, c,id,p) -#define bhv_next_vfs_init_vnode(b, vp,b2,ul) vfs_init_vnode(b, vp,b2,ul) -#define bhv_next_force_shutdown(b, fl,f,l) vfs_force_shutdown(b, fl,f,l) -#define bhv_next_vfs_freeze(b) vfs_freeze(b) - -extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *); -extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int); -extern int vfs_showargs(bhv_desc_t *, struct seq_file *); -extern int vfs_unmount(bhv_desc_t *, int, struct cred *); -extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *); -extern int vfs_root(bhv_desc_t *, struct bhv_vnode **); -extern int vfs_statvfs(bhv_desc_t *, bhv_statvfs_t *, struct bhv_vnode *); -extern int vfs_sync(bhv_desc_t *, int, struct cred *); -extern int vfs_vget(bhv_desc_t *, struct bhv_vnode **, struct fid *); -extern int vfs_dmapiops(bhv_desc_t *, caddr_t); -extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t); -extern void vfs_init_vnode(bhv_desc_t *, struct bhv_vnode *, bhv_desc_t *, int); -extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int); -extern void vfs_freeze(bhv_desc_t *); - -#define vfs_test_for_freeze(vfs) ((vfs)->vfs_super->s_frozen) -#define vfs_wait_for_freeze(vfs,l) vfs_check_frozen((vfs)->vfs_super, (l)) - -typedef struct bhv_module_vfsops { - struct bhv_vfsops bhv_common; - void * bhv_custom; -} bhv_module_vfsops_t; - -#define vfs_bhv_lookup(v, id) (bhv_lookup_range(&(v)->vfs_bh, (id), (id))) -#define vfs_bhv_custom(b) (((bhv_module_vfsops_t*)BHV_OPS(b))->bhv_custom) -#define vfs_bhv_set_custom(b,o) ((b)->bhv_custom = (void *)(o)) -#define vfs_bhv_clr_custom(b) ((b)->bhv_custom = NULL) - -extern bhv_vfs_t *vfs_allocate(struct super_block *); -extern bhv_vfs_t *vfs_from_sb(struct super_block *); -extern void vfs_deallocate(bhv_vfs_t *); -extern void vfs_insertbhv(bhv_vfs_t *, bhv_desc_t *, bhv_vfsops_t *, void *); - -extern void vfs_insertops(bhv_vfs_t *, bhv_module_vfsops_t *); - -extern void bhv_insert_all_vfsops(struct bhv_vfs *); -extern void bhv_remove_all_vfsops(struct bhv_vfs *, int); -extern void bhv_remove_vfsops(struct bhv_vfs *, int); +#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) +#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) #endif /* __XFS_VFS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index 553fa73..b52528b 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -16,19 +16,31 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_vnodeops.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" + +/* + * And this gunk is needed for xfs_mount.h" + */ +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dmapi.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_mount.h" -uint64_t vn_generation; /* vnode generation number */ -DEFINE_SPINLOCK(vnumber_lock); /* - * Dedicated vnode inactive/reclaim sync semaphores. + * Dedicated vnode inactive/reclaim sync wait queues. * Prime number of hash buckets since address is used as the key. */ #define NVSYNC 37 #define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC]) -STATIC wait_queue_head_t vsync[NVSYNC]; +static wait_queue_head_t vsync[NVSYNC]; -void +void __init vn_init(void) { int i; @@ -39,19 +51,19 @@ vn_init(void) void vn_iowait( - bhv_vnode_t *vp) + xfs_inode_t *ip) { - wait_queue_head_t *wq = vptosync(vp); + wait_queue_head_t *wq = vptosync(ip); - wait_event(*wq, (atomic_read(&vp->v_iocount) == 0)); + wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); } void vn_iowake( - bhv_vnode_t *vp) + xfs_inode_t *ip) { - if (atomic_dec_and_test(&vp->v_iocount)) - wake_up(vptosync(vp)); + if (atomic_dec_and_test(&ip->i_iocount)) + wake_up(vptosync(ip)); } /* @@ -61,143 +73,38 @@ vn_iowake( */ void vn_ioerror( - bhv_vnode_t *vp, + xfs_inode_t *ip, int error, char *f, int l) { if (unlikely(error == -ENODEV)) - bhv_vfs_force_shutdown(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l); -} - -bhv_vnode_t * -vn_initialize( - struct inode *inode) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - - XFS_STATS_INC(vn_active); - XFS_STATS_INC(vn_alloc); - - vp->v_flag = VMODIFIED; - spinlock_init(&vp->v_lock, "v_lock"); - - spin_lock(&vnumber_lock); - if (!++vn_generation) /* v_number shouldn't be zero */ - vn_generation++; - vp->v_number = vn_generation; - spin_unlock(&vnumber_lock); - - ASSERT(VN_CACHED(vp) == 0); - - /* Initialize the first behavior and the behavior chain head. */ - vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode"); - - atomic_set(&vp->v_iocount, 0); - -#ifdef XFS_VNODE_TRACE - vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP); -#endif /* XFS_VNODE_TRACE */ - - vn_trace_exit(vp, __FUNCTION__, (inst_t *)__return_address); - return vp; -} - -/* - * Revalidate the Linux inode from the vattr. - * Note: i_size _not_ updated; we must hold the inode - * semaphore when doing that - callers responsibility. - */ -void -vn_revalidate_core( - bhv_vnode_t *vp, - bhv_vattr_t *vap) -{ - struct inode *inode = vn_to_inode(vp); - - inode->i_mode = vap->va_mode; - inode->i_nlink = vap->va_nlink; - inode->i_uid = vap->va_uid; - inode->i_gid = vap->va_gid; - inode->i_blocks = vap->va_nblocks; - inode->i_mtime = vap->va_mtime; - inode->i_ctime = vap->va_ctime; - if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (vap->va_xflags & XFS_XFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (vap->va_xflags & XFS_XFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (vap->va_xflags & XFS_XFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; -} - -/* - * Revalidate the Linux inode from the vnode. - */ -int -__vn_revalidate( - bhv_vnode_t *vp, - bhv_vattr_t *vattr) -{ - int error; - - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - vattr->va_mask = XFS_AT_STAT | XFS_AT_XFLAGS; - error = bhv_vop_getattr(vp, vattr, 0, NULL); - if (likely(!error)) { - vn_revalidate_core(vp, vattr); - VUNMODIFY(vp); - } - return -error; + xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l); } -int -vn_revalidate( - bhv_vnode_t *vp) -{ - bhv_vattr_t vattr; - - return __vn_revalidate(vp, &vattr); -} +#ifdef XFS_INODE_TRACE /* - * Add a reference to a referenced vnode. + * Reference count of Linux inode if present, -1 if the xfs_inode + * has no associated Linux inode. */ -bhv_vnode_t * -vn_hold( - bhv_vnode_t *vp) +static inline int xfs_icount(struct xfs_inode *ip) { - struct inode *inode; - - XFS_STATS_INC(vn_hold); + struct inode *vp = VFS_I(ip); - VN_LOCK(vp); - inode = igrab(vn_to_inode(vp)); - ASSERT(inode); - VN_UNLOCK(vp, 0); - - return vp; + if (vp) + return vn_count(vp); + return -1; } -#ifdef XFS_VNODE_TRACE - -#define KTRACE_ENTER(vp, vk, s, line, ra) \ - ktrace_enter( (vp)->v_trace, \ +#define KTRACE_ENTER(ip, vk, s, line, ra) \ + ktrace_enter( (ip)->i_trace, \ /* 0 */ (void *)(__psint_t)(vk), \ /* 1 */ (void *)(s), \ /* 2 */ (void *)(__psint_t) line, \ -/* 3 */ (void *)(__psint_t)(vn_count(vp)), \ +/* 3 */ (void *)(__psint_t)xfs_icount(ip), \ /* 4 */ (void *)(ra), \ -/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \ +/* 5 */ NULL, \ /* 6 */ (void *)(__psint_t)current_cpu(), \ /* 7 */ (void *)(__psint_t)current_pid(), \ /* 8 */ (void *)__return_address, \ @@ -207,32 +114,32 @@ vn_hold( * Vnode tracing code. */ void -vn_trace_entry(bhv_vnode_t *vp, const char *func, inst_t *ra) +_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra) { - KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra); + KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra); } void -vn_trace_exit(bhv_vnode_t *vp, const char *func, inst_t *ra) +_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra) { - KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra); + KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra); } void -vn_trace_hold(bhv_vnode_t *vp, char *file, int line, inst_t *ra) +xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra) { - KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra); + KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra); } void -vn_trace_ref(bhv_vnode_t *vp, char *file, int line, inst_t *ra) +_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra) { - KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra); + KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra); } void -vn_trace_rele(bhv_vnode_t *vp, char *file, int line, inst_t *ra) +xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra) { - KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra); + KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra); } -#endif /* XFS_VNODE_TRACE */ +#endif /* XFS_INODE_TRACE */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index c42b322..683ce16 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -18,102 +18,12 @@ #ifndef __XFS_VNODE_H__ #define __XFS_VNODE_H__ -struct uio; struct file; -struct bhv_vfs; -struct bhv_vattr; struct xfs_iomap; struct attrlist_cursor_kern; -typedef struct dentry bhv_vname_t; -typedef __u64 bhv_vnumber_t; - -typedef enum bhv_vflags { - VMODIFIED = 0x08, /* XFS inode state possibly differs */ - /* to the Linux inode state. */ - VTRUNCATED = 0x40, /* truncated down so flush-on-close */ -} bhv_vflags_t; - -/* - * MP locking protocols: - * v_flag, v_vfsp VN_LOCK/VN_UNLOCK - */ -typedef struct bhv_vnode { - bhv_vflags_t v_flag; /* vnode flags (see above) */ - bhv_vfs_t *v_vfsp; /* ptr to containing VFS */ - bhv_vnumber_t v_number; /* in-core vnode number */ - bhv_head_t v_bh; /* behavior head */ - spinlock_t v_lock; /* VN_LOCK/VN_UNLOCK */ - atomic_t v_iocount; /* outstanding I/O count */ -#ifdef XFS_VNODE_TRACE - struct ktrace *v_trace; /* trace header structure */ -#endif - struct inode v_inode; /* Linux inode */ - /* inode MUST be last */ -} bhv_vnode_t; - -#define VN_ISLNK(vp) S_ISLNK((vp)->v_inode.i_mode) -#define VN_ISREG(vp) S_ISREG((vp)->v_inode.i_mode) -#define VN_ISDIR(vp) S_ISDIR((vp)->v_inode.i_mode) -#define VN_ISCHR(vp) S_ISCHR((vp)->v_inode.i_mode) -#define VN_ISBLK(vp) S_ISBLK((vp)->v_inode.i_mode) - -#define VNODE_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ -#define VNODE_POSITION_TOP BHV_POSITION_TOP /* chain top */ -#define VNODE_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ - -typedef enum { - VN_BHV_UNKNOWN, /* not specified */ - VN_BHV_XFS, /* xfs */ - VN_BHV_DM, /* data migration */ - VN_BHV_QM, /* quota manager */ - VN_BHV_IO, /* IO path */ - VN_BHV_END /* housekeeping end-of-range */ -} vn_bhv_t; - -#define VNODE_POSITION_XFS (VNODE_POSITION_BASE) -#define VNODE_POSITION_DM (VNODE_POSITION_BASE+10) -#define VNODE_POSITION_QM (VNODE_POSITION_BASE+20) -#define VNODE_POSITION_IO (VNODE_POSITION_BASE+30) - /* - * Macros for dealing with the behavior descriptor inside of the vnode. - */ -#define BHV_TO_VNODE(bdp) ((bhv_vnode_t *)BHV_VOBJ(bdp)) -#define BHV_TO_VNODE_NULL(bdp) ((bhv_vnode_t *)BHV_VOBJNULL(bdp)) - -#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh))) -#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name) -#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp) -#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops) -#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops) - -/* - * Vnode to Linux inode mapping. - */ -static inline struct bhv_vnode *vn_from_inode(struct inode *inode) -{ - return container_of(inode, bhv_vnode_t, v_inode); -} -static inline struct inode *vn_to_inode(struct bhv_vnode *vnode) -{ - return &vnode->v_inode; -} - -/* - * Values for the vop_rwlock/rwunlock flags parameter. - */ -typedef enum bhv_vrwlock { - VRWLOCK_NONE, - VRWLOCK_READ, - VRWLOCK_WRITE, - VRWLOCK_WRITE_DIRECT, - VRWLOCK_TRY_READ, - VRWLOCK_TRY_WRITE -} bhv_vrwlock_t; - -/* - * Return values for bhv_vop_inactive. A return value of + * Return values for xfs_inactive. A return value of * VN_INACTIVE_NOCACHE implies that the file system behavior * has disassociated its state and bhv_desc_t from the vnode. */ @@ -121,204 +31,6 @@ typedef enum bhv_vrwlock { #define VN_INACTIVE_NOCACHE 1 /* - * Values for the cmd code given to vop_vnode_change. - */ -typedef enum bhv_vchange { - VCHANGE_FLAGS_FRLOCKS = 0, - VCHANGE_FLAGS_ENF_LOCKING = 1, - VCHANGE_FLAGS_TRUNCATED = 2, - VCHANGE_FLAGS_PAGE_DIRTY = 3, - VCHANGE_FLAGS_IOEXCL_COUNT = 4 -} bhv_vchange_t; - -typedef enum { L_FALSE, L_TRUE } lastclose_t; - -typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); -typedef int (*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *); -typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, - loff_t *, int, size_t, read_actor_t, - void *, struct cred *); -typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, - struct pipe_inode_info *, size_t, int, int, - struct cred *); -typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, - struct file *, loff_t *, size_t, int, int, - struct cred *); -typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, - int, unsigned int, void __user *); -typedef int (*vop_getattr_t)(bhv_desc_t *, struct bhv_vattr *, int, - struct cred *); -typedef int (*vop_setattr_t)(bhv_desc_t *, struct bhv_vattr *, int, - struct cred *); -typedef int (*vop_access_t)(bhv_desc_t *, int, struct cred *); -typedef int (*vop_lookup_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t **, - int, bhv_vnode_t *, struct cred *); -typedef int (*vop_create_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *, - bhv_vnode_t **, struct cred *); -typedef int (*vop_remove_t)(bhv_desc_t *, bhv_vname_t *, struct cred *); -typedef int (*vop_link_t)(bhv_desc_t *, bhv_vnode_t *, bhv_vname_t *, - struct cred *); -typedef int (*vop_rename_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *, - bhv_vname_t *, struct cred *); -typedef int (*vop_mkdir_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *, - bhv_vnode_t **, struct cred *); -typedef int (*vop_rmdir_t)(bhv_desc_t *, bhv_vname_t *, struct cred *); -typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *, - int *); -typedef int (*vop_symlink_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr*, - char *, bhv_vnode_t **, struct cred *); -typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int, - struct cred *); -typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *, - xfs_off_t, xfs_off_t); -typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *); -typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *); -typedef int (*vop_release_t)(bhv_desc_t *); -typedef int (*vop_rwlock_t)(bhv_desc_t *, bhv_vrwlock_t); -typedef void (*vop_rwunlock_t)(bhv_desc_t *, bhv_vrwlock_t); -typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -typedef int (*vop_reclaim_t)(bhv_desc_t *); -typedef int (*vop_attr_get_t)(bhv_desc_t *, const char *, char *, int *, - int, struct cred *); -typedef int (*vop_attr_set_t)(bhv_desc_t *, const char *, char *, int, - int, struct cred *); -typedef int (*vop_attr_remove_t)(bhv_desc_t *, const char *, - int, struct cred *); -typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int, - struct attrlist_cursor_kern *, struct cred *); -typedef void (*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int); -typedef void (*vop_vnode_change_t)(bhv_desc_t *, bhv_vchange_t, __psint_t); -typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, - uint64_t, int); -typedef int (*vop_iflush_t)(bhv_desc_t *, int); - - -typedef struct bhv_vnodeops { - bhv_position_t vn_position; /* position within behavior chain */ - vop_open_t vop_open; - vop_close_t vop_close; - vop_read_t vop_read; - vop_write_t vop_write; - vop_sendfile_t vop_sendfile; - vop_splice_read_t vop_splice_read; - vop_splice_write_t vop_splice_write; - vop_ioctl_t vop_ioctl; - vop_getattr_t vop_getattr; - vop_setattr_t vop_setattr; - vop_access_t vop_access; - vop_lookup_t vop_lookup; - vop_create_t vop_create; - vop_remove_t vop_remove; - vop_link_t vop_link; - vop_rename_t vop_rename; - vop_mkdir_t vop_mkdir; - vop_rmdir_t vop_rmdir; - vop_readdir_t vop_readdir; - vop_symlink_t vop_symlink; - vop_readlink_t vop_readlink; - vop_fsync_t vop_fsync; - vop_inactive_t vop_inactive; - vop_fid2_t vop_fid2; - vop_rwlock_t vop_rwlock; - vop_rwunlock_t vop_rwunlock; - vop_bmap_t vop_bmap; - vop_reclaim_t vop_reclaim; - vop_attr_get_t vop_attr_get; - vop_attr_set_t vop_attr_set; - vop_attr_remove_t vop_attr_remove; - vop_attr_list_t vop_attr_list; - vop_link_removed_t vop_link_removed; - vop_vnode_change_t vop_vnode_change; - vop_ptossvp_t vop_tosspages; - vop_pflushinvalvp_t vop_flushinval_pages; - vop_pflushvp_t vop_flush_pages; - vop_release_t vop_release; - vop_iflush_t vop_iflush; -} bhv_vnodeops_t; - -/* - * Virtual node operations, operating from head bhv. - */ -#define VNHEAD(vp) ((vp)->v_bh.bh_first) -#define VOP(op, vp) (*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op) -#define bhv_vop_open(vp, cr) VOP(vop_open, vp)(VNHEAD(vp),cr) -#define bhv_vop_close(vp, f,last,cr) VOP(vop_close, vp)(VNHEAD(vp),f,last,cr) -#define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr) \ - VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) -#define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr) \ - VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) -#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr) \ - VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr) -#define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr) \ - VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr) -#define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr) \ - VOP(vop_splice_write, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr) -#define bhv_vop_bmap(vp,of,sz,rw,b,n) \ - VOP(vop_bmap, vp)(VNHEAD(vp),of,sz,rw,b,n) -#define bhv_vop_getattr(vp, vap,f,cr) \ - VOP(vop_getattr, vp)(VNHEAD(vp), vap,f,cr) -#define bhv_vop_setattr(vp, vap,f,cr) \ - VOP(vop_setattr, vp)(VNHEAD(vp), vap,f,cr) -#define bhv_vop_access(vp, mode,cr) VOP(vop_access, vp)(VNHEAD(vp), mode,cr) -#define bhv_vop_lookup(vp,d,vpp,f,rdir,cr) \ - VOP(vop_lookup, vp)(VNHEAD(vp),d,vpp,f,rdir,cr) -#define bhv_vop_create(dvp,d,vap,vpp,cr) \ - VOP(vop_create, dvp)(VNHEAD(dvp),d,vap,vpp,cr) -#define bhv_vop_remove(dvp,d,cr) VOP(vop_remove, dvp)(VNHEAD(dvp),d,cr) -#define bhv_vop_link(dvp,fvp,d,cr) VOP(vop_link, dvp)(VNHEAD(dvp),fvp,d,cr) -#define bhv_vop_rename(fvp,fnm,tdvp,tnm,cr) \ - VOP(vop_rename, fvp)(VNHEAD(fvp),fnm,tdvp,tnm,cr) -#define bhv_vop_mkdir(dp,d,vap,vpp,cr) \ - VOP(vop_mkdir, dp)(VNHEAD(dp),d,vap,vpp,cr) -#define bhv_vop_rmdir(dp,d,cr) VOP(vop_rmdir, dp)(VNHEAD(dp),d,cr) -#define bhv_vop_readdir(vp,uiop,cr,eofp) \ - VOP(vop_readdir, vp)(VNHEAD(vp),uiop,cr,eofp) -#define bhv_vop_symlink(dvp,d,vap,tnm,vpp,cr) \ - VOP(vop_symlink, dvp)(VNHEAD(dvp),d,vap,tnm,vpp,cr) -#define bhv_vop_readlink(vp,uiop,fl,cr) \ - VOP(vop_readlink, vp)(VNHEAD(vp),uiop,fl,cr) -#define bhv_vop_fsync(vp,f,cr,b,e) VOP(vop_fsync, vp)(VNHEAD(vp),f,cr,b,e) -#define bhv_vop_inactive(vp,cr) VOP(vop_inactive, vp)(VNHEAD(vp),cr) -#define bhv_vop_release(vp) VOP(vop_release, vp)(VNHEAD(vp)) -#define bhv_vop_fid2(vp,fidp) VOP(vop_fid2, vp)(VNHEAD(vp),fidp) -#define bhv_vop_rwlock(vp,i) VOP(vop_rwlock, vp)(VNHEAD(vp),i) -#define bhv_vop_rwlock_try(vp,i) VOP(vop_rwlock, vp)(VNHEAD(vp),i) -#define bhv_vop_rwunlock(vp,i) VOP(vop_rwunlock, vp)(VNHEAD(vp),i) -#define bhv_vop_frlock(vp,c,fl,flags,offset,fr) \ - VOP(vop_frlock, vp)(VNHEAD(vp),c,fl,flags,offset,fr) -#define bhv_vop_reclaim(vp) VOP(vop_reclaim, vp)(VNHEAD(vp)) -#define bhv_vop_attr_get(vp, name, val, vallenp, fl, cred) \ - VOP(vop_attr_get, vp)(VNHEAD(vp),name,val,vallenp,fl,cred) -#define bhv_vop_attr_set(vp, name, val, vallen, fl, cred) \ - VOP(vop_attr_set, vp)(VNHEAD(vp),name,val,vallen,fl,cred) -#define bhv_vop_attr_remove(vp, name, flags, cred) \ - VOP(vop_attr_remove, vp)(VNHEAD(vp),name,flags,cred) -#define bhv_vop_attr_list(vp, buf, buflen, fl, cursor, cred) \ - VOP(vop_attr_list, vp)(VNHEAD(vp),buf,buflen,fl,cursor,cred) -#define bhv_vop_link_removed(vp, dvp, linkzero) \ - VOP(vop_link_removed, vp)(VNHEAD(vp), dvp, linkzero) -#define bhv_vop_vnode_change(vp, cmd, val) \ - VOP(vop_vnode_change, vp)(VNHEAD(vp), cmd, val) -#define bhv_vop_toss_pages(vp, first, last, fiopt) \ - VOP(vop_tosspages, vp)(VNHEAD(vp), first, last, fiopt) -#define bhv_vop_flushinval_pages(vp, first, last, fiopt) \ - VOP(vop_flushinval_pages, vp)(VNHEAD(vp),first,last,fiopt) -#define bhv_vop_flush_pages(vp, first, last, flags, fiopt) \ - VOP(vop_flush_pages, vp)(VNHEAD(vp),first,last,flags,fiopt) -#define bhv_vop_ioctl(vp, inode, filp, fl, cmd, arg) \ - VOP(vop_ioctl, vp)(VNHEAD(vp),inode,filp,fl,cmd,arg) -#define bhv_vop_iflush(vp, flags) VOP(vop_iflush, vp)(VNHEAD(vp), flags) - -/* * Flags for read/write calls - same values as IRIX */ #define IO_ISAIO 0x00001 /* don't wait for completion */ @@ -326,12 +38,9 @@ typedef struct bhv_vnodeops { #define IO_INVIS 0x00020 /* don't update inode timestamps */ /* - * Flags for vop_iflush call + * Flags for xfs_inode_flush */ #define FLUSH_SYNC 1 /* wait for flush to complete */ -#define FLUSH_INODE 2 /* flush the inode itself */ -#define FLUSH_LOG 4 /* force the last log entry for - * this inode out to disk */ /* * Flush/Invalidate options for vop_toss/flush/flushinval_pages. @@ -342,264 +51,109 @@ typedef struct bhv_vnodeops { Prevent VM access to the pages until the operation completes. */ -/* - * Vnode attributes. va_mask indicates those attributes the caller - * wants to set or extract. - */ -typedef struct bhv_vattr { - int va_mask; /* bit-mask of attributes present */ - mode_t va_mode; /* file access mode and type */ - xfs_nlink_t va_nlink; /* number of references to file */ - uid_t va_uid; /* owner user id */ - gid_t va_gid; /* owner group id */ - xfs_ino_t va_nodeid; /* file id */ - xfs_off_t va_size; /* file size in bytes */ - u_long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ - u_int va_gen; /* generation number of file */ - xfs_dev_t va_rdev; /* device the special file represents */ - __int64_t va_nblocks; /* number of blocks allocated */ - u_long va_xflags; /* random extended file flags */ - u_long va_extsize; /* file extent size */ - u_long va_nextents; /* number of extents in file */ - u_long va_anextents; /* number of attr extents in file */ - prid_t va_projid; /* project id */ -} bhv_vattr_t; - -/* - * setattr or getattr attributes - */ -#define XFS_AT_TYPE 0x00000001 -#define XFS_AT_MODE 0x00000002 -#define XFS_AT_UID 0x00000004 -#define XFS_AT_GID 0x00000008 -#define XFS_AT_FSID 0x00000010 -#define XFS_AT_NODEID 0x00000020 -#define XFS_AT_NLINK 0x00000040 -#define XFS_AT_SIZE 0x00000080 -#define XFS_AT_ATIME 0x00000100 -#define XFS_AT_MTIME 0x00000200 -#define XFS_AT_CTIME 0x00000400 -#define XFS_AT_RDEV 0x00000800 -#define XFS_AT_BLKSIZE 0x00001000 -#define XFS_AT_NBLOCKS 0x00002000 -#define XFS_AT_VCODE 0x00004000 -#define XFS_AT_MAC 0x00008000 -#define XFS_AT_UPDATIME 0x00010000 -#define XFS_AT_UPDMTIME 0x00020000 -#define XFS_AT_UPDCTIME 0x00040000 -#define XFS_AT_ACL 0x00080000 -#define XFS_AT_CAP 0x00100000 -#define XFS_AT_INF 0x00200000 -#define XFS_AT_XFLAGS 0x00400000 -#define XFS_AT_EXTSIZE 0x00800000 -#define XFS_AT_NEXTENTS 0x01000000 -#define XFS_AT_ANEXTENTS 0x02000000 -#define XFS_AT_PROJID 0x04000000 -#define XFS_AT_SIZE_NOPERM 0x08000000 -#define XFS_AT_GENCOUNT 0x10000000 - -#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ - XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ - XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ - XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\ - XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\ - XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT) - -#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ - XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ - XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ - XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID) - -#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME) - -#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME) - -#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\ - XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ - XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) - -/* - * Modes. - */ -#define VSUID S_ISUID /* set user id on execution */ -#define VSGID S_ISGID /* set group id on execution */ -#define VSVTX S_ISVTX /* save swapped text even after use */ -#define VREAD S_IRUSR /* read, write, execute permissions */ -#define VWRITE S_IWUSR -#define VEXEC S_IXUSR - -#define MODEMASK S_IALLUGO /* mode bits plus permission bits */ - -/* - * Check whether mandatory file locking is enabled. - */ -#define MANDLOCK(vp, mode) \ - (VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID) extern void vn_init(void); -extern bhv_vnode_t *vn_initialize(struct inode *); -extern int vn_revalidate(struct bhv_vnode *); -extern int __vn_revalidate(struct bhv_vnode *, bhv_vattr_t *); -extern void vn_revalidate_core(struct bhv_vnode *, bhv_vattr_t *); - -extern void vn_iowait(struct bhv_vnode *vp); -extern void vn_iowake(struct bhv_vnode *vp); - -extern void vn_ioerror(struct bhv_vnode *vp, int error, char *f, int l); - -static inline int vn_count(struct bhv_vnode *vp) -{ - return atomic_read(&vn_to_inode(vp)->i_count); -} /* - * Vnode reference counting functions (and macros for compatibility). + * Yeah, these don't take vnode anymore at all, all this should be + * cleaned up at some point. */ -extern bhv_vnode_t *vn_hold(struct bhv_vnode *); - -#if defined(XFS_VNODE_TRACE) -#define VN_HOLD(vp) \ - ((void)vn_hold(vp), \ - vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address)) -#define VN_RELE(vp) \ - (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \ - iput(vn_to_inode(vp))) -#else -#define VN_HOLD(vp) ((void)vn_hold(vp)) -#define VN_RELE(vp) (iput(vn_to_inode(vp))) -#endif +extern void vn_iowait(struct xfs_inode *ip); +extern void vn_iowake(struct xfs_inode *ip); +extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l); -static inline struct bhv_vnode *vn_grab(struct bhv_vnode *vp) +static inline int vn_count(struct inode *vp) { - struct inode *inode = igrab(vn_to_inode(vp)); - return inode ? vn_from_inode(inode) : NULL; + return atomic_read(&vp->i_count); } -/* - * Vname handling macros. - */ -#define VNAME(dentry) ((char *) (dentry)->d_name.name) -#define VNAMELEN(dentry) ((dentry)->d_name.len) -#define VNAME_TO_VNODE(dentry) (vn_from_inode((dentry)->d_inode)) - -/* - * Vnode spinlock manipulation. - */ -#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock) -#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s) +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + atomic_inc(&(VFS_I(ip)->i_count)); \ + xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ +} while (0) -static __inline__ void vn_flagset(struct bhv_vnode *vp, uint flag) -{ - spin_lock(&vp->v_lock); - vp->v_flag |= flag; - spin_unlock(&vp->v_lock); -} +#define IRELE(ip) \ +do { \ + xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ + iput(VFS_I(ip)); \ +} while (0) -static __inline__ uint vn_flagclr(struct bhv_vnode *vp, uint flag) +static inline struct inode *vn_grab(struct inode *vp) { - uint cleared; - - spin_lock(&vp->v_lock); - cleared = (vp->v_flag & flag); - vp->v_flag &= ~flag; - spin_unlock(&vp->v_lock); - return cleared; + return igrab(vp); } -#define VMODIFY(vp) vn_flagset(vp, VMODIFIED) -#define VUNMODIFY(vp) vn_flagclr(vp, VMODIFIED) -#define VTRUNCATE(vp) vn_flagset(vp, VTRUNCATED) -#define VUNTRUNCATE(vp) vn_flagclr(vp, VTRUNCATED) - /* * Dealing with bad inodes */ -static inline void vn_mark_bad(struct bhv_vnode *vp) +static inline int VN_BAD(struct inode *vp) { - make_bad_inode(vn_to_inode(vp)); -} - -static inline int VN_BAD(struct bhv_vnode *vp) -{ - return is_bad_inode(vn_to_inode(vp)); + return is_bad_inode(vp); } /* * Extracting atime values in various formats */ -static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime) +static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime) { - bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec; - bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec; + bs_atime->tv_sec = vp->i_atime.tv_sec; + bs_atime->tv_nsec = vp->i_atime.tv_nsec; } -static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts) +static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts) { - *ts = vp->v_inode.i_atime; + *ts = vp->i_atime; } -static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) +static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt) { - *tt = vp->v_inode.i_atime.tv_sec; + *tt = vp->i_atime.tv_sec; } /* * Some useful predicates. */ -#define VN_MAPPED(vp) mapping_mapped(vn_to_inode(vp)->i_mapping) -#define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages) -#define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \ +#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) +#define VN_CACHED(vp) (vp->i_mapping->nrpages) +#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ PAGECACHE_TAG_DIRTY) -#define VN_TRUNC(vp) ((vp)->v_flag & VTRUNCATED) -/* - * Flags to vop_setattr/getattr. - */ -#define ATTR_UTIME 0x01 /* non-default utime(2) request */ -#define ATTR_DMI 0x08 /* invocation from a DMI function */ -#define ATTR_LAZY 0x80 /* set/get attributes lazily */ -#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */ -#define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */ -#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ - -/* - * Flags to vop_fsync/reclaim. - */ -#define FSYNC_NOWAIT 0 /* asynchronous flush */ -#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ -#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ -#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ /* * Tracking vnode activity. */ -#if defined(XFS_VNODE_TRACE) - -#define VNODE_TRACE_SIZE 16 /* number of trace entries */ -#define VNODE_KTRACE_ENTRY 1 -#define VNODE_KTRACE_EXIT 2 -#define VNODE_KTRACE_HOLD 3 -#define VNODE_KTRACE_REF 4 -#define VNODE_KTRACE_RELE 5 - -extern void vn_trace_entry(struct bhv_vnode *, const char *, inst_t *); -extern void vn_trace_exit(struct bhv_vnode *, const char *, inst_t *); -extern void vn_trace_hold(struct bhv_vnode *, char *, int, inst_t *); -extern void vn_trace_ref(struct bhv_vnode *, char *, int, inst_t *); -extern void vn_trace_rele(struct bhv_vnode *, char *, int, inst_t *); +#if defined(XFS_INODE_TRACE) + +#define INODE_TRACE_SIZE 16 /* number of trace entries */ +#define INODE_KTRACE_ENTRY 1 +#define INODE_KTRACE_EXIT 2 +#define INODE_KTRACE_HOLD 3 +#define INODE_KTRACE_REF 4 +#define INODE_KTRACE_RELE 5 + +extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *); +extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *); +extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *); +extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); +extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); +#define xfs_itrace_entry(ip) \ + _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address) +#define xfs_itrace_exit(ip) \ + _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address) +#define xfs_itrace_exit_tag(ip, tag) \ + _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) +#define xfs_itrace_ref(ip) \ + _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address) -#define VN_TRACE(vp) \ - vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address) #else -#define vn_trace_entry(a,b,c) -#define vn_trace_exit(a,b,c) -#define vn_trace_hold(a,b,c,d) -#define vn_trace_ref(a,b,c,d) -#define vn_trace_rele(a,b,c,d) -#define VN_TRACE(vp) +#define xfs_itrace_entry(a) +#define xfs_itrace_exit(a) +#define xfs_itrace_exit_tag(a, b) +#define xfs_itrace_hold(a, b, c, d) +#define xfs_itrace_ref(a) +#define xfs_itrace_rele(a, b, c, d) #endif #endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c new file mode 100644 index 0000000..964621f --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -0,0 +1,330 @@ +/* + * Copyright (C) 2008 Christoph Hellwig. + * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_acl.h" +#include "xfs_vnodeops.h" + +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + + +/* + * ACL handling. Should eventually be moved into xfs_acl.c + */ + +static int +xfs_decode_acl(const char *name) +{ + if (strcmp(name, "posix_acl_access") == 0) + return _ACL_TYPE_ACCESS; + else if (strcmp(name, "posix_acl_default") == 0) + return _ACL_TYPE_DEFAULT; + return -EINVAL; +} + +/* + * Get system extended attributes which at the moment only + * includes Posix ACLs. + */ +static int +xfs_xattr_system_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + int acl; + + acl = xfs_decode_acl(name); + if (acl < 0) + return acl; + + return xfs_acl_vget(inode, buffer, size, acl); +} + +static int +xfs_xattr_system_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int acl; + + acl = xfs_decode_acl(name); + if (acl < 0) + return acl; + if (flags & XATTR_CREATE) + return -EINVAL; + + if (!value) + return xfs_acl_vremove(inode, acl); + + return xfs_acl_vset(inode, (void *)value, size, acl); +} + +static struct xattr_handler xfs_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .get = xfs_xattr_system_get, + .set = xfs_xattr_system_set, +}; + + +/* + * Real xattr handling. The only difference between the namespaces is + * a flag passed to the low-level attr code. + */ + +static int +__xfs_xattr_get(struct inode *inode, const char *name, + void *value, size_t size, int xflags) +{ + struct xfs_inode *ip = XFS_I(inode); + int error, asize = size; + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + value = NULL; + } + + error = -xfs_attr_get(ip, name, value, &asize, xflags); + if (error) + return error; + return asize; +} + +static int +__xfs_xattr_set(struct inode *inode, const char *name, const void *value, + size_t size, int flags, int xflags) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + + if (!value) + return -xfs_attr_remove(ip, name, xflags); + return -xfs_attr_set(ip, name, (void *)value, size, xflags); +} + +static int +xfs_xattr_user_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return __xfs_xattr_get(inode, name, value, size, 0); +} + +static int +xfs_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return __xfs_xattr_set(inode, name, value, size, flags, 0); +} + +static struct xattr_handler xfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = xfs_xattr_user_get, + .set = xfs_xattr_user_set, +}; + + +static int +xfs_xattr_trusted_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT); +} + +static int +xfs_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT); +} + +static struct xattr_handler xfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = xfs_xattr_trusted_get, + .set = xfs_xattr_trusted_set, +}; + + +static int +xfs_xattr_secure_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE); +} + +static int +xfs_xattr_secure_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE); +} + +static struct xattr_handler xfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = xfs_xattr_secure_get, + .set = xfs_xattr_secure_set, +}; + + +struct xattr_handler *xfs_xattr_handlers[] = { + &xfs_xattr_user_handler, + &xfs_xattr_trusted_handler, + &xfs_xattr_security_handler, + &xfs_xattr_system_handler, + NULL +}; + +static unsigned int xfs_xattr_prefix_len(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return sizeof("security"); + else if (flags & XFS_ATTR_ROOT) + return sizeof("trusted"); + else + return sizeof("user"); +} + +static const char *xfs_xattr_prefix(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return xfs_xattr_security_handler.prefix; + else if (flags & XFS_ATTR_ROOT) + return xfs_xattr_trusted_handler.prefix; + else + return xfs_xattr_user_handler.prefix; +} + +static int +xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, + char *name, int namelen, int valuelen, char *value) +{ + unsigned int prefix_len = xfs_xattr_prefix_len(flags); + char *offset; + int arraytop; + + ASSERT(context->count >= 0); + + /* + * Only show root namespace entries if we are actually allowed to + * see them. + */ + if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) + return 0; + + arraytop = context->count + prefix_len + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return 1; + } + offset = (char *)context->alist + context->count; + strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + offset += prefix_len; + strncpy(offset, name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += prefix_len + namelen + 1; + return 0; +} + +static int +xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, + char *name, int namelen, int valuelen, char *value) +{ + context->count += xfs_xattr_prefix_len(flags) + namelen + 1; + return 0; +} + +static int +list_one_attr(const char *name, const size_t len, void *data, + size_t size, ssize_t *result) +{ + char *p = data + *result; + + *result += len; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct xfs_attr_list_context context; + struct attrlist_cursor_kern cursor = { 0 }; + struct inode *inode = dentry->d_inode; + int error; + + /* + * First read the regular on-disk attributes. + */ + memset(&context, 0, sizeof(context)); + context.dp = XFS_I(inode); + context.cursor = &cursor; + context.resynch = 1; + context.alist = data; + context.bufsize = size; + context.firstu = context.bufsize; + + if (size) + context.put_listent = xfs_xattr_put_listent; + else + context.put_listent = xfs_xattr_put_listent_sizes; + + xfs_attr_list_int(&context); + if (context.count < 0) + return -ERANGE; + + /* + * Then add the two synthetic ACL attributes. + */ + if (xfs_acl_vhasacl_access(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS) + 1, + data, size, &context.count); + if (error) + return error; + } + + if (xfs_acl_vhasacl_default(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT) + 1, + data, size, &context.count); + if (error) + return error; + } + + return context.count; +} diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 3aa7715..f2705f2 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -103,11 +101,18 @@ xfs_qm_dqinit( if (brandnewdquot) { dqp->dq_flnext = dqp->dq_flprev = dqp; mutex_init(&dqp->q_qlock); - initnsema(&dqp->q_flock, 1, "fdq"); sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + #ifdef XFS_DQUOT_TRACE - dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP); + dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS); xfs_dqtrace_entry(dqp, "DQINIT"); #endif } else { @@ -152,7 +157,6 @@ xfs_qm_dqdestroy( ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); mutex_destroy(&dqp->q_qlock); - freesema(&dqp->q_flock); sv_destroy(&dqp->q_pinwait); #ifdef XFS_DQUOT_TRACE @@ -433,7 +437,7 @@ xfs_qm_dqalloc( * when it unlocks the inode. Since we want to keep the quota * inode around, we bump the vnode ref count now. */ - VN_HOLD(XFS_ITOV(quotip)); + IHOLD(quotip); xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; @@ -484,7 +488,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - if ((error = xfs_bmap_finish(tpp, &flist, firstblock, &committed))) { + if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { goto error1; } @@ -755,8 +759,7 @@ xfs_qm_idtodq( goto error0; } if (tp) { - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, - NULL))) + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) goto error1; } @@ -936,7 +939,7 @@ xfs_qm_dqget( type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); if (ip) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (type == XFS_DQ_USER) ASSERT(ip->i_udquot == NULL); else @@ -1091,7 +1094,7 @@ xfs_qm_dqget( xfs_qm_mplist_unlock(mp); XFS_DQ_HASH_UNLOCK(h); dqret: - ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); xfs_dqtrace_entry(dqp, "DQGET DONE"); *O_dqpp = dqp; return (0); @@ -1212,10 +1215,9 @@ xfs_qm_dqflush( xfs_buf_t *bp; xfs_disk_dquot_t *ddqp; int error; - SPLDECL(s); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); xfs_dqtrace_entry(dqp, "DQFLUSH"); /* @@ -1273,9 +1275,9 @@ xfs_qm_dqflush( mp = dqp->q_mount; /* lsn is 64 bits */ - AIL_LOCK(mp, s); + spin_lock(&mp->m_ail_lock); dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn; - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); /* * Attach an iodone routine so that we can remove this dquot from the @@ -1295,7 +1297,7 @@ xfs_qm_dqflush( if (flags & XFS_QMOPT_DELWRI) { xfs_bdwrite(mp, bp); } else if (flags & XFS_QMOPT_ASYNC) { - xfs_bawrite(mp, bp); + error = xfs_bawrite(mp, bp); } else { error = xfs_bwrite(mp, bp); } @@ -1321,7 +1323,6 @@ xfs_qm_dqflush_done( xfs_dq_logitem_t *qip) { xfs_dquot_t *dqp; - SPLDECL(s); dqp = qip->qli_dquot; @@ -1336,15 +1337,15 @@ xfs_qm_dqflush_done( if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && qip->qli_item.li_lsn == qip->qli_flush_lsn) { - AIL_LOCK(dqp->q_mount, s); + spin_lock(&dqp->q_mount->m_ail_lock); /* * xfs_trans_delete_ail() drops the AIL lock. */ if (qip->qli_item.li_lsn == qip->qli_flush_lsn) xfs_trans_delete_ail(dqp->q_mount, - (xfs_log_item_t*)qip, s); + (xfs_log_item_t*)qip); else - AIL_UNLOCK(dqp->q_mount, s); + spin_unlock(&dqp->q_mount->m_ail_lock); } /* @@ -1353,34 +1354,18 @@ xfs_qm_dqflush_done( xfs_dqfunlock(dqp); } - -int -xfs_qm_dqflock_nowait( - xfs_dquot_t *dqp) -{ - int locked; - - locked = cpsema(&((dqp)->q_flock)); - - /* XXX ifdef these out */ - if (locked) - (dqp)->dq_flags |= XFS_DQ_FLOCKED; - return (locked); -} - - int xfs_qm_dqlock_nowait( xfs_dquot_t *dqp) { - return (mutex_trylock(&((dqp)->q_qlock))); + return mutex_trylock(&dqp->q_qlock); } void xfs_dqlock( xfs_dquot_t *dqp) { - mutex_lock(&(dqp->q_qlock)); + mutex_lock(&dqp->q_qlock); } void @@ -1440,13 +1425,10 @@ xfs_dqlock2( /* ARGSUSED */ int xfs_qm_dqpurge( - xfs_dquot_t *dqp, - uint flags) + xfs_dquot_t *dqp) { xfs_dqhash_t *thishash; - xfs_mount_t *mp; - - mp = dqp->q_mount; + xfs_mount_t *mp = dqp->q_mount; ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); @@ -1476,7 +1458,7 @@ xfs_qm_dqpurge( * if we're turning off quotas. Basically, we need this flush * lock, and are willing to block on it. */ - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { /* * Block on the flush lock after nudging dquot buffer, * if it is incore. @@ -1490,6 +1472,7 @@ xfs_qm_dqpurge( * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { + int error; xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); /* dqflush unlocks dqflock */ /* @@ -1500,7 +1483,10 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dqpurge: dquot %p flush failed", dqp); xfs_dqflock(dqp); } ASSERT(dqp->q_pincount == 0); @@ -1585,12 +1571,18 @@ xfs_qm_dqflock_pushbuf_wait( XFS_INCORE_TRYLOCK); if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { + int error; if (XFS_BUF_ISPINNED(bp)) { xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); } - xfs_bawrite(dqp->q_mount, bp); + error = xfs_bawrite(dqp->q_mount, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dqflock_pushbuf_wait: " + "pushbuf error %d on dqp %p, bp %p", + error, dqp, bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 78d3ab9..8958d0f 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -82,7 +82,7 @@ typedef struct xfs_dquot { xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ mutex_t q_qlock; /* quota lock */ - sema_t q_flock; /* flush lock */ + struct completion q_flush; /* flush completion queue */ uint q_pincount; /* pin count for this dquot */ sv_t q_pinwait; /* sync var for pinning */ #ifdef XFS_DQUOT_TRACE @@ -113,22 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) /* - * The following three routines simply manage the q_flock - * semaphore embedded in the dquot. This semaphore synchronizes - * processes attempting to flush the in-core dquot back to disk. + * Manage the q_flush completion queue embedded in the dquot. This completion + * queue synchronizes processes attempting to flush the in-core dquot back to + * disk. */ -#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\ - (dqp)->dq_flags |= XFS_DQ_FLOCKED; } -#define xfs_dqfunlock(dqp) { ASSERT(issemalocked(&((dqp)->q_flock))); \ - vsema(&((dqp)->q_flock)); \ - (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); } - -#define XFS_DQ_PINLOCK(dqp) mutex_spinlock( \ - &(XFS_DQ_TO_QINF(dqp)->qi_pinlock)) -#define XFS_DQ_PINUNLOCK(dqp, s) mutex_spinunlock( \ - &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s) - -#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock))) +static inline void xfs_dqflock(xfs_dquot_t *dqp) +{ + wait_for_completion(&dqp->q_flush); +} + +static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) +{ + return try_wait_for_completion(&dqp->q_flush); +} + +static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +{ + complete(&dqp->q_flush); +} + #define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) @@ -169,10 +172,9 @@ extern void xfs_qm_dqprint(xfs_dquot_t *); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern int xfs_qm_dqpurge(xfs_dquot_t *, uint); +extern int xfs_qm_dqpurge(xfs_dquot_t *); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); -extern int xfs_qm_dqflock_nowait(xfs_dquot_t *); extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 5b2dcc5..f028644 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -96,14 +94,13 @@ STATIC void xfs_qm_dquot_logitem_pin( xfs_dq_logitem_t *logitem) { - unsigned long s; xfs_dquot_t *dqp; dqp = logitem->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - s = XFS_DQ_PINLOCK(dqp); + spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); dqp->q_pincount++; - XFS_DQ_PINUNLOCK(dqp, s); + spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); } /* @@ -117,17 +114,16 @@ xfs_qm_dquot_logitem_unpin( xfs_dq_logitem_t *logitem, int stale) { - unsigned long s; xfs_dquot_t *dqp; dqp = logitem->qli_dquot; ASSERT(dqp->q_pincount > 0); - s = XFS_DQ_PINLOCK(dqp); + spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); dqp->q_pincount--; if (dqp->q_pincount == 0) { sv_broadcast(&dqp->q_pinwait); } - XFS_DQ_PINUNLOCK(dqp, s); + spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); } /* ARGSUSED */ @@ -150,11 +146,12 @@ xfs_qm_dquot_logitem_push( xfs_dq_logitem_t *logitem) { xfs_dquot_t *dqp; + int error; dqp = logitem->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); /* * Since we were able to lock the dquot's flush lock and @@ -165,7 +162,11 @@ xfs_qm_dquot_logitem_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the dquot. */ - xfs_qm_dqflush(dqp, XFS_B_DELWRI); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + if (error) + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dquot_logitem_push: push error %d on dqp %p", + error, dqp); xfs_dqunlock(dqp); } @@ -191,8 +192,6 @@ void xfs_qm_dqunpin_wait( xfs_dquot_t *dqp) { - SPLDECL(s); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); if (dqp->q_pincount == 0) { return; @@ -202,9 +201,9 @@ xfs_qm_dqunpin_wait( * Give the log a push so we don't wait here too long. */ xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); - s = XFS_DQ_PINLOCK(dqp); + spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); if (dqp->q_pincount == 0) { - XFS_DQ_PINUNLOCK(dqp, s); + spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); return; } sv_wait(&(dqp->q_pinwait), PINOD, @@ -218,8 +217,8 @@ xfs_qm_dqunpin_wait( * If so, we want to push it out to help us take this item off the AIL as soon * as possible. * - * We must not be holding the AIL_LOCK at this point. Calling incore() to - * search the buffer cache can be a time consuming thing, and AIL_LOCK is a + * We must not be holding the AIL lock at this point. Calling incore() to + * search the buffer cache can be a time consuming thing, and AIL lock is a * spinlock. */ STATIC void @@ -246,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf( * inode flush completed and the inode was taken off the AIL. * So, just get out. */ - if (!issemalocked(&(dqp->q_flock)) || + if (completion_done(&dqp->q_flush) || ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { qip->qli_pushbuf_flag = 0; xfs_dqunlock(dqp); @@ -259,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf( if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - issemalocked(&(dqp->q_flock))); + !completion_done(&dqp->q_flush)); qip->qli_pushbuf_flag = 0; xfs_dqunlock(dqp); @@ -268,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf( XFS_LOG_FORCE); } if (dopush) { + int error; #ifdef XFSRACEDEBUG delay_for_intr(); delay(300); #endif - xfs_bawrite(mp, bp); + error = xfs_bawrite(mp, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p", + error, qip, bp); } else { xfs_buf_relse(bp); } @@ -313,7 +317,7 @@ xfs_qm_dquot_logitem_trylock( return (XFS_ITEM_LOCKED); retval = XFS_ITEM_SUCCESS; - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { /* * The dquot is already being flushed. It may have been * flushed delayed write, however, and we don't want to @@ -324,7 +328,7 @@ xfs_qm_dquot_logitem_trylock( * want to do that now since we might sleep in the device * strategy routine. We also don't want to grab the buffer lock * here because we'd like not to call into the buffer cache - * while holding the AIL_LOCK. + * while holding the AIL lock. * Make sure to only return PUSHBUF if we set pushbuf_flag * ourselves. If someone else is doing it then we don't * want to go to the push routine and duplicate their efforts. @@ -382,18 +386,6 @@ xfs_qm_dquot_logitem_unlock( /* - * The transaction with the dquot locked has aborted. The dquot - * must not be dirty within the transaction. We simply unlock just - * as if the transaction had been cancelled. - */ -STATIC void -xfs_qm_dquot_logitem_abort( - xfs_dq_logitem_t *ql) -{ - xfs_qm_dquot_logitem_unlock(ql); -} - -/* * this needs to stamp an lsn into the dquot, I think. * rpc's that look at user dquot's would then have to * push on the dependency recorded in the dquot @@ -411,7 +403,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -STATIC struct xfs_item_ops xfs_dquot_item_ops = { +static struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_dquot_logitem_format, @@ -426,7 +418,6 @@ STATIC struct xfs_item_ops xfs_dquot_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_dquot_logitem_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort, .iop_pushbuf = (void(*)(xfs_log_item_t*)) xfs_qm_dquot_logitem_pushbuf, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) @@ -559,17 +550,6 @@ xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) } /* - * The transaction of which this QUOTAOFF is a part has been aborted. - * Just clean up after ourselves. - * Shouldn't this never happen in the case of qoffend logitems? XXX - */ -STATIC void -xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf) -{ - kmem_free(qf, sizeof(xfs_qoff_logitem_t)); -} - -/* * There isn't much you can do to push on an quotaoff item. It is simply * stuck waiting for the log to be flushed to disk. */ @@ -588,17 +568,16 @@ xfs_qm_qoffend_logitem_committed( xfs_lsn_t lsn) { xfs_qoff_logitem_t *qfs; - SPLDECL(s); qfs = qfe->qql_start_lip; - AIL_LOCK(qfs->qql_item.li_mountp,s); + spin_lock(&qfs->qql_item.li_mountp->m_ail_lock); /* * Delete the qoff-start logitem from the AIL. * xfs_trans_delete_ail() drops the AIL lock. */ - xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s); - kmem_free(qfs, sizeof(xfs_qoff_logitem_t)); - kmem_free(qfe, sizeof(xfs_qoff_logitem_t)); + xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); + kmem_free(qfs); + kmem_free(qfe); return (xfs_lsn_t)-1; } @@ -630,7 +609,7 @@ xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) return; } -STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, @@ -644,7 +623,6 @@ STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoffend_logitem_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoffend_logitem_committing @@ -653,7 +631,7 @@ STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. */ -STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, @@ -667,7 +645,6 @@ STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoff_logitem_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoff_logitem_committing diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index e23e455..df0ffef 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -44,8 +44,6 @@ #include "xfs_bmap.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -64,10 +62,8 @@ uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; -STATIC kmem_shaker_t xfs_qm_shaker; -STATIC cred_t xfs_zerocr; -STATIC xfs_inode_t xfs_zeroino; +static cred_t xfs_zerocr; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); @@ -81,6 +77,11 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(int, gfp_t); +static struct shrinker xfs_qm_shaker = { + .shrink = xfs_qm_shake, + .seeks = DEFAULT_SEEKS, +}; + #ifdef DEBUG extern mutex_t qcheck_lock; #endif @@ -112,17 +113,18 @@ xfs_Gqm_init(void) { xfs_dqhash_t *udqhash, *gdqhash; xfs_qm_t *xqm; - uint i, hsize, flags = KM_SLEEP | KM_MAYFAIL; + size_t hsize; + uint i; /* * Initialize the dquot hash tables. */ - hsize = XFS_QM_HASHSIZE_HIGH; - while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) { - if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW) - flags = KM_SLEEP; - } - gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP); + udqhash = kmem_zalloc_greedy(&hsize, + XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), + XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t), + KM_SLEEP | KM_MAYFAIL | KM_LARGE); + gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); + hsize /= sizeof(xfs_dqhash_t); ndquot = hsize << 8; xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); @@ -152,7 +154,7 @@ xfs_Gqm_init(void) } else xqm->qm_dqzone = qm_dqzone; - xfs_qm_shaker = kmem_shake_register(xfs_qm_shake); + register_shrinker(&xfs_qm_shaker); /* * The t_dqinfo portion of transactions. @@ -184,14 +186,14 @@ xfs_qm_destroy( ASSERT(xqm != NULL); ASSERT(xqm->qm_nrefs == 0); - kmem_shake_deregister(xfs_qm_shaker); + unregister_shrinker(&xfs_qm_shaker); hsize = xqm->qm_dqhashmask + 1; for (i = 0; i < hsize; i++) { xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); } - kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t)); - kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t)); + kmem_free(xqm->qm_usr_dqhtable); + kmem_free(xqm->qm_grp_dqhtable); xqm->qm_usr_dqhtable = NULL; xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; @@ -199,7 +201,7 @@ xfs_qm_destroy( #ifdef DEBUG mutex_destroy(&qcheck_lock); #endif - kmem_free(xqm, sizeof(xfs_qm_t)); + kmem_free(xqm); } /* @@ -286,45 +288,6 @@ xfs_qm_rele_quotafs_ref( } /* - * This is called at mount time from xfs_mountfs to initialize the quotainfo - * structure and start the global quota manager (xfs_Gqm) if it hasn't done - * so already. Note that the superblock has not been read in yet. - */ -void -xfs_qm_mount_quotainit( - xfs_mount_t *mp, - uint flags) -{ - /* - * User, projects or group quotas has to be on. - */ - ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)); - - /* - * Initialize the flags in the mount structure. From this point - * onwards we look at m_qflags to figure out if quotas's ON/OFF, etc. - * Note that we enforce nothing if accounting is off. - * ie. XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF. - * It isn't necessary to take the quotaoff lock to do this; this is - * called from mount. - */ - if (flags & XFSMNT_UQUOTA) { - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - if (flags & XFSMNT_UQUOTAENF) - mp->m_qflags |= XFS_UQUOTA_ENFD; - } - if (flags & XFSMNT_GQUOTA) { - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - if (flags & XFSMNT_GQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } else if (flags & XFSMNT_PQUOTA) { - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - if (flags & XFSMNT_PQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } -} - -/* * Just destroy the quotainfo structure. */ void @@ -341,17 +304,17 @@ xfs_qm_unmount_quotadestroy( * necessary data structures like quotainfo. This is also responsible for * running a quotacheck as necessary. We are guaranteed that the superblock * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. */ -int +void xfs_qm_mount_quotas( - xfs_mount_t *mp, - int mfsi_flags) + xfs_mount_t *mp) { - unsigned long s; int error = 0; uint sbf; - /* * If quotas on realtime volumes is not supported, we disable * quotas immediately. @@ -370,7 +333,8 @@ xfs_qm_mount_quotas( * Allocate the quotainfo structure inside the mount struct, and * create quotainode(s), and change/rev superblock if necessary. */ - if ((error = xfs_qm_init_quotainfo(mp))) { + error = xfs_qm_init_quotainfo(mp); + if (error) { /* * We must turn off quotas. */ @@ -381,25 +345,32 @@ xfs_qm_mount_quotas( /* * If any of the quotas are not consistent, do a quotacheck. */ - if (XFS_QM_NEED_QUOTACHECK(mp) && - !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { - if ((error = xfs_qm_quotacheck(mp))) { - /* Quotacheck has failed and quotas have - * been disabled. - */ - return XFS_ERROR(error); + if (XFS_QM_NEED_QUOTACHECK(mp)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; } } + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + if (!XFS_IS_UQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_UQUOTA_CHKD; + if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) + mp->m_qflags &= ~XFS_OQUOTA_CHKD; write_changes: /* - * We actually don't have to acquire the SB_LOCK at all. + * We actually don't have to acquire the m_sb_lock at all. * This can only be called from mount, and that's single threaded. XXX */ - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); sbf = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { @@ -419,7 +390,7 @@ xfs_qm_mount_quotas( xfs_fs_cmn_err(CE_WARN, mp, "Failed to initialize disk quotas."); } - return XFS_ERROR(error); + return; } /* @@ -472,11 +443,11 @@ xfs_qm_unmount_quotas( } } if (uqp) { - XFS_PURGE_INODE(uqp); + IRELE(uqp); mp->m_quotainfo->qi_uquotaip = NULL; } if (gqp) { - XFS_PURGE_INODE(gqp); + IRELE(gqp); mp->m_quotainfo->qi_gquotaip = NULL; } out: @@ -511,7 +482,7 @@ again: xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); /* XXX a sentinel would be better */ recl = XFS_QI_MPLRECLAIMS(mp); - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { /* * If we can't grab the flush lock then check * to see if the dquot has been flushed delayed @@ -658,7 +629,7 @@ xfs_qm_dqpurge_int( * freelist in INACTIVE state. */ nextdqp = dqp->MPL_NEXT; - nmisses += xfs_qm_dqpurge(dqp, flags); + nmisses += xfs_qm_dqpurge(dqp); dqp = nextdqp; } xfs_qm_mplist_unlock(mp); @@ -697,7 +668,7 @@ xfs_qm_dqattach_one( xfs_dquot_t *dqp; int error; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; /* * See if we already have it in the inode itself. IO_idqpp is @@ -901,7 +872,7 @@ xfs_qm_dqattach( return 0; ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || - XFS_ISLOCKED_INODE_EXCL(ip)); + xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (! (flags & XFS_QMOPT_ILOCKED)) xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -915,7 +886,8 @@ xfs_qm_dqattach( goto done; nquotas++; } - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_OQUOTA_ON(mp)) { error = XFS_IS_GQUOTA_ON(mp) ? xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, @@ -940,7 +912,7 @@ xfs_qm_dqattach( * This WON'T, in general, result in a thrash. */ if (nquotas == 2) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_udquot); ASSERT(ip->i_gdquot); @@ -983,7 +955,7 @@ xfs_qm_dqattach( #ifdef QUOTADEBUG else - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); #endif return error; } @@ -1026,7 +998,7 @@ xfs_qm_dqdetach( int xfs_qm_sync( xfs_mount_t *mp, - short flags) + int flags) { int recl, restarts; xfs_dquot_t *dqp; @@ -1034,6 +1006,9 @@ xfs_qm_sync( boolean_t nowait; int error; + if (! XFS_IS_QUOTA_ON(mp)) + return 0; + restarts = 0; /* * We won't block unless we are asked to. @@ -1085,7 +1060,7 @@ xfs_qm_sync( /* XXX a sentinel would be better */ recl = XFS_QI_MPLRECLAIMS(mp); - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { if (nowait) { xfs_dqunlock(dqp); continue; @@ -1157,12 +1132,12 @@ xfs_qm_init_quotainfo( * and change the superblock accordingly. */ if ((error = xfs_qm_init_quotainos(mp))) { - kmem_free(qinf, sizeof(xfs_quotainfo_t)); + kmem_free(qinf); mp->m_quotainfo = NULL; return error; } - spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin"); + spin_lock_init(&qinf->qi_pinlock); xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); qinf->qi_dqreclaims = 0; @@ -1263,15 +1238,15 @@ xfs_qm_destroy_quotainfo( xfs_qm_list_destroy(&qi->qi_dqlist); if (qi->qi_uquotaip) { - XFS_PURGE_INODE(qi->qi_uquotaip); + IRELE(qi->qi_uquotaip); qi->qi_uquotaip = NULL; /* paranoia */ } if (qi->qi_gquotaip) { - XFS_PURGE_INODE(qi->qi_gquotaip); + IRELE(qi->qi_gquotaip); qi->qi_gquotaip = NULL; } mutex_destroy(&qi->qi_quotaofflock); - kmem_free(qi, sizeof(xfs_quotainfo_t)); + kmem_free(qi); mp->m_quotainfo = NULL; } @@ -1315,7 +1290,7 @@ xfs_qm_dqget_noattach( xfs_mount_t *mp; xfs_dquot_t *udqp, *gdqp; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); mp = ip->i_mount; udqp = NULL; gdqp = NULL; @@ -1393,7 +1368,6 @@ xfs_qm_qino_alloc( { xfs_trans_t *tp; int error; - unsigned long s; int committed; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); @@ -1406,7 +1380,7 @@ xfs_qm_qino_alloc( return error; } - if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0, + if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, &xfs_zerocr, 0, 1, ip, &committed))) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); @@ -1417,26 +1391,26 @@ xfs_qm_qino_alloc( * Keep an extra reference to this quota inode. This inode is * locked exclusively and joined to the transaction already. */ - ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip)); - VN_HOLD(XFS_ITOV((*ip))); + ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL)); + IHOLD(*ip); /* * Make the changes in the superblock, and log those too. * sbfields arg may contain fields other than *QUOTINO; * VERSIONNUM for example. */ - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) unsigned oldv = mp->m_sb.sb_versionnum; #endif - ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb)); + ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); - XFS_SB_VERSION_ADDQUOTA(&mp->m_sb); + xfs_sb_version_addquota(&mp->m_sb); mp->m_sb.sb_uquotino = NULLFSINO; mp->m_sb.sb_gquotino = NULLFSINO; @@ -1452,11 +1426,10 @@ xfs_qm_qino_alloc( mp->m_sb.sb_uquotino = (*ip)->i_ino; else mp->m_sb.sb_gquotino = (*ip)->i_ino; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, sbfields); - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, - NULL))) { + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); return error; } @@ -1464,7 +1437,7 @@ xfs_qm_qino_alloc( } -STATIC int +STATIC void xfs_qm_reset_dqcounts( xfs_mount_t *mp, xfs_buf_t *bp, @@ -1504,8 +1477,6 @@ xfs_qm_reset_dqcounts( ddq->d_rtbwarns = 0; ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); } - - return 0; } STATIC int @@ -1546,7 +1517,7 @@ xfs_qm_dqiter_bufs( if (error) break; - (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_bdwrite(mp, bp); /* * goto the next block. @@ -1650,7 +1621,7 @@ xfs_qm_dqiterate( break; } while (nmaps > 0); - kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map)); + kmem_free(map); return error; } @@ -1674,14 +1645,14 @@ xfs_qm_quotacheck_dqadjust( * Adjust the inode count and the block count to reflect this inode's * resource usage. */ - be64_add(&dqp->q_core.d_icount, 1); + be64_add_cpu(&dqp->q_core.d_icount, 1); dqp->q_res_icount++; if (nblks) { - be64_add(&dqp->q_core.d_bcount, nblks); + be64_add_cpu(&dqp->q_core.d_bcount, nblks); dqp->q_res_bcount += nblks; } if (rtblks) { - be64_add(&dqp->q_core.d_rtbcount, rtblks); + be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); dqp->q_res_rtbcount += rtblks; } @@ -1705,7 +1676,6 @@ xfs_qm_get_rtblks( xfs_extnum_t idx; /* extent record index */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_extnum_t nextents; /* number of extent entries */ - xfs_bmbt_rec_t *ep; /* pointer to an extent entry */ int error; ASSERT(XFS_IS_REALTIME_INODE(ip)); @@ -1716,10 +1686,8 @@ xfs_qm_get_rtblks( } rtblks = 0; nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (idx = 0; idx < nextents; idx++) { - ep = xfs_iext_get_ext(ifp, idx); - rtblks += xfs_bmbt_get_blockcount(ep); - } + for (idx = 0; idx < nextents; idx++) + rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); *O_rtblks = (xfs_qcnt_t)rtblks; return 0; } @@ -1768,12 +1736,6 @@ xfs_qm_dqusage_adjust( return error; } - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, XFS_ILOCK_EXCL); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - /* * Obtain the locked dquots. In case of an error (eg. allocation * fails for ENOSPC), we return the negative of the error number @@ -1839,7 +1801,7 @@ xfs_qm_dqusage_adjust( * Now release the inode. This will send it to 'inactive', and * possibly even free blocks. */ - VN_RELE(XFS_ITOV(ip)); + IRELE(ip); /* * Goto next inode. @@ -1909,6 +1871,14 @@ xfs_qm_quotacheck( } while (! done); /* + * We've made all the changes that we need to make incore. + * Flush them down to disk buffers if everything was updated + * successfully. + */ + if (!error) + error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); + + /* * We can get this error if we couldn't do a dquot allocation inside * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the * dirty dquots that might be cached, we just want to get rid of them @@ -1919,11 +1889,6 @@ xfs_qm_quotacheck( xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); goto error_return; } - /* - * We've made all the changes that we need to make incore. - * Now flush_them down to disk buffers. - */ - xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); /* * We didn't log anything, because if we crashed, we'll have to @@ -1955,7 +1920,10 @@ xfs_qm_quotacheck( ASSERT(mp->m_quotainfo != NULL); ASSERT(xfs_Gqm != NULL); xfs_qm_destroy_quotainfo(mp); - (void)xfs_mount_reset_sbqflags(mp); + if (xfs_mount_reset_sbqflags(mp)) { + cmn_err(CE_WARN, "XFS quotacheck %s: " + "Failed to reset quota flags.", mp->m_fsname); + } } else { cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); } @@ -1983,7 +1951,7 @@ xfs_qm_init_quotainos( /* * Get the uquota and gquota inodes */ - if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) { + if (xfs_sb_version_hasquota(&mp->m_sb)) { if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); @@ -1997,7 +1965,7 @@ xfs_qm_init_quotainos( if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip, 0))) { if (uip) - VN_RELE(XFS_ITOV(uip)); + IRELE(uip); return XFS_ERROR(error); } } @@ -2028,7 +1996,7 @@ xfs_qm_init_quotainos( sbflags | XFS_SB_GQUOTINO, flags); if (error) { if (uip) - VN_RELE(XFS_ITOV(uip)); + IRELE(uip); return XFS_ERROR(error); } @@ -2109,7 +2077,7 @@ xfs_qm_shake_freelist( * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); dqp = dqp->dq_flnext; continue; @@ -2122,12 +2090,17 @@ xfs_qm_shake_freelist( * dirty dquots. */ if (XFS_DQ_IS_DIRTY(dqp)) { + int error; xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); /* * We flush it delayed write, so don't bother * releasing the mplock. */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + if (error) { + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dqflush_all: dquot %p flush failed", dqp); + } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ dqp = dqp->dq_flnext; continue; @@ -2282,7 +2255,7 @@ xfs_qm_dqreclaim_one(void) * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ - if (! xfs_qm_dqflock_nowait(dqp)) { + if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); continue; } @@ -2294,12 +2267,17 @@ xfs_qm_dqreclaim_one(void) * dirty dquots. */ if (XFS_DQ_IS_DIRTY(dqp)) { + int error; xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); /* * We flush it delayed write, so don't bother * releasing the freelist lock. */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + if (error) { + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dqreclaim: dquot %p flush failed", dqp); + } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ continue; } @@ -2407,9 +2385,9 @@ xfs_qm_write_sb_changes( } xfs_mod_sb(tp, flags); - (void) xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); - return 0; + return error; } @@ -2447,8 +2425,7 @@ xfs_qm_vop_dqalloc( lockflags = XFS_ILOCK_EXCL; xfs_ilock(ip, lockflags); - if ((flags & XFS_QMOPT_INHERIT) && - XFS_INHERIT_GID(ip, XFS_MTOVFS(mp))) + if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) gid = ip->i_d.di_gid; /* @@ -2579,7 +2556,7 @@ xfs_qm_vop_chown( uint bfield = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); /* old dquot */ @@ -2623,7 +2600,7 @@ xfs_qm_vop_chown_reserve( uint delblks, blkflags, prjflags = 0; xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; - ASSERT(XFS_ISLOCKED_INODE(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); mp = ip->i_mount; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -2733,7 +2710,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( if (!XFS_IS_QUOTA_ON(tp->t_mountp)) return; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); if (udqp) { diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 4568deb..44f2534 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -52,14 +52,8 @@ extern kmem_zone_t *qm_dqtrxzone; /* * Dquot hashtable constants/threshold values. */ -#define XFS_QM_HASHSIZE_LOW (NBPP / sizeof(xfs_dqhash_t)) -#define XFS_QM_HASHSIZE_HIGH ((NBPP * 4) / sizeof(xfs_dqhash_t)) - -/* - * We output a cmn_err when quotachecking a quota file with more than - * this many fsbs. - */ -#define XFS_QM_BIG_QCHECK_NBLKS 500 +#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t)) +#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t)) /* * This defines the unit of allocation of dquots. @@ -112,7 +106,7 @@ typedef struct xfs_qm { typedef struct xfs_quotainfo { xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - lock_t qi_pinlock; /* dquot pinning mutex */ + spinlock_t qi_pinlock; /* dquot pinning lock */ xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ int qi_dqreclaims; /* a change here indicates a removal in the dqlist */ @@ -171,13 +165,12 @@ typedef struct xfs_dquot_acct { #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); -extern int xfs_qm_mount_quotas(xfs_mount_t *, int); -extern void xfs_qm_mount_quotainit(xfs_mount_t *, uint); +extern void xfs_qm_mount_quotas(xfs_mount_t *); extern int xfs_qm_quotacheck(xfs_mount_t *); extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); extern int xfs_qm_unmount_quotas(xfs_mount_t *); extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); -extern int xfs_qm_sync(xfs_mount_t *, short); +extern int xfs_qm_sync(xfs_mount_t *, int); /* dquot stuff */ extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); @@ -205,7 +198,8 @@ extern void xfs_qm_freelist_unlink(xfs_dquot_t *); extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *); /* system call interface */ -extern int xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t); +extern int xfs_qm_quotactl(struct xfs_mount *, int, int, + xfs_caddr_t); #ifdef DEBUG extern int xfs_qm_internalqcheck(xfs_mount_t *); diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index db8872b..eea2e60 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -44,178 +44,17 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" -#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ -#define MNTOPT_NOQUOTA "noquota" /* no quotas */ -#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ -#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ -#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ -#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ -#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ -#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ -#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ -#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ -#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ -#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -STATIC int -xfs_qm_parseargs( - struct bhv_desc *bhv, - char *options, - struct xfs_mount_args *args, - int update) -{ - size_t length; - char *local_options = options; - char *this_char; - int error; - int referenced = update; - - while ((this_char = strsep(&local_options, ",")) != NULL) { - length = strlen(this_char); - if (local_options) - length++; - - if (!strcmp(this_char, MNTOPT_NOQUOTA)) { - args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); - args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); - referenced = update; - } else if (!strcmp(this_char, MNTOPT_QUOTA) || - !strcmp(this_char, MNTOPT_UQUOTA) || - !strcmp(this_char, MNTOPT_USRQUOTA)) { - args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || - !strcmp(this_char, MNTOPT_UQUOTANOENF)) { - args->flags |= XFSMNT_UQUOTA; - args->flags &= ~XFSMNT_UQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_PQUOTA) || - !strcmp(this_char, MNTOPT_PRJQUOTA)) { - args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { - args->flags |= XFSMNT_PQUOTA; - args->flags &= ~XFSMNT_PQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_GQUOTA) || - !strcmp(this_char, MNTOPT_GRPQUOTA)) { - args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { - args->flags |= XFSMNT_GQUOTA; - args->flags &= ~XFSMNT_GQUOTAENF; - referenced = 1; - } else { - if (local_options) - *(local_options-1) = ','; - continue; - } - - while (length--) - *this_char++ = ','; - } - - if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { - cmn_err(CE_WARN, - "XFS: cannot mount with both project and group quota"); - return XFS_ERROR(EINVAL); - } - - error = bhv_next_vfs_parseargs(BHV_NEXT(bhv), options, args, update); - if (!error && !referenced) - bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM); - return error; -} - -STATIC int -xfs_qm_showargs( - struct bhv_desc *bhv, - struct seq_file *m) -{ - struct bhv_vfs *vfsp = bhvtovfs(bhv); - struct xfs_mount *mp = XFS_VFSTOM(vfsp); - - if (mp->m_qflags & XFS_UQUOTA_ACCT) { - (mp->m_qflags & XFS_UQUOTA_ENFD) ? - seq_puts(m, "," MNTOPT_USRQUOTA) : - seq_puts(m, "," MNTOPT_UQUOTANOENF); - } - - if (mp->m_qflags & XFS_PQUOTA_ACCT) { - (mp->m_qflags & XFS_OQUOTA_ENFD) ? - seq_puts(m, "," MNTOPT_PRJQUOTA) : - seq_puts(m, "," MNTOPT_PQUOTANOENF); - } - - if (mp->m_qflags & XFS_GQUOTA_ACCT) { - (mp->m_qflags & XFS_OQUOTA_ENFD) ? - seq_puts(m, "," MNTOPT_GRPQUOTA) : - seq_puts(m, "," MNTOPT_GQUOTANOENF); - } - - if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) - seq_puts(m, "," MNTOPT_NOQUOTA); - - return bhv_next_vfs_showargs(BHV_NEXT(bhv), m); -} - -STATIC int -xfs_qm_mount( - struct bhv_desc *bhv, - struct xfs_mount_args *args, - struct cred *cr) -{ - struct bhv_vfs *vfsp = bhvtovfs(bhv); - struct xfs_mount *mp = XFS_VFSTOM(vfsp); - - if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA)) - xfs_qm_mount_quotainit(mp, args->flags); - return bhv_next_vfs_mount(BHV_NEXT(bhv), args, cr); -} - -/* - * Directory tree accounting is implemented using project quotas, where - * the project identifier is inherited from parent directories. - * A statvfs (df, etc.) of a directory that is using project quota should - * return a statvfs of the project, not the entire filesystem. - * This makes such trees appear as if they are filesystems in themselves. - */ -STATIC int -xfs_qm_statvfs( - struct bhv_desc *bhv, +STATIC void +xfs_fill_statvfs_from_dquot( bhv_statvfs_t *statp, - struct bhv_vnode *vnode) + xfs_disk_dquot_t *dp) { - xfs_mount_t *mp; - xfs_inode_t *ip; - xfs_dquot_t *dqp; - xfs_disk_dquot_t *dp; __uint64_t limit; - int error; - - error = bhv_next_vfs_statvfs(BHV_NEXT(bhv), statp, vnode); - if (error || !vnode) - return error; - - mp = xfs_vfstom(bhvtovfs(bhv)); - ip = xfs_vtoi(vnode); - - if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) - return 0; - if (!(mp->m_qflags & XFS_PQUOTA_ACCT)) - return 0; - if (!(mp->m_qflags & XFS_OQUOTA_ENFD)) - return 0; - - if (xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) - return 0; - dp = &dqp->q_core; limit = dp->d_blk_softlimit ? be64_to_cpu(dp->d_blk_softlimit) : @@ -236,37 +75,35 @@ xfs_qm_statvfs( (statp->f_files > be64_to_cpu(dp->d_icount)) ? (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; } - - xfs_qm_dqput(dqp); - return 0; } -STATIC int -xfs_qm_syncall( - struct bhv_desc *bhv, - int flags, - cred_t *credp) + +/* + * Directory tree accounting is implemented using project quotas, where + * the project identifier is inherited from parent directories. + * A statvfs (df, etc.) of a directory that is using project quota should + * return a statvfs of the project, not the entire filesystem. + * This makes such trees appear as if they are filesystems in themselves. + */ +STATIC void +xfs_qm_statvfs( + xfs_inode_t *ip, + bhv_statvfs_t *statp) { - struct bhv_vfs *vfsp = bhvtovfs(bhv); - struct xfs_mount *mp = XFS_VFSTOM(vfsp); - int error; + xfs_mount_t *mp = ip->i_mount; + xfs_dquot_t *dqp; - /* - * Get the Quota Manager to flush the dquots. - */ - if (XFS_IS_QUOTA_ON(mp)) { - if ((error = xfs_qm_sync(mp, flags))) { - /* - * If we got an IO error, we will be shutting down. - * So, there's nothing more for us to do here. - */ - ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp)); - if (XFS_FORCED_SHUTDOWN(mp)) { - return XFS_ERROR(error); - } - } + if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || + !((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) + return; + + if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { + xfs_disk_dquot_t *dp = &dqp->q_core; + + xfs_fill_statvfs_from_dquot(statp, dp); + xfs_qm_dqput(dqp); } - return bhv_next_vfs_sync(BHV_NEXT(bhv), flags, credp); } STATIC int @@ -281,7 +118,7 @@ xfs_qm_newmount( *quotaflags = 0; *needquotamount = B_FALSE; - quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && + quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); if (quotaondisk) { @@ -325,7 +162,7 @@ xfs_qm_newmount( * mounting, and get on with the boring life * without disk quotas. */ - xfs_qm_mount_quotas(mp, 0); + xfs_qm_mount_quotas(mp); } else { /* * Clear the quota flags, but remember them. This @@ -347,13 +184,12 @@ STATIC int xfs_qm_endmount( xfs_mount_t *mp, uint needquotamount, - uint quotaflags, - int mfsi_flags) + uint quotaflags) { if (needquotamount) { ASSERT(mp->m_qflags == 0); mp->m_qflags = quotaflags; - xfs_qm_mount_quotas(mp, mfsi_flags); + xfs_qm_mount_quotas(mp); } #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) @@ -384,7 +220,7 @@ xfs_qm_dqrele_null( } -STATIC struct xfs_qmops xfs_qmcore_xfs = { +struct xfs_qmops xfs_qmcore_xfs = { .xfs_qminit = xfs_qm_newmount, .xfs_qmdone = xfs_qm_unmount_quotadestroy, .xfs_qmmount = xfs_qm_endmount, @@ -398,36 +234,24 @@ STATIC struct xfs_qmops xfs_qmcore_xfs = { .xfs_dqvoprename = xfs_qm_vop_rename_dqattach, .xfs_dqvopchown = xfs_qm_vop_chown, .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve, + .xfs_dqstatvfs = xfs_qm_statvfs, + .xfs_dqsync = xfs_qm_sync, + .xfs_quotactl = xfs_qm_quotactl, .xfs_dqtrxops = &xfs_trans_dquot_ops, }; - -struct bhv_module_vfsops xfs_qmops = { { - BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM), - .vfs_parseargs = xfs_qm_parseargs, - .vfs_showargs = xfs_qm_showargs, - .vfs_mount = xfs_qm_mount, - .vfs_statvfs = xfs_qm_statvfs, - .vfs_sync = xfs_qm_syncall, - .vfs_quotactl = xfs_qm_quotactl, }, -}; - +EXPORT_SYMBOL(xfs_qmcore_xfs); void __init xfs_qm_init(void) { - static char message[] __initdata = - KERN_INFO "SGI XFS Quota Management subsystem\n"; - - printk(message); + printk(KERN_INFO "SGI XFS Quota Management subsystem\n"); mutex_init(&xfs_Gqm_lock); - vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs); xfs_qm_init_procfs(); } void __exit xfs_qm_exit(void) { - vfs_bhv_clr_custom(&xfs_qmops); xfs_qm_cleanup_procfs(); if (qm_dqzone) kmem_zone_destroy(qm_dqzone); diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 6f858fb..709f5f5 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -43,8 +43,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h index a50ffab..5b964fc 100644 --- a/fs/xfs/quota/xfs_qm_stats.h +++ b/fs/xfs/quota/xfs_qm_stats.h @@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void); # define XQM_STATS_INC(count) do { } while (0) -static __inline void xfs_qm_init_procfs(void) { }; -static __inline void xfs_qm_cleanup_procfs(void) { }; +static inline void xfs_qm_init_procfs(void) { }; +static inline void xfs_qm_cleanup_procfs(void) { }; #endif diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index ed620c4..1a3b803 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -46,8 +46,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -83,18 +81,13 @@ STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, */ int xfs_qm_quotactl( - struct bhv_desc *bdp, + xfs_mount_t *mp, int cmd, int id, xfs_caddr_t addr) { - xfs_mount_t *mp; - bhv_vfs_t *vfsp; int error; - vfsp = bhvtovfs(bdp); - mp = XFS_VFSTOM(vfsp); - ASSERT(addr != NULL || cmd == Q_XQUOTASYNC); /* @@ -107,7 +100,7 @@ xfs_qm_quotactl( */ if (XFS_IS_QUOTA_ON(mp)) return XFS_ERROR(EINVAL); - if (vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); return (xfs_qm_scall_trunc_qfiles(mp, xfs_qm_import_qtype_flags(*(uint *)addr))); @@ -123,18 +116,18 @@ xfs_qm_quotactl( * QUOTAON - enabling quota enforcement. * Quota accounting must be turned on at mount time. */ - if (vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); return (xfs_qm_scall_quotaon(mp, xfs_qm_import_flags(*(uint *)addr))); case Q_XQUOTAOFF: - if (vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); break; case Q_XQUOTASYNC: - return (xfs_sync_inodes(mp, SYNC_DELWRI, 0, NULL)); + return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL)); default: break; @@ -145,7 +138,7 @@ xfs_qm_quotactl( switch (cmd) { case Q_XQUOTAOFF: - if (vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); error = xfs_qm_scall_quotaoff(mp, xfs_qm_import_flags(*(uint *)addr), @@ -166,19 +159,19 @@ xfs_qm_quotactl( break; case Q_XSETQLIM: - if (vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER, (fs_disk_quota_t *)addr); break; case Q_XSETGQLIM: - if (vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, (fs_disk_quota_t *)addr); break; case Q_XSETPQLIM: - if (vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, (fs_disk_quota_t *)addr); @@ -207,7 +200,6 @@ xfs_qm_scall_quotaoff( boolean_t force) { uint dqtype; - unsigned long s; int error; uint inactivate_flags; xfs_qoff_logitem_t *qoffstart; @@ -244,9 +236,9 @@ xfs_qm_scall_quotaoff( if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { mp->m_qflags &= ~(flags); - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = mp->m_qflags; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); /* XXX what to do if error ? Revert back to old vals incore ? */ @@ -287,9 +279,12 @@ xfs_qm_scall_quotaoff( /* * Write the LI_QUOTAOFF log record, and do SB changes atomically, - * and synchronously. + * and synchronously. If we fail to write, we should abort the + * operation as it cannot be recovered safely if we crash. */ - xfs_qm_log_quotaoff(mp, &qoffstart, flags); + error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); + if (error) + goto out_error; /* * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct @@ -345,7 +340,12 @@ xfs_qm_scall_quotaoff( * So, we have QUOTAOFF start and end logitems; the start * logitem won't get overwritten until the end logitem appears... */ - xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + if (error) { + /* We're screwed now. Shutdown is the only option. */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_error; + } /* * If quotas is completely disabled, close shop. @@ -362,13 +362,14 @@ xfs_qm_scall_quotaoff( * if we don't need them anymore. */ if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { - XFS_PURGE_INODE(XFS_QI_UQIP(mp)); + IRELE(XFS_QI_UQIP(mp)); XFS_QI_UQIP(mp) = NULL; } if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { - XFS_PURGE_INODE(XFS_QI_GQIP(mp)); + IRELE(XFS_QI_GQIP(mp)); XFS_QI_GQIP(mp) = NULL; } +out_error: mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); return (error); @@ -379,35 +380,34 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error; + int error = 0, error2 = 0; xfs_inode_t *qip; if (!capable(CAP_SYS_ADMIN)) return XFS_ERROR(EPERM); - error = 0; - if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) { + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); return XFS_ERROR(EINVAL); } if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); - if (! error) { - (void) xfs_truncate_file(mp, qip); - VN_RELE(XFS_ITOV(qip)); + if (!error) { + error = xfs_truncate_file(mp, qip); + IRELE(qip); } } if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && mp->m_sb.sb_gquotino != NULLFSINO) { - error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); - if (! error) { - (void) xfs_truncate_file(mp, qip); - VN_RELE(XFS_ITOV(qip)); + error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); + if (!error2) { + error2 = xfs_truncate_file(mp, qip); + IRELE(qip); } } - return (error); + return error ? error : error2; } @@ -422,7 +422,6 @@ xfs_qm_scall_quotaon( uint flags) { int error; - unsigned long s; uint qf; uint accflags; __int64_t sbflags; @@ -458,9 +457,7 @@ xfs_qm_scall_quotaon( || ((flags & XFS_PQUOTA_ACCT) == 0 && (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && - (flags & XFS_OQUOTA_ENFD)) - || - ((flags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ACCT) == 0 && (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && (flags & XFS_OQUOTA_ENFD))) { qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", @@ -477,10 +474,10 @@ xfs_qm_scall_quotaon( * Change sb_qflags on disk but not incore mp->qflags * if this is the root filesystem. */ - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); qf = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = qf | flags; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); /* * There's nothing to change if it's the same. @@ -533,7 +530,7 @@ xfs_qm_scall_getqstat( memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; - if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) { + if (!xfs_sb_version_hasquota(&mp->m_sb)) { out->qs_uquota.qfs_ino = NULLFSINO; out->qs_gquota.qfs_ino = NULLFSINO; return (0); @@ -563,13 +560,13 @@ xfs_qm_scall_getqstat( out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; if (tempuqip) - VN_RELE(XFS_ITOV(uip)); + IRELE(uip); } if (gip) { out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; if (tempgqip) - VN_RELE(XFS_ITOV(gip)); + IRELE(gip); } if (mp->m_quotainfo) { out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); @@ -737,12 +734,12 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); - xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (0); + return error; } STATIC int @@ -811,7 +808,7 @@ xfs_qm_log_quotaoff_end( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); return (error); } @@ -824,7 +821,6 @@ xfs_qm_log_quotaoff( { xfs_trans_t *tp; int error; - unsigned long s; xfs_qoff_logitem_t *qoffi=NULL; uint oldsbqflag=0; @@ -841,10 +837,10 @@ xfs_qm_log_quotaoff( qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); oldsbqflag = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, XFS_SB_QFLAGS); @@ -854,7 +850,7 @@ xfs_qm_log_quotaoff( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); error0: if (error) { @@ -863,9 +859,9 @@ error0: * No one else is modifying sb_qflags, so this is OK. * We still hold the quotaofflock. */ - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = oldsbqflag; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); } *qoffstartp = qoffi; return (error); @@ -913,14 +909,19 @@ xfs_qm_export_dquot( * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if (! XFS_IS_QUOTA_ENFORCED(mp)) { + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) || + (!XFS_IS_OQUOTA_ENFORCED(mp) && + (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { dst->d_btimer = 0; dst->d_itimer = 0; dst->d_rtbtimer = 0; } #ifdef DEBUG - if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) { + if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || + (XFS_IS_OQUOTA_ENFORCED(mp) && + (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && + dst->d_id != 0) { if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); @@ -1033,7 +1034,7 @@ xfs_qm_dqrele_all_inodes( { xfs_inode_t *ip, *topino; uint ireclaims; - bhv_vnode_t *vp; + struct inode *vp; boolean_t vnode_refd; ASSERT(mp->m_quotainfo); @@ -1058,7 +1059,7 @@ again: ip = ip->i_mnext; continue; } - vp = XFS_ITOV_NULL(ip); + vp = VFS_I(ip); if (!vp) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); @@ -1102,7 +1103,7 @@ again: * inactive code in hell. */ if (vnode_refd) - VN_RELE(vp); + IRELE(ip); XFS_MOUNT_ILOCK(mp); /* * If an inode was inserted or removed, we gotta @@ -1365,12 +1366,6 @@ xfs_qm_internalqcheck_adjust( return (error); } - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, lock_flags); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - /* * This inode can have blocks after eof which can get released * when we send it to inactive. Since we don't check the dquot @@ -1454,14 +1449,14 @@ xfs_qm_internalqcheck( for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { xfs_dqtest_cmp(d); e = (xfs_dqtest_t *) d->HL_NEXT; - kmem_free(d, sizeof(xfs_dqtest_t)); + kmem_free(d); d = e; } h1 = &qmtest_gdqtab[i]; for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { xfs_dqtest_cmp(d); e = (xfs_dqtest_t *) d->HL_NEXT; - kmem_free(d, sizeof(xfs_dqtest_t)); + kmem_free(d); d = e; } } @@ -1472,8 +1467,8 @@ xfs_qm_internalqcheck( } else { cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); } - kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); - kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); + kmem_free(qmtest_udqtab); + kmem_free(qmtest_gdqtab); mutex_unlock(&qcheck_lock); return (qmtest_nfails); } diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index b7ddd04..c4fcea6 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -27,11 +27,6 @@ /* Number of dquots that fit in to a dquot block */ #define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) -#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE | MR_ACCESS) != 0) -#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE) != 0) - #define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) #define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) @@ -75,7 +70,6 @@ static inline int XQMISLCKD(struct xfs_dqhash *h) #define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist)) #define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist)) -#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist)) /* * Hash into a bucket in the dquot hash table, based on <mp, id>. @@ -164,12 +158,8 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ #define XFS_IS_SUSER_DQUOT(dqp) \ (!((dqp)->q_core.d_id)) -#define XFS_PURGE_INODE(ip) \ - IRELE(ip); - #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) -#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY") #endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 0242e96..9961138 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -43,8 +43,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -423,13 +421,13 @@ xfs_trans_apply_dquot_deltas( (xfs_qcnt_t) -qtrx->qt_icount_delta); #endif if (totalbdelta) - be64_add(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); if (qtrx->qt_icount_delta) - be64_add(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); if (totalrtbdelta) - be64_add(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); /* * Get any default limits in use. @@ -658,7 +656,9 @@ xfs_trans_dqresv( if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_core.d_id && - XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) { + ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || + (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && + (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { #ifdef QUOTADEBUG cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); @@ -834,7 +834,7 @@ xfs_trans_reserve_quota_nblks( ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == XFS_TRANS_DQ_RES_RTBLKS || diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 36fbecc..c27abef 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -15,13 +15,10 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <xfs.h> #include "debug.h" -#include "spin.h" -#include <asm/page.h> -#include <linux/sched.h> -#include <linux/kernel.h> -static char message[256]; /* keep it off the stack */ +static char message[1024]; /* keep it off the stack */ static DEFINE_SPINLOCK(xfs_err_lock); /* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ @@ -46,15 +43,15 @@ cmn_err(register int level, char *fmt, ...) spin_lock_irqsave(&xfs_err_lock,flags); va_start(ap, fmt); if (*fmt == '!') fp++; - len = vsprintf(message, fp, ap); - if (level != CE_DEBUG && message[len-1] != '\n') - strcat(message, "\n"); - printk("%s%s", err_level[level], message); + len = vsnprintf(message, sizeof(message), fp, ap); + if (len >= sizeof(message)) + len = sizeof(message) - 1; + if (message[len-1] == '\n') + message[len-1] = 0; + printk("%s%s\n", err_level[level], message); va_end(ap); spin_unlock_irqrestore(&xfs_err_lock,flags); - - if (level == CE_PANIC) - BUG(); + BUG_ON(level == CE_PANIC); } void @@ -67,13 +64,14 @@ icmn_err(register int level, char *fmt, va_list ap) if(level > XFS_MAX_ERR_LEVEL) level = XFS_MAX_ERR_LEVEL; spin_lock_irqsave(&xfs_err_lock,flags); - len = vsprintf(message, fmt, ap); - if (level != CE_DEBUG && message[len-1] != '\n') - strcat(message, "\n"); + len = vsnprintf(message, sizeof(message), fmt, ap); + if (len >= sizeof(message)) + len = sizeof(message) - 1; + if (message[len-1] == '\n') + message[len-1] = 0; + printk("%s%s\n", err_level[level], message); spin_unlock_irqrestore(&xfs_err_lock,flags); - printk("%s%s", err_level[level], message); - if (level == CE_PANIC) - BUG(); + BUG_ON(level == CE_PANIC); } void @@ -83,19 +81,8 @@ assfail(char *expr, char *file, int line) BUG(); } -#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM)) -unsigned long random(void) +void +xfs_hex_dump(void *p, int length) { - static unsigned long RandomValue = 1; - /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ - register long rv = RandomValue; - register long lo; - register long hi; - - hi = rv / 127773; - lo = rv % 127773; - rv = 16807 * lo - 2836 * hi; - if (rv <= 0) rv += 2147483647; - return RandomValue = rv; + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } -#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */ diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 4f54dca..75845f9 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -34,17 +34,41 @@ extern void cmn_err(int, char *, ...) extern void assfail(char *expr, char *f, int l); #define ASSERT_ALWAYS(expr) \ - (unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef DEBUG -# define ASSERT(expr) ((void)0) -#else -# define ASSERT(expr) ASSERT_ALWAYS(expr) -extern unsigned long random(void); +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#ifndef STATIC_INLINE +# define STATIC_INLINE static inline #endif +#else /* DEBUG */ + +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + #ifndef STATIC -# define STATIC static +# define STATIC noinline #endif +/* + * We stop inlining of inline functions in debug mode. + * Unfortunately, this means static inline in header files + * get multiple definitions, so they need to remain static. + * This then gives tonnes of warnings about unused but defined + * functions, so we need to add the unused attribute to prevent + * these spurious warnings. + */ +#ifndef STATIC_INLINE +# define STATIC_INLINE static __attribute__ ((unused)) noinline +#endif + +#endif /* DEBUG */ + + #endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c index addf5a7..a34ef05 100644 --- a/fs/xfs/support/ktrace.c +++ b/fs/xfs/support/ktrace.c @@ -21,10 +21,10 @@ static kmem_zone_t *ktrace_hdr_zone; static kmem_zone_t *ktrace_ent_zone; static int ktrace_zentries; -void +void __init ktrace_init(int zentries) { - ktrace_zentries = zentries; + ktrace_zentries = roundup_pow_of_two(zentries); ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), "ktrace_hdr"); @@ -36,7 +36,7 @@ ktrace_init(int zentries) ASSERT(ktrace_ent_zone); } -void +void __exit ktrace_uninit(void) { kmem_zone_destroy(ktrace_hdr_zone); @@ -47,13 +47,16 @@ ktrace_uninit(void) * ktrace_alloc() * * Allocate a ktrace header and enough buffering for the given - * number of entries. + * number of entries. Round the number of entries up to a + * power of 2 so we can do fast masking to get the index from + * the atomic index counter. */ ktrace_t * ktrace_alloc(int nentries, unsigned int __nocast sleep) { ktrace_t *ktp; ktrace_entry_t *ktep; + int entries; ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); @@ -70,12 +73,13 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) /* * Special treatment for buffers with the ktrace_zentries entries */ - if (nentries == ktrace_zentries) { + entries = roundup_pow_of_two(nentries); + if (entries == ktrace_zentries) { ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, sleep); } else { - ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), - sleep); + ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)), + sleep | KM_LARGE); } if (ktep == NULL) { @@ -85,16 +89,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) if (sleep & KM_SLEEP) panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); - kmem_free(ktp, sizeof(*ktp)); + kmem_free(ktp); return NULL; } - spinlock_init(&(ktp->kt_lock), "kt_lock"); - ktp->kt_entries = ktep; - ktp->kt_nentries = nentries; - ktp->kt_index = 0; + ktp->kt_nentries = entries; + ASSERT(is_power_of_2(entries)); + ktp->kt_index_mask = entries - 1; + atomic_set(&ktp->kt_index, 0); ktp->kt_rollover = 0; return ktp; } @@ -114,8 +118,6 @@ ktrace_free(ktrace_t *ktp) if (ktp == (ktrace_t *)NULL) return; - spinlock_destroy(&ktp->kt_lock); - /* * Special treatment for the Vnode trace buffer. */ @@ -124,7 +126,7 @@ ktrace_free(ktrace_t *ktp) } else { entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); - kmem_free(ktp->kt_entries, entries_size); + kmem_free(ktp->kt_entries); } kmem_zone_free(ktrace_hdr_zone, ktp); @@ -155,8 +157,6 @@ ktrace_enter( void *val14, void *val15) { - static DEFINE_SPINLOCK(wrap_lock); - unsigned long flags; int index; ktrace_entry_t *ktep; @@ -165,12 +165,8 @@ ktrace_enter( /* * Grab an entry by pushing the index up to the next one. */ - spin_lock_irqsave(&wrap_lock, flags); - index = ktp->kt_index; - if (++ktp->kt_index == ktp->kt_nentries) - ktp->kt_index = 0; - spin_unlock_irqrestore(&wrap_lock, flags); - + index = atomic_add_return(1, &ktp->kt_index); + index = (index - 1) & ktp->kt_index_mask; if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) ktp->kt_rollover = 1; @@ -203,11 +199,12 @@ int ktrace_nentries( ktrace_t *ktp) { - if (ktp == NULL) { + int index; + if (ktp == NULL) return 0; - } - return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index); + index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; + return (ktp->kt_rollover ? ktp->kt_nentries : index); } /* @@ -232,7 +229,7 @@ ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp) int nentries; if (ktp->kt_rollover) - index = ktp->kt_index; + index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; else index = 0; diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h index 0d73216..741d694 100644 --- a/fs/xfs/support/ktrace.h +++ b/fs/xfs/support/ktrace.h @@ -18,8 +18,6 @@ #ifndef __XFS_SUPPORT_KTRACE_H__ #define __XFS_SUPPORT_KTRACE_H__ -#include <spin.h> - /* * Trace buffer entry structure. */ @@ -31,9 +29,9 @@ typedef struct ktrace_entry { * Trace buffer header structure. */ typedef struct ktrace { - lock_t kt_lock; /* mutex to guard counters */ int kt_nentries; /* number of entries in trace buf */ - int kt_index; /* current index in entries */ + atomic_t kt_index; /* current index in entries */ + unsigned int kt_index_mask; int kt_rollover; ktrace_entry_t *kt_entries; /* buffer of entries */ } ktrace_t; diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c deleted file mode 100644 index caefa17..0000000 --- a/fs/xfs/support/move.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <xfs.h> - -/* Read from kernel buffer at src to user/kernel buffer defined - * by the uio structure. Advance the pointer in the uio struct - * as we go. - */ -int -uio_read(caddr_t src, size_t len, struct uio *uio) -{ - size_t count; - - if (!len || !uio->uio_resid) - return 0; - - count = uio->uio_iov->iov_len; - if (!count) - return 0; - if (count > len) - count = len; - - if (uio->uio_segflg == UIO_USERSPACE) { - if (copy_to_user(uio->uio_iov->iov_base, src, count)) - return EFAULT; - } else { - ASSERT(uio->uio_segflg == UIO_SYSSPACE); - memcpy(uio->uio_iov->iov_base, src, count); - } - - uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count); - uio->uio_iov->iov_len -= count; - uio->uio_offset += count; - uio->uio_resid -= count; - return 0; -} diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h deleted file mode 100644 index 97a2498..0000000 --- a/fs/xfs/support/move.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Portions Copyright (c) 1982, 1986, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -#ifndef __XFS_SUPPORT_MOVE_H__ -#define __XFS_SUPPORT_MOVE_H__ - -#include <linux/uio.h> -#include <asm/uaccess.h> - -/* Segment flag values. */ -enum uio_seg { - UIO_USERSPACE, /* from user data space */ - UIO_SYSSPACE, /* from system space */ -}; - -struct uio { - struct iovec *uio_iov; /* pointer to array of iovecs */ - int uio_iovcnt; /* number of iovecs in array */ - xfs_off_t uio_offset; /* offset in file this uio corresponds to */ - int uio_resid; /* residual i/o count */ - enum uio_seg uio_segflg; /* see above */ -}; - -typedef struct uio uio_t; -typedef struct iovec iovec_t; - -extern int uio_read (caddr_t, size_t, uio_t *); - -#endif /* __XFS_SUPPORT_MOVE_H__ */ diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c index e157015..5830c04 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/support/uuid.c @@ -17,7 +17,7 @@ */ #include <xfs.h> -static mutex_t uuid_monitor; +static DEFINE_MUTEX(uuid_monitor); static int uuid_table_size; static uuid_t *uuid_table; @@ -132,9 +132,3 @@ uuid_table_remove(uuid_t *uuid) ASSERT(i < uuid_table_size); mutex_unlock(&uuid_monitor); } - -void -uuid_init(void) -{ - mutex_init(&uuid_monitor); -} diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h index b6f5922..cff5b60 100644 --- a/fs/xfs/support/uuid.h +++ b/fs/xfs/support/uuid.h @@ -22,7 +22,6 @@ typedef struct { unsigned char __u_bits[16]; } uuid_t; -extern void uuid_init(void); extern void uuid_create_nil(uuid_t *uuid); extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 1a48dbb..540e4c9 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -17,5 +17,29 @@ */ #ifndef __XFS_H__ #define __XFS_H__ + +#ifdef CONFIG_XFS_DEBUG +#define STATIC +#define DEBUG 1 +#define XFS_BUF_LOCK_TRACKING 1 +/* #define QUOTADEBUG 1 */ +#endif + +#ifdef CONFIG_XFS_TRACE +#define XFS_ALLOC_TRACE 1 +#define XFS_ATTR_TRACE 1 +#define XFS_BLI_TRACE 1 +#define XFS_BMAP_TRACE 1 +#define XFS_BMBT_TRACE 1 +#define XFS_DIR2_TRACE 1 +#define XFS_DQUOT_TRACE 1 +#define XFS_ILOCK_TRACE 1 +#define XFS_LOG_TRACE 1 +#define XFS_RW_TRACE 1 +#define XFS_BUF_TRACE 1 +#define XFS_INODE_TRACE 1 +#define XFS_FILESTREAMS_TRACE 1 +#endif + #include <linux-2.6/xfs_linux.h> #endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4b0cb47..b2f639a 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -31,21 +31,21 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_acl.h" -#include "xfs_mac.h" #include "xfs_attr.h" +#include "xfs_vnodeops.h" #include <linux/capability.h> #include <linux/posix_acl_xattr.h> -STATIC int xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *); +STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *); STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); STATIC void xfs_acl_get_endian(xfs_acl_t *); STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); STATIC int xfs_acl_invalid(xfs_acl_t *); STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); -STATIC void xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *); -STATIC void xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *); -STATIC int xfs_acl_allow_set(bhv_vnode_t *, int); +STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *); +STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *); +STATIC int xfs_acl_allow_set(struct inode *, int); kmem_zone_t *xfs_acl_zone; @@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone; */ int xfs_acl_vhasacl_access( - bhv_vnode_t *vp) + struct inode *vp) { int error; @@ -68,11 +68,11 @@ xfs_acl_vhasacl_access( */ int xfs_acl_vhasacl_default( - bhv_vnode_t *vp) + struct inode *vp) { int error; - if (!VN_ISDIR(vp)) + if (!S_ISDIR(vp->i_mode)) return 0; xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error); return (error == 0); @@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr( int xfs_acl_vget( - bhv_vnode_t *vp, + struct inode *vp, void *acl, size_t size, int kind) @@ -217,7 +217,6 @@ xfs_acl_vget( posix_acl_xattr_header *ext_acl = acl; int flags = 0; - VN_HOLD(vp); if(size) { if (!(_ACL_ALLOC(xfs_acl))) { error = ENOMEM; @@ -238,19 +237,11 @@ xfs_acl_vget( error = EINVAL; goto out; } - if (kind == _ACL_TYPE_ACCESS) { - bhv_vattr_t va; - - va.va_mask = XFS_AT_MODE; - error = bhv_vop_getattr(vp, &va, 0, sys_cred); - if (error) - goto out; - xfs_acl_sync_mode(va.va_mode, xfs_acl); - } + if (kind == _ACL_TYPE_ACCESS) + xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl); error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); } out: - VN_RELE(vp); if(xfs_acl) _ACL_FREE(xfs_acl); return -error; @@ -258,27 +249,26 @@ out: int xfs_acl_vremove( - bhv_vnode_t *vp, + struct inode *vp, int kind) { int error; - VN_HOLD(vp); error = xfs_acl_allow_set(vp, kind); if (!error) { - error = bhv_vop_attr_remove(vp, kind == _ACL_TYPE_DEFAULT? + error = xfs_attr_remove(XFS_I(vp), + kind == _ACL_TYPE_DEFAULT? SGI_ACL_DEFAULT: SGI_ACL_FILE, - ATTR_ROOT, sys_cred); + ATTR_ROOT); if (error == ENOATTR) error = 0; /* 'scool */ } - VN_RELE(vp); return -error; } int xfs_acl_vset( - bhv_vnode_t *vp, + struct inode *vp, void *acl, size_t size, int kind) @@ -304,14 +294,14 @@ xfs_acl_vset( return 0; } - VN_HOLD(vp); error = xfs_acl_allow_set(vp, kind); - if (error) - goto out; /* Incoming ACL exists, set file mode based on its value */ - if (kind == _ACL_TYPE_ACCESS) - xfs_acl_setmode(vp, xfs_acl, &basicperms); + if (!error && kind == _ACL_TYPE_ACCESS) + error = xfs_acl_setmode(vp, xfs_acl, &basicperms); + + if (error) + goto out; /* * If we have more than std unix permissions, set up the actual attr. @@ -322,11 +312,10 @@ xfs_acl_vset( if (!basicperms) { xfs_acl_set_attr(vp, xfs_acl, kind, &error); } else { - xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); + error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); } out: - VN_RELE(vp); _ACL_FREE(xfs_acl); return -error; } @@ -339,14 +328,14 @@ xfs_acl_iaccess( { xfs_acl_t *acl; int rval; + struct xfs_name acl_name = {SGI_ACL_FILE, SGI_ACL_FILE_SIZE}; if (!(_ACL_ALLOC(acl))) return -1; /* If the file has no ACL return -1. */ rval = sizeof(xfs_acl_t); - if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE, - (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) { + if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) { _ACL_FREE(acl); return -1; } @@ -368,50 +357,17 @@ xfs_acl_iaccess( STATIC int xfs_acl_allow_set( - bhv_vnode_t *vp, + struct inode *vp, int kind) { - bhv_vattr_t va; - int error; - - if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) + if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) return EPERM; - if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp)) + if (kind == _ACL_TYPE_DEFAULT && !S_ISDIR(vp->i_mode)) return ENOTDIR; - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + if (vp->i_sb->s_flags & MS_RDONLY) return EROFS; - va.va_mask = XFS_AT_UID; - error = bhv_vop_getattr(vp, &va, 0, NULL); - if (error) - return error; - if (va.va_uid != current->fsuid && !capable(CAP_FOWNER)) + if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) return EPERM; - return error; -} - -/* - * The access control process to determine the access permission: - * if uid == file owner id, use the file owner bits. - * if gid == file owner group id, use the file group bits. - * scan ACL for a matching user or group, and use matched entry - * permission. Use total permissions of all matching group entries, - * until all acl entries are exhausted. The final permission produced - * by matching acl entry or entries needs to be & with group permission. - * if not owner, owning group, or matching entry in ACL, use file - * other bits. - */ -STATIC int -xfs_acl_capability_check( - mode_t mode, - cred_t *cr) -{ - if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH)) - return EACCES; - if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) - return EACCES; - if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) - return EACCES; - return 0; } @@ -436,7 +392,6 @@ xfs_acl_access( matched.ae_tag = 0; /* Invalid type */ matched.ae_perm = 0; - md >>= 6; /* Normalize the bits for comparison */ for (i = 0; i < fap->acl_cnt; i++) { /* @@ -518,7 +473,8 @@ xfs_acl_access( break; } - return xfs_acl_capability_check(md, cr); + /* EACCES tells generic_permission to check for capability overrides */ + return EACCES; } /* @@ -604,7 +560,7 @@ xfs_acl_get_endian( */ STATIC void xfs_acl_get_attr( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *aclp, int kind, int flags, @@ -614,9 +570,10 @@ xfs_acl_get_attr( ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); flags |= ATTR_ROOT; - *error = bhv_vop_attr_get(vp, kind == _ACL_TYPE_ACCESS ? + *error = xfs_attr_get(XFS_I(vp), + kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT, - (char *)aclp, &len, flags, sys_cred); + (char *)aclp, &len, flags); if (*error || (flags & ATTR_KERNOVAL)) return; xfs_acl_get_endian(aclp); @@ -627,7 +584,7 @@ xfs_acl_get_attr( */ STATIC void xfs_acl_set_attr( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *aclp, int kind, int *error) @@ -652,19 +609,19 @@ xfs_acl_set_attr( INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); } INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); - *error = bhv_vop_attr_set(vp, kind == _ACL_TYPE_ACCESS ? + *error = xfs_attr_set(XFS_I(vp), + kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT, - (char *)newacl, len, ATTR_ROOT, sys_cred); + (char *)newacl, len, ATTR_ROOT); _ACL_FREE(newacl); } int xfs_acl_vtoacl( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *access_acl, xfs_acl_t *default_acl) { - bhv_vattr_t va; int error = 0; if (access_acl) { @@ -673,16 +630,10 @@ xfs_acl_vtoacl( * be obtained for some reason, invalidate the access ACL. */ xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error); - if (!error) { - /* Got the ACL, need the mode... */ - va.va_mask = XFS_AT_MODE; - error = bhv_vop_getattr(vp, &va, 0, sys_cred); - } - if (error) access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; else /* We have a good ACL and the file mode, synchronize. */ - xfs_acl_sync_mode(va.va_mode, access_acl); + xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl); } if (default_acl) { @@ -699,8 +650,8 @@ xfs_acl_vtoacl( */ int xfs_acl_inherit( - bhv_vnode_t *vp, - bhv_vattr_t *vap, + struct inode *vp, + mode_t mode, xfs_acl_t *pdaclp) { xfs_acl_t *cacl; @@ -728,8 +679,10 @@ xfs_acl_inherit( return ENOMEM; memcpy(cacl, pdaclp, sizeof(xfs_acl_t)); - xfs_acl_filter_mode(vap->va_mode, cacl); - xfs_acl_setmode(vp, cacl, &basicperms); + xfs_acl_filter_mode(mode, cacl); + error = xfs_acl_setmode(vp, cacl, &basicperms); + if (error) + goto out_error; /* * Set the Default and Access ACL on the file. The mode is already @@ -738,10 +691,11 @@ xfs_acl_inherit( * If the new file is a directory, its default ACL is a copy of * the containing directory's default ACL. */ - if (VN_ISDIR(vp)) + if (S_ISDIR(vp->i_mode)) xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); if (!error && !basicperms) xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); +out_error: _ACL_FREE(cacl); return error; } @@ -755,14 +709,14 @@ xfs_acl_inherit( */ STATIC int xfs_acl_setmode( - bhv_vnode_t *vp, + struct inode *vp, xfs_acl_t *acl, int *basicperms) { - bhv_vattr_t va; + struct iattr iattr; xfs_acl_entry_t *ap; xfs_acl_entry_t *gap = NULL; - int i, error, nomask = 1; + int i, nomask = 1; *basicperms = 1; @@ -773,29 +727,25 @@ xfs_acl_setmode( * Copy the u::, g::, o::, and m:: bits from the ACL into the * mode. The m:: bits take precedence over the g:: bits. */ - va.va_mask = XFS_AT_MODE; - error = bhv_vop_getattr(vp, &va, 0, sys_cred); - if (error) - return error; - - va.va_mask = XFS_AT_MODE; - va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); + iattr.ia_valid = ATTR_MODE; + iattr.ia_mode = XFS_I(vp)->i_d.di_mode; + iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); ap = acl->acl_entry; for (i = 0; i < acl->acl_cnt; ++i) { switch (ap->ae_tag) { case ACL_USER_OBJ: - va.va_mode |= ap->ae_perm << 6; + iattr.ia_mode |= ap->ae_perm << 6; break; case ACL_GROUP_OBJ: gap = ap; break; case ACL_MASK: /* more than just standard modes */ nomask = 0; - va.va_mode |= ap->ae_perm << 3; + iattr.ia_mode |= ap->ae_perm << 3; *basicperms = 0; break; case ACL_OTHER: - va.va_mode |= ap->ae_perm; + iattr.ia_mode |= ap->ae_perm; break; default: /* more than just standard modes */ *basicperms = 0; @@ -806,9 +756,9 @@ xfs_acl_setmode( /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */ if (gap && nomask) - va.va_mode |= gap->ae_perm << 3; + iattr.ia_mode |= gap->ae_perm << 3; - return bhv_vop_setattr(vp, &va, 0, sys_cred); + return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); } /* diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index f853cf1..a4e293b 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -46,11 +46,12 @@ typedef struct xfs_acl { #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) +#define _ACL_TYPE_ACCESS 1 +#define _ACL_TYPE_DEFAULT 2 #ifdef CONFIG_XFS_POSIX_ACL struct vattr; -struct bhv_vnode; struct xfs_inode; extern struct kmem_zone *xfs_acl_zone; @@ -58,25 +59,22 @@ extern struct kmem_zone *xfs_acl_zone; (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) #define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) -extern int xfs_acl_inherit(struct bhv_vnode *, struct bhv_vattr *, xfs_acl_t *); +extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *); extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); -extern int xfs_acl_vtoacl(struct bhv_vnode *, xfs_acl_t *, xfs_acl_t *); -extern int xfs_acl_vhasacl_access(struct bhv_vnode *); -extern int xfs_acl_vhasacl_default(struct bhv_vnode *); -extern int xfs_acl_vset(struct bhv_vnode *, void *, size_t, int); -extern int xfs_acl_vget(struct bhv_vnode *, void *, size_t, int); -extern int xfs_acl_vremove(struct bhv_vnode *, int); +extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *); +extern int xfs_acl_vhasacl_access(struct inode *); +extern int xfs_acl_vhasacl_default(struct inode *); +extern int xfs_acl_vset(struct inode *, void *, size_t, int); +extern int xfs_acl_vget(struct inode *, void *, size_t, int); +extern int xfs_acl_vremove(struct inode *, int); -#define _ACL_TYPE_ACCESS 1 -#define _ACL_TYPE_DEFAULT 2 #define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) -#define _ACL_INHERIT(c,v,d) (xfs_acl_inherit(c,v,d)) +#define _ACL_INHERIT(c,m,d) (xfs_acl_inherit(c,m,d)) #define _ACL_GET_ACCESS(pv,pa) (xfs_acl_vtoacl(pv,pa,NULL) == 0) #define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0) #define _ACL_ACCESS_EXISTS xfs_acl_vhasacl_access #define _ACL_DEFAULT_EXISTS xfs_acl_vhasacl_default -#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1) #define _ACL_ALLOC(a) ((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP)) #define _ACL_FREE(a) ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0) @@ -91,12 +89,11 @@ extern int xfs_acl_vremove(struct bhv_vnode *, int); #define xfs_acl_vhasacl_default(v) (0) #define _ACL_ALLOC(a) (1) /* successfully allocate nothing */ #define _ACL_FREE(a) ((void)0) -#define _ACL_INHERIT(c,v,d) (0) +#define _ACL_INHERIT(c,m,d) (0) #define _ACL_GET_ACCESS(pv,pa) (0) #define _ACL_GET_DEFAULT(pv,pd) (0) #define _ACL_ACCESS_EXISTS (NULL) #define _ACL_DEFAULT_EXISTS (NULL) -#define _ACL_XFS_IACCESS(i,m,c) (-1) #endif #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index dc2361d..61b292a 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -68,6 +68,7 @@ typedef struct xfs_agf { __be32 agf_flcount; /* count of blocks in freelist */ __be32 agf_freeblks; /* total free blocks */ __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ } xfs_agf_t; #define XFS_AGF_MAGICNUM 0x00000001 @@ -81,7 +82,8 @@ typedef struct xfs_agf { #define XFS_AGF_FLCOUNT 0x00000100 #define XFS_AGF_FREEBLKS 0x00000200 #define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_NUM_BITS 11 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_NUM_BITS 12 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) /* disk block (xfs_daddr_t) in the AG */ @@ -150,7 +152,7 @@ typedef struct xfs_agi { #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) typedef struct xfs_agfl { - xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ + __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ } xfs_agfl_t; /* @@ -186,12 +188,19 @@ typedef struct xfs_perag __uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + int pagb_count; /* pagb slots in use */ #ifdef __KERNEL__ - lock_t pagb_lock; /* lock for pagb_list */ + spinlock_t pagb_lock; /* lock for pagb_list */ #endif - int pagb_count; /* pagb slots in use */ xfs_perag_busy_t *pagb_list; /* unstable blocks */ + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + int pag_ici_init; /* incore inode cache initialised */ + rwlock_t pag_ici_lock; /* incore inode lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ } xfs_perag_t; #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index d2bbcd8..1956f83 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -45,7 +45,7 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC int +STATIC void xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, @@ -55,24 +55,24 @@ xfs_alloc_search_busy(xfs_trans_t *tp, ktrace_t *xfs_alloc_trace_buf; #define TRACE_ALLOC(s,a) \ - xfs_alloc_trace_alloc(fname, s, a, __LINE__) + xfs_alloc_trace_alloc(__func__, s, a, __LINE__) #define TRACE_FREE(s,a,b,x,f) \ - xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__) + xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__) #define TRACE_MODAGF(s,a,f) \ - xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__) -#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) -#define TRACE_UNBUSY(fname,s,ag,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) -#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) + xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__) +#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) +#define TRACE_UNBUSY(__func__,s,ag,sl,tp) \ + xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) +#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \ + xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) #else #define TRACE_ALLOC(s,a) #define TRACE_FREE(s,a,b,x,f) #define TRACE_MODAGF(s,a,f) #define TRACE_BUSY(s,a,ag,agb,l,sl,tp) #define TRACE_UNBUSY(fname,s,ag,sl,tp) -#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) +#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp) #endif /* XFS_ALLOC_TRACE */ /* @@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, * Compute aligned version of the found extent. * Takes alignment and min length into account. */ -STATIC int /* success (>= minlen) */ +STATIC void xfs_alloc_compute_aligned( xfs_agblock_t foundbno, /* starting block in found extent */ xfs_extlen_t foundlen, /* length in found extent */ @@ -116,7 +116,6 @@ xfs_alloc_compute_aligned( } *resbno = bno; *reslen = len; - return len >= minlen; } /* @@ -420,7 +419,7 @@ xfs_alloc_read_agfl( */ STATIC void xfs_alloc_trace_alloc( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_alloc_arg_t *args, /* allocation argument structure */ int line) /* source line number */ @@ -453,7 +452,7 @@ xfs_alloc_trace_alloc( */ STATIC void xfs_alloc_trace_free( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_mount_t *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ @@ -479,7 +478,7 @@ xfs_alloc_trace_free( */ STATIC void xfs_alloc_trace_modagf( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_mount_t *mp, /* file system mount point */ xfs_agf_t *agf, /* new agf value */ @@ -507,7 +506,7 @@ xfs_alloc_trace_modagf( STATIC void xfs_alloc_trace_busy( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_mount_t *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ @@ -549,9 +548,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent"; -#endif ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -595,7 +591,7 @@ xfs_alloc_ag_vextent( if (!(args->wasfromfl)) { agf = XFS_BUF_TO_AGF(args->agbp); - be32_add(&agf->agf_freeblks, -(args->len)); + be32_add_cpu(&agf->agf_freeblks, -(args->len)); xfs_trans_agblocks_delta(args->tp, -((long)(args->len))); args->pag->pagf_freeblks -= args->len; @@ -635,9 +631,6 @@ xfs_alloc_ag_vextent_exact( xfs_agblock_t fbno; /* start block of found extent */ xfs_agblock_t fend; /* end block of found extent */ xfs_extlen_t flen; /* length of found extent */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_exact"; -#endif int i; /* success/failure of operation */ xfs_agblock_t maxend; /* end of maximal extent */ xfs_agblock_t minend; /* end of minimal extent */ @@ -737,9 +730,6 @@ xfs_alloc_ag_vextent_near( xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_near"; -#endif xfs_agblock_t gtbno; /* start bno of right side entry */ xfs_agblock_t gtbnoa; /* aligned ... */ xfs_extlen_t gtdiff; /* difference to right side entry */ @@ -764,7 +754,7 @@ xfs_alloc_ag_vextent_near( */ int dofirst; /* set to do first algorithm */ - dofirst = random() & 1; + dofirst = random32() & 1; #endif /* * Get a cursor for the by-size btree. @@ -846,9 +836,9 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (!xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena)) + xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, + args->minlen, <bnoa, <lena); + if (ltlena < args->minlen) continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); @@ -967,9 +957,9 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena)) + xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, + args->minlen, <bnoa, <lena); + if (ltlena >= args->minlen) break; if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) goto error0; @@ -983,9 +973,9 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (xfs_alloc_compute_aligned(gtbno, gtlen, - args->alignment, args->minlen, - >bnoa, >lena)) + xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, + args->minlen, >bnoa, >lena); + if (gtlena >= args->minlen) break; if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) goto error0; @@ -1270,9 +1260,6 @@ xfs_alloc_ag_vextent_size( int error; /* error result */ xfs_agblock_t fbno; /* start of found freespace */ xfs_extlen_t flen; /* length of found freespace */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_size"; -#endif int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ @@ -1427,9 +1414,6 @@ xfs_alloc_ag_vextent_small( int error; xfs_agblock_t fbno; xfs_extlen_t flen; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_small"; -#endif int i; if ((error = xfs_alloc_decrement(ccur, 0, &i))) @@ -1447,7 +1431,8 @@ xfs_alloc_ag_vextent_small( else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { - if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno))) + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) goto error0; if (fbno != NULLAGBLOCK) { if (args->userdata) { @@ -1477,8 +1462,10 @@ xfs_alloc_ag_vextent_small( /* * Can't allocate from the freelist for some reason. */ - else + else { + fbno = NULLAGBLOCK; flen = 0; + } /* * Can't do the allocation, give up. */ @@ -1513,9 +1500,6 @@ xfs_free_ag_extent( xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ int error; /* error return value */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_free_ag_extent"; -#endif xfs_agblock_t gtbno; /* start of right neighbor block */ xfs_extlen_t gtlen; /* length of right neighbor block */ int haveleft; /* have a left neighbor block */ @@ -1735,7 +1719,7 @@ xfs_free_ag_extent( agf = XFS_BUF_TO_AGF(agbp); pag = &mp->m_perag[agno]; - be32_add(&agf->agf_freeblks, len); + be32_add_cpu(&agf->agf_freeblks, len); xfs_trans_agblocks_delta(tp, len); pag->pagf_freeblks += len; XFS_WANT_CORRUPTED_GOTO( @@ -1921,7 +1905,8 @@ xfs_alloc_fix_freelist( while (be32_to_cpu(agf->agf_flcount) > need) { xfs_buf_t *bp; - if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) + error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + if (error) return error; if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) return error; @@ -1971,8 +1956,9 @@ xfs_alloc_fix_freelist( * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp, - bno))) + error = xfs_alloc_put_freelist(tp, agbp, + agflbp, bno, 0); + if (error) return error; } } @@ -1989,16 +1975,15 @@ int /* error */ xfs_alloc_get_freelist( xfs_trans_t *tp, /* transaction pointer */ xfs_buf_t *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop) /* block address retrieved from freelist */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk) /* destination is a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ xfs_agfl_t *agfl; /* a.g. freelist structure */ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ int error; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_get_freelist"; -#endif + int logflags; xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ @@ -2021,17 +2006,25 @@ xfs_alloc_get_freelist( /* * Get the block number and update the data structures. */ - bno = INT_GET(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)], ARCH_CONVERT); - be32_add(&agf->agf_flfirst, 1); + bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) agf->agf_flfirst = 0; pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; - be32_add(&agf->agf_flcount, -1); + be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; - TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); + + logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, 1); + pag->pagf_btreeblks++; + logflags |= XFS_AGF_BTREEBLKS; + } + + TRACE_MODAGF(NULL, agf, logflags); + xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; /* @@ -2069,6 +2062,7 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_flcount), offsetof(xfs_agf_t, agf_freeblks), offsetof(xfs_agf_t, agf_longest), + offsetof(xfs_agf_t, agf_btreeblks), sizeof(xfs_agf_t) }; @@ -2104,15 +2098,14 @@ xfs_alloc_put_freelist( xfs_trans_t *tp, /* transaction pointer */ xfs_buf_t *agbp, /* buffer for a.g. freelist header */ xfs_buf_t *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno) /* block being freed */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk) /* block came from a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ xfs_agfl_t *agfl; /* a.g. free block array */ - xfs_agblock_t *blockp;/* pointer to array entry */ + __be32 *blockp;/* pointer to array entry */ int error; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_put_freelist"; -#endif + int logflags; xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ @@ -2123,18 +2116,29 @@ xfs_alloc_put_freelist( be32_to_cpu(agf->agf_seqno), &agflbp))) return error; agfl = XFS_BUF_TO_AGFL(agflbp); - be32_add(&agf->agf_fllast, 1); + be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; - be32_add(&agf->agf_flcount, 1); + be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; + + logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, -1); + pag->pagf_btreeblks--; + logflags |= XFS_AGF_BTREEBLKS; + } + + TRACE_MODAGF(NULL, agf, logflags); + xfs_alloc_log_agf(tp, agbp, logflags); + ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; - INT_SET(*blockp, ARCH_CONVERT, bno); - TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); + *blockp = cpu_to_be32(bno); + TRACE_MODAGF(NULL, agf, logflags); + xfs_alloc_log_agf(tp, agbp, logflags); xfs_trans_log_buf(tp, agflbp, (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + @@ -2194,13 +2198,14 @@ xfs_alloc_read_agf( pag = &mp->m_perag[agno]; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); pag->pagf_longest = be32_to_cpu(agf->agf_longest); pag->pagf_levels[XFS_BTNUM_BNOi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - spinlock_init(&pag->pagb_lock, "xfspagb"); + spin_lock_init(&pag->pagb_lock); pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * sizeof(xfs_perag_busy_t), KM_SLEEP); pag->pagf_init = 1; @@ -2233,9 +2238,6 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... locking flags */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_vextent"; -#endif xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ @@ -2497,10 +2499,9 @@ xfs_alloc_mark_busy(xfs_trans_t *tp, xfs_mount_t *mp; xfs_perag_busy_t *bsy; int n; - SPLDECL(s); mp = tp->t_mountp; - s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + spin_lock(&mp->m_perag[agno].pagb_lock); /* search pagb_list for an open slot */ for (bsy = mp->m_perag[agno].pagb_list, n = 0; @@ -2530,7 +2531,7 @@ xfs_alloc_mark_busy(xfs_trans_t *tp, xfs_trans_set_sync(tp); } - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + spin_unlock(&mp->m_perag[agno].pagb_lock); } void @@ -2540,11 +2541,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp, { xfs_mount_t *mp; xfs_perag_busy_t *list; - SPLDECL(s); mp = tp->t_mountp; - s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + spin_lock(&mp->m_perag[agno].pagb_lock); list = mp->m_perag[agno].pagb_list; ASSERT(idx < XFS_PAGB_NUM_SLOTS); @@ -2556,14 +2556,15 @@ xfs_alloc_clear_busy(xfs_trans_t *tp, TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp); } - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + spin_unlock(&mp->m_perag[agno].pagb_lock); } /* - * returns non-zero if any of (agno,bno):len is in a busy list + * If we find the extent in the busy list, force the log out to get the + * extent out of the busy list so the caller can use it straight away. */ -STATIC int +STATIC void xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, @@ -2571,35 +2572,30 @@ xfs_alloc_search_busy(xfs_trans_t *tp, { xfs_mount_t *mp; xfs_perag_busy_t *bsy; - int n; xfs_agblock_t uend, bend; xfs_lsn_t lsn; int cnt; - SPLDECL(s); mp = tp->t_mountp; - s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + spin_lock(&mp->m_perag[agno].pagb_lock); cnt = mp->m_perag[agno].pagb_count; uend = bno + len - 1; /* search pagb_list for this slot, skipping open slots */ - for (bsy = mp->m_perag[agno].pagb_list, n = 0; - cnt; bsy++, n++) { + for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { /* * (start1,length1) within (start2, length2) */ if (bsy->busy_tp != NULL) { bend = bsy->busy_start + bsy->busy_length - 1; - if ((bno > bend) || - (uend < bsy->busy_start)) { + if ((bno > bend) || (uend < bsy->busy_start)) { cnt--; } else { TRACE_BUSYSEARCH("xfs_alloc_search_busy", - "found1", agno, bno, len, n, - tp); + "found1", agno, bno, len, tp); break; } } @@ -2610,15 +2606,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp, * transaction that freed the block */ if (cnt) { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp); + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp); lsn = bsy->busy_tp->t_commit_lsn; - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + spin_unlock(&mp->m_perag[agno].pagb_lock); xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); } else { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp); - n = -1; - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp); + spin_unlock(&mp->m_perag[agno].pagb_lock); } - - return n; } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 5a42561..5aec15d 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -136,7 +136,8 @@ int /* error */ xfs_alloc_get_freelist( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop); /* block address retrieved from freelist */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk); /* destination is a AGF btree */ /* * Log the given fields from the agf structure. @@ -165,7 +166,8 @@ xfs_alloc_put_freelist( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for a.g. freelist header */ struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno); /* block being freed */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk); /* owner was a AGF btree */ /* * Read in the allocation group header (free/alloc section). diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 7446556..3ce2645 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -92,6 +92,7 @@ xfs_alloc_delrec( xfs_alloc_key_t *rkp; /* right block key pointer */ xfs_alloc_ptr_t *rpp; /* right block address pointer */ int rrecs=0; /* number of records in right block */ + int numrecs; xfs_alloc_rec_t *rrp; /* right block record pointer */ xfs_btree_cur_t *tcur; /* temporary btree cursor */ @@ -115,7 +116,8 @@ xfs_alloc_delrec( /* * Fail if we're off the end of the block. */ - if (ptr > be16_to_cpu(block->bb_numrecs)) { + numrecs = be16_to_cpu(block->bb_numrecs); + if (ptr > numrecs) { *stat = 0; return 0; } @@ -129,18 +131,18 @@ xfs_alloc_delrec( lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur); lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur); #ifdef DEBUG - for (i = ptr; i < be16_to_cpu(block->bb_numrecs); i++) { + for (i = ptr; i < numrecs; i++) { if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) return error; } #endif - if (ptr < be16_to_cpu(block->bb_numrecs)) { + if (ptr < numrecs) { memmove(&lkp[ptr - 1], &lkp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lkp)); + (numrecs - ptr) * sizeof(*lkp)); memmove(&lpp[ptr - 1], &lpp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lpp)); - xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); - xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); + (numrecs - ptr) * sizeof(*lpp)); + xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1); + xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1); } } /* @@ -149,10 +151,10 @@ xfs_alloc_delrec( */ else { lrp = XFS_ALLOC_REC_ADDR(block, 1, cur); - if (ptr < be16_to_cpu(block->bb_numrecs)) { + if (ptr < numrecs) { memmove(&lrp[ptr - 1], &lrp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lrp)); - xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); + (numrecs - ptr) * sizeof(*lrp)); + xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1); } /* * If it's the first record in the block, we'll need a key @@ -167,7 +169,8 @@ xfs_alloc_delrec( /* * Decrement and log the number of entries in the block. */ - be16_add(&block->bb_numrecs, -1); + numrecs--; + block->bb_numrecs = cpu_to_be16(numrecs); xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); /* * See if the longest free extent in the allocation group was @@ -181,14 +184,14 @@ xfs_alloc_delrec( if (level == 0 && cur->bc_btnum == XFS_BTNUM_CNT && be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - ptr > be16_to_cpu(block->bb_numrecs)) { - ASSERT(ptr == be16_to_cpu(block->bb_numrecs) + 1); + ptr > numrecs) { + ASSERT(ptr == numrecs + 1); /* * There are still records in the block. Grab the size * from the last one. */ - if (be16_to_cpu(block->bb_numrecs)) { - rrp = XFS_ALLOC_REC_ADDR(block, be16_to_cpu(block->bb_numrecs), cur); + if (numrecs) { + rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur); agf->agf_longest = rrp->ar_blockcount; } /* @@ -211,20 +214,21 @@ xfs_alloc_delrec( * and it's NOT the leaf level, * then we can get rid of this level. */ - if (be16_to_cpu(block->bb_numrecs) == 1 && level > 0) { + if (numrecs == 1 && level > 0) { /* * lpp is still set to the first pointer in the block. * Make it the new root of the btree. */ bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]); agf->agf_roots[cur->bc_btnum] = *lpp; - be32_add(&agf->agf_levels[cur->bc_btnum], -1); + be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1); mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--; /* * Put this buffer/block on the ag's freelist. */ - if ((error = xfs_alloc_put_freelist(cur->bc_tp, - cur->bc_private.a.agbp, NULL, bno))) + error = xfs_alloc_put_freelist(cur->bc_tp, + cur->bc_private.a.agbp, NULL, bno, 1); + if (error) return error; /* * Since blocks move to the free list without the @@ -267,7 +271,7 @@ xfs_alloc_delrec( * If the number of records remaining in the block is at least * the minimum, we're done. */ - if (be16_to_cpu(block->bb_numrecs) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) return error; *stat = 1; @@ -419,19 +423,21 @@ xfs_alloc_delrec( * See if we can join with the left neighbor block. */ if (lbno != NULLAGBLOCK && - lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { /* * Set "right" to be the starting block, * "left" to be the left neighbor. */ rbno = bno; right = block; + rrecs = be16_to_cpu(right->bb_numrecs); rbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, lbno, 0, &lbp, XFS_ALLOC_BTREE_REF))) return error; left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + lrecs = be16_to_cpu(left->bb_numrecs); if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) return error; } @@ -439,20 +445,21 @@ xfs_alloc_delrec( * If that won't work, see if we can join with the right neighbor block. */ else if (rbno != NULLAGBLOCK && - rrecs + be16_to_cpu(block->bb_numrecs) <= - XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { /* * Set "left" to be the starting block, * "right" to be the right neighbor. */ lbno = bno; left = block; + lrecs = be16_to_cpu(left->bb_numrecs); lbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, rbno, 0, &rbp, XFS_ALLOC_BTREE_REF))) return error; right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + rrecs = be16_to_cpu(right->bb_numrecs); if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) return error; } @@ -474,34 +481,28 @@ xfs_alloc_delrec( /* * It's a non-leaf. Move keys and pointers. */ - lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); - lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); + lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur); + lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur); rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); #ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { + for (i = 0; i < rrecs; i++) { if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) return error; } #endif - memcpy(lkp, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*lkp)); - memcpy(lpp, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*lpp)); - xfs_alloc_log_keys(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); - xfs_alloc_log_ptrs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); + memcpy(lkp, rkp, rrecs * sizeof(*lkp)); + memcpy(lpp, rpp, rrecs * sizeof(*lpp)); + xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); } else { /* * It's a leaf. Move records. */ - lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); + lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur); rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memcpy(lrp, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*lrp)); - xfs_alloc_log_recs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); + memcpy(lrp, rrp, rrecs * sizeof(*lrp)); + xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); } /* * If we joined with the left neighbor, set the buffer in the @@ -509,7 +510,7 @@ xfs_alloc_delrec( */ if (bp != lbp) { xfs_btree_setbuf(cur, level, lbp); - cur->bc_ptrs[level] += be16_to_cpu(left->bb_numrecs); + cur->bc_ptrs[level] += lrecs; } /* * If we joined with the right neighbor and there's a level above @@ -521,7 +522,8 @@ xfs_alloc_delrec( /* * Fix up the number of records in the surviving block. */ - be16_add(&left->bb_numrecs, be16_to_cpu(right->bb_numrecs)); + lrecs += rrecs; + left->bb_numrecs = cpu_to_be16(lrecs); /* * Fix up the right block pointer in the surviving block, and log it. */ @@ -548,8 +550,9 @@ xfs_alloc_delrec( /* * Free the deleting block by putting it on the freelist. */ - if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp, - NULL, rbno))) + error = xfs_alloc_put_freelist(cur->bc_tp, + cur->bc_private.a.agbp, NULL, rbno, 1); + if (error) return error; /* * Since blocks move to the free list without the coordination @@ -608,6 +611,7 @@ xfs_alloc_insrec( xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ xfs_alloc_key_t nkey; /* new key value, from split */ xfs_alloc_rec_t nrec; /* new record value, for caller */ + int numrecs; int optr; /* old ptr value */ xfs_alloc_ptr_t *pp; /* pointer to btree addresses */ int ptr; /* index in btree block for this rec */ @@ -653,13 +657,14 @@ xfs_alloc_insrec( */ bp = cur->bc_bufs[level]; block = XFS_BUF_TO_ALLOC_BLOCK(bp); + numrecs = be16_to_cpu(block->bb_numrecs); #ifdef DEBUG if ((error = xfs_btree_check_sblock(cur, block, level, bp))) return error; /* * Check that the new entry is being inserted in the right place. */ - if (ptr <= be16_to_cpu(block->bb_numrecs)) { + if (ptr <= numrecs) { if (level == 0) { rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); xfs_btree_check_rec(cur->bc_btnum, recp, rp); @@ -670,12 +675,12 @@ xfs_alloc_insrec( } #endif nbno = NULLAGBLOCK; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; /* * If the block is full, we can't insert the new entry until we * make the block un-full. */ - if (be16_to_cpu(block->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { /* * First, try shifting an entry to the right neighbor. */ @@ -729,6 +734,7 @@ xfs_alloc_insrec( * At this point we know there's room for our new entry in the block * we're pointing at. */ + numrecs = be16_to_cpu(block->bb_numrecs); if (level > 0) { /* * It's a non-leaf entry. Make a hole for the new data @@ -737,15 +743,15 @@ xfs_alloc_insrec( kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); #ifdef DEBUG - for (i = be16_to_cpu(block->bb_numrecs); i >= ptr; i--) { + for (i = numrecs; i >= ptr; i--) { if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level))) return error; } #endif memmove(&kp[ptr], &kp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*kp)); + (numrecs - ptr + 1) * sizeof(*kp)); memmove(&pp[ptr], &pp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*pp)); + (numrecs - ptr + 1) * sizeof(*pp)); #ifdef DEBUG if ((error = xfs_btree_check_sptr(cur, *bnop, level))) return error; @@ -755,11 +761,12 @@ xfs_alloc_insrec( */ kp[ptr - 1] = key; pp[ptr - 1] = cpu_to_be32(*bnop); - be16_add(&block->bb_numrecs, 1); - xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); - xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); + numrecs++; + block->bb_numrecs = cpu_to_be16(numrecs); + xfs_alloc_log_keys(cur, bp, ptr, numrecs); + xfs_alloc_log_ptrs(cur, bp, ptr, numrecs); #ifdef DEBUG - if (ptr < be16_to_cpu(block->bb_numrecs)) + if (ptr < numrecs) xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, kp + ptr); #endif @@ -769,16 +776,17 @@ xfs_alloc_insrec( */ rp = XFS_ALLOC_REC_ADDR(block, 1, cur); memmove(&rp[ptr], &rp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*rp)); + (numrecs - ptr + 1) * sizeof(*rp)); /* * Now stuff the new record in, bump numrecs * and log the new data. */ - rp[ptr - 1] = *recp; /* INT_: struct copy */ - be16_add(&block->bb_numrecs, 1); - xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); + rp[ptr - 1] = *recp; + numrecs++; + block->bb_numrecs = cpu_to_be16(numrecs); + xfs_alloc_log_recs(cur, bp, ptr, numrecs); #ifdef DEBUG - if (ptr < be16_to_cpu(block->bb_numrecs)) + if (ptr < numrecs) xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, rp + ptr); #endif @@ -819,8 +827,8 @@ xfs_alloc_insrec( */ *bnop = nbno; if (nbno != NULLAGBLOCK) { - *recp = nrec; /* INT_: struct copy */ - *curp = ncur; /* INT_: struct copy */ + *recp = nrec; + *curp = ncur; } *stat = 1; return 0; @@ -981,7 +989,7 @@ xfs_alloc_lookup( */ bp = cur->bc_bufs[level]; if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; + bp = NULL; if (!bp) { /* * Need to get a new buffer. Read it, then @@ -1229,7 +1237,7 @@ xfs_alloc_lshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) return error; #endif - *lpp = *rpp; /* INT_: copy */ + *lpp = *rpp; xfs_alloc_log_ptrs(cur, lbp, nrec, nrec); xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); } @@ -1248,9 +1256,9 @@ xfs_alloc_lshift( /* * Bump and log left's numrecs, decrement and log right's numrecs. */ - be16_add(&left->bb_numrecs, 1); + be16_add_cpu(&left->bb_numrecs, 1); xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, -1); + be16_add_cpu(&right->bb_numrecs, -1); xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); /* * Slide the contents of right down one entry. @@ -1314,8 +1322,9 @@ xfs_alloc_newroot( /* * Get a buffer from the freelist blocks, for the new root. */ - if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, - &nbno))) + error = xfs_alloc_get_freelist(cur->bc_tp, + cur->bc_private.a.agbp, &nbno, 1); + if (error) return error; /* * None available, we fail. @@ -1337,7 +1346,7 @@ xfs_alloc_newroot( agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno); - be32_add(&agf->agf_levels[cur->bc_btnum], 1); + be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1); seqno = be32_to_cpu(agf->agf_seqno); mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++; xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, @@ -1406,8 +1415,8 @@ xfs_alloc_newroot( kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */ - kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */ + kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); + kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur); } else { xfs_alloc_rec_t *rp; /* btree record pointer */ @@ -1527,8 +1536,8 @@ xfs_alloc_rshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) return error; #endif - *rkp = *lkp; /* INT_: copy */ - *rpp = *lpp; /* INT_: copy */ + *rkp = *lkp; + *rpp = *lpp; xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); @@ -1549,9 +1558,9 @@ xfs_alloc_rshift( /* * Decrement and log left's numrecs, bump and log right's numrecs. */ - be16_add(&left->bb_numrecs, -1); + be16_add_cpu(&left->bb_numrecs, -1); xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, 1); + be16_add_cpu(&right->bb_numrecs, 1); xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); /* * Using a temporary cursor, update the parent key values of the @@ -1598,8 +1607,9 @@ xfs_alloc_split( * Allocate the new block from the freelist. * If we can't do it, we're toast. Give up. */ - if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, - &rbno))) + error = xfs_alloc_get_freelist(cur->bc_tp, + cur->bc_private.a.agbp, &rbno, 1); + if (error) return error; if (rbno == NULLAGBLOCK) { *stat = 0; @@ -1633,7 +1643,7 @@ xfs_alloc_split( */ if ((be16_to_cpu(left->bb_numrecs) & 1) && cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add(&right->bb_numrecs, 1); + be16_add_cpu(&right->bb_numrecs, 1); i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; /* * For non-leaf blocks, copy keys and addresses over to the new block. @@ -1679,7 +1689,7 @@ xfs_alloc_split( * Adjust numrecs, sibling pointers. */ lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp)); - be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); + be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); right->bb_rightsib = left->bb_rightsib; left->bb_rightsib = cpu_to_be32(rbno); right->bb_leftsib = cpu_to_be32(lbno); @@ -2044,7 +2054,7 @@ xfs_alloc_insert( nbno = NULLAGBLOCK; nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; pcur = cur; /* * Loop going up the tree, starting at the leaf level. @@ -2076,7 +2086,7 @@ xfs_alloc_insert( */ if (ncur) { pcur = ncur; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; } } while (nbno != NULLAGBLOCK); *stat = i; diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index bce81c7..5bd1a2c 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -58,7 +58,6 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t; /* * Real block structures have a size equal to the disk block size. */ -#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) #define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0]) #define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0]) @@ -87,16 +86,13 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t; * Record, key, and pointer address macros for btree blocks. */ #define XFS_ALLOC_REC_ADDR(bb,i,cur) \ - XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(0, cur)) + XFS_BTREE_REC_ADDR(xfs_alloc, bb, i) #define XFS_ALLOC_KEY_ADDR(bb,i,cur) \ - XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) + XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i) #define XFS_ALLOC_PTR_ADDR(bb,i,cur) \ - XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) + XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) /* * Decrement cursor by one record at the level. diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index c483689..0b3b5ef 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h @@ -92,16 +92,6 @@ ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ } -/* define generic INT_ macros */ - -#define INT_GET(reference,arch) \ - (((arch) == ARCH_NOCONVERT) \ - ? \ - (reference) \ - : \ - INT_SWAP((reference),(reference)) \ - ) - /* does not return a value */ #define INT_SET(reference,arch,valueref) \ (__builtin_constant_p(valueref) ? \ @@ -112,79 +102,6 @@ ) \ ) -/* does not return a value */ -#define INT_MOD_EXPR(reference,arch,code) \ - (((arch) == ARCH_NOCONVERT) \ - ? \ - (void)((reference) code) \ - : \ - (void)( \ - (reference) = INT_GET((reference),arch) , \ - ((reference) code), \ - INT_SET(reference, arch, reference) \ - ) \ - ) - -/* does not return a value */ -#define INT_MOD(reference,arch,delta) \ - (void)( \ - INT_MOD_EXPR(reference,arch,+=(delta)) \ - ) - -/* - * INT_COPY - copy a value between two locations with the - * _same architecture_ but _potentially different sizes_ - * - * if the types of the two parameters are equal or they are - * in native architecture, a simple copy is done - * - * otherwise, architecture conversions are done - * - */ - -/* does not return a value */ -#define INT_COPY(dst,src,arch) \ - ( \ - ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \ - ? \ - (void)((dst) = (src)) \ - : \ - INT_SET(dst, arch, INT_GET(src, arch)) \ - ) - -/* - * INT_XLATE - copy a value in either direction between two locations - * with different architectures - * - * dir < 0 - copy from memory to buffer (native to arch) - * dir > 0 - copy from buffer to memory (arch to native) - */ - -/* does not return a value */ -#define INT_XLATE(buf,mem,dir,arch) {\ - ASSERT(dir); \ - if (dir>0) { \ - (mem)=INT_GET(buf, arch); \ - } else { \ - INT_SET(buf, arch, mem); \ - } \ -} - -static inline void be16_add(__be16 *a, __s16 b) -{ - *a = cpu_to_be16(be16_to_cpu(*a) + b); -} - -static inline void be32_add(__be32 *a, __s32 b) -{ - *a = cpu_to_be32(be32_to_cpu(*a) + b); -} - -static inline void be64_add(__be64 *a, __s64 b) -{ - *a = cpu_to_be64(be64_to_cpu(*a) + b); -} - /* * In directories inode numbers are stored as unaligned arrays of unsigned * 8bit integers on disk. diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1a21010..f7cdc28 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -16,8 +16,6 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <linux/capability.h> - #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -49,6 +47,7 @@ #include "xfs_trans_space.h" #include "xfs_acl.h" #include "xfs_rw.h" +#include "xfs_vnodeops.h" /* * xfs_attr.c @@ -56,11 +55,6 @@ * Provide the external interfaces to manage attribute lists. */ -#define ATTR_SYSCOUNT 2 -STATIC struct attrnames posix_acl_access; -STATIC struct attrnames posix_acl_default; -STATIC struct attrnames *attr_system_names[ATTR_SYSCOUNT]; - /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -91,7 +85,6 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state); /* * Routines to manipulate out-of-line attribute values. */ -STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args); STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); @@ -101,29 +94,52 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); ktrace_t *xfs_attr_trace_buf; #endif +STATIC int +xfs_attr_name_to_xname( + struct xfs_name *xname, + const char *aname) +{ + if (!aname) + return EINVAL; + xname->name = aname; + xname->len = strlen(aname); + if (xname->len >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + + return 0; +} + +STATIC int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!XFS_IFORK_Q(ip) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return 0; + return 1; +} /*======================================================================== * Overall external interface routines. *========================================================================*/ int -xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, - char *value, int *valuelenp, int flags, struct cred *cred) +xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, + char *value, int *valuelenp, int flags) { xfs_da_args_t args; int error; - if ((XFS_IFORK_Q(ip) == 0) || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) - return(ENOATTR); + if (!xfs_inode_hasattr(ip)) + return ENOATTR; /* * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.value = value; args.valuelen = *valuelenp; args.flags = flags; @@ -134,11 +150,7 @@ xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, /* * Decide on what work routines to call based on the inode size. */ - if (XFS_IFORK_Q(ip) == 0 || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) { - error = XFS_ERROR(ENOATTR); - } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { error = xfs_attr_shortform_getvalue(&args); } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { error = xfs_attr_leaf_get(&args); @@ -157,41 +169,82 @@ xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, } int -xfs_attr_get(bhv_desc_t *bdp, const char *name, char *value, int *valuelenp, - int flags, struct cred *cred) +xfs_attr_get( + xfs_inode_t *ip, + const char *name, + char *value, + int *valuelenp, + int flags) { - xfs_inode_t *ip = XFS_BHVTOI(bdp); - int error, namelen; + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_get); - if (!name) - return(EINVAL); - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return(EFAULT); /* match IRIX behaviour */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return(EIO); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred); + error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); return(error); } +/* + * Calculate how many blocks we need for the new attribute, + */ +int +xfs_attr_calc_size( + struct xfs_inode *ip, + int namelen, + int valuelen, + int *local) +{ + struct xfs_mount *mp = ip->i_mount; + int size; + int nblks; + + /* + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). + */ + size = xfs_attr_leaf_newentsize(namelen, valuelen, + mp->m_sb.sb_blocksize, local); + + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (*local) { + if (size > (mp->m_sb.sb_blocksize >> 1)) { + /* Double split possible */ + nblks *= 2; + } + } else { + /* + * Out of line attribute, cannot double split, but + * make room for the attribute value itself. + */ + uint dblocks = XFS_B_TO_FSB(mp, valuelen); + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + return nblks; +} + STATIC int -xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, - char *value, int valuelen, int flags) +xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, + char *value, int valuelen, int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; xfs_bmap_free_t flist; int error, err2, committed; - int local, size; - uint nblks; xfs_mount_t *mp = dp->i_mount; int rsvd = (flags & ATTR_ROOT) != 0; + int local; /* * Attach the dquots to the inode. @@ -200,18 +253,14 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, return (error); /* - * Determine space new attribute will use, and if it would be - * "local" or "remote" (note: local != inline). - */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, - mp->m_sb.sb_blocksize, &local); - - /* * If the inode doesn't have an attribute fork, add one. * (inode must not be locked when we call this routine) */ if (XFS_IFORK_Q(dp) == 0) { - if ((error = xfs_bmap_add_attrfork(dp, size, rsvd))) + int sf_size = sizeof(xfs_attr_sf_hdr_t) + + XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); + + if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) return(error); } @@ -219,8 +268,8 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.value = value; args.valuelen = valuelen; args.flags = flags; @@ -229,26 +278,10 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, args.firstblock = &firstblock; args.flist = &flist; args.whichfork = XFS_ATTR_FORK; - args.addname = 1; - args.oknoent = 1; - - nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); - if (local) { - if (size > (mp->m_sb.sb_blocksize >> 1)) { - /* Double split possible */ - nblks <<= 1; - } - } else { - uint dblocks = XFS_B_TO_FSB(mp, valuelen); - /* Out of line attribute, cannot double split, but make - * room for the attribute value itself. - */ - nblks += dblocks; - nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); - } + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; /* Size is now blocks for attribute data */ - args.total = nblks; + args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); /* * Start our first transaction of the day. @@ -270,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, (uint) nblks, - XFS_ATTRSET_LOG_RES(mp, nblks), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ATTRSET_LOG_COUNT))) { + if ((error = xfs_trans_reserve(args.trans, args.total, + XFS_ATTRSET_LOG_RES(mp, args.total), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { xfs_trans_cancel(args.trans, 0); return(error); } xfs_ilock(dp, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0, - rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); @@ -326,8 +358,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, xfs_trans_set_sync(args.trans); } err2 = xfs_trans_commit(args.trans, - XFS_TRANS_RELEASE_LOG_RES, - NULL); + XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); /* @@ -347,7 +378,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, error = xfs_attr_shortform_to_leaf(&args); if (!error) { error = xfs_bmap_finish(&args.trans, args.flist, - *args.firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -369,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ - if ((error = xfs_attr_rolltrans(&args.trans, dp))) + + error = xfs_trans_roll(&args.trans, dp); + if (error) goto out; } @@ -395,8 +428,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, - NULL); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); /* @@ -417,23 +449,26 @@ out: } int -xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int flags, - struct cred *cred) +xfs_attr_set( + xfs_inode_t *dp, + const char *name, + char *value, + int valuelen, + int flags) { - xfs_inode_t *dp; - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_set); - dp = XFS_BHVTOI(bdp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); - return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + return xfs_attr_set_int(dp, &xname, value, valuelen, flags); } /* @@ -441,7 +476,7 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f * Transitions attribute list from Btree to shortform as necessary. */ STATIC int -xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) +xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; @@ -453,8 +488,8 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.flags = flags; args.hashval = xfs_da_hashname(args.name, args.namelen); args.dp = dp; @@ -509,9 +544,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) /* * Decide on what work routines to call based on the inode size. */ - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp)) { error = XFS_ERROR(ENOATTR); goto out; } @@ -542,8 +575,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, - NULL); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); /* @@ -564,31 +596,121 @@ out: } int -xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) +xfs_attr_remove( + xfs_inode_t *dp, + const char *name, + int flags) { - xfs_inode_t *dp; - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_remove); - dp = XFS_BHVTOI(bdp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + xfs_ilock(dp, XFS_ILOCK_SHARED); - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp)) { xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } xfs_iunlock(dp, XFS_ILOCK_SHARED); - return xfs_attr_remove_int(dp, name, namelen, flags); + return xfs_attr_remove_int(dp, &xname, flags); +} + +int +xfs_attr_list_int(xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + xfs_ilock(dp, XFS_ILOCK_SHARED); + xfs_attr_trace_l_c("syscall start", context); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_attr_trace_l_c("syscall end", context); + + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +/*ARGSUSED*/ +STATIC int +xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, + char *name, int namelen, + int valuelen, char *value) +{ + struct attrlist *alist = (struct attrlist *)context->alist; + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. + */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + xfs_attr_trace_l_c("buffer full", context); + alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&context->alist[context->firstu]; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + xfs_attr_trace_l_c("add", context); + return 0; } /* @@ -599,15 +721,17 @@ xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) * success. */ int -xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, - attrlist_cursor_kern_t *cursor, struct cred *cred) +xfs_attr_list( + xfs_inode_t *dp, + char *buffer, + int bufsize, + int flags, + attrlist_cursor_kern_t *cursor) { xfs_attr_list_context_t context; - xfs_inode_t *dp; + struct attrlist *alist; int error; - XFS_STATS_INC(xs_attr_list); - /* * Validate the cursor. */ @@ -615,72 +739,37 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, return(XFS_ERROR(EINVAL)); if ((cursor->initted == 0) && (cursor->hashval || cursor->blkno || cursor->offset)) - return(XFS_ERROR(EINVAL)); + return XFS_ERROR(EINVAL); /* * Check for a properly aligned buffer. */ if (((long)buffer) & (sizeof(int)-1)) - return(XFS_ERROR(EFAULT)); + return XFS_ERROR(EFAULT); if (flags & ATTR_KERNOVAL) bufsize = 0; /* * Initialize the output buffer. */ - context.dp = dp = XFS_BHVTOI(bdp); + memset(&context, 0, sizeof(context)); + context.dp = dp; context.cursor = cursor; - context.count = 0; - context.dupcnt = 0; context.resynch = 1; context.flags = flags; - if (!(flags & ATTR_KERNAMELS)) { - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.alist = (attrlist_t *)buffer; - context.alist->al_count = 0; - context.alist->al_more = 0; - context.alist->al_offset[0] = context.bufsize; - } - else { - context.bufsize = bufsize; - context.firstu = context.bufsize; - context.alist = (attrlist_t *)buffer; - } - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); - - xfs_ilock(dp, XFS_ILOCK_SHARED); - /* - * Decide on what work routines to call based on the inode size. - */ - xfs_attr_trace_l_c("syscall start", &context); - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - error = 0; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = xfs_attr_shortform_list(&context); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_list(&context); - } else { - error = xfs_attr_node_list(&context); - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - xfs_attr_trace_l_c("syscall end", &context); - - if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) { - ASSERT(error >= 0); - } - else { /* must return negated buffer size or the error */ - if (context.count < 0) - error = XFS_ERROR(ERANGE); - else - error = -context.count; - } - - return(error); + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; + + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list_int(&context); + ASSERT(error >= 0); + return error; } int /* error */ @@ -694,12 +783,10 @@ xfs_attr_inactive(xfs_inode_t *dp) ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, XFS_ILOCK_SHARED); - if ((XFS_IFORK_Q(dp) == 0) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(0); + return 0; } xfs_iunlock(dp, XFS_ILOCK_SHARED); @@ -732,10 +819,8 @@ xfs_attr_inactive(xfs_inode_t *dp) /* * Decide on what work routines to call based on the inode size. */ - if ((XFS_IFORK_Q(dp) == 0) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { error = 0; goto out; } @@ -759,8 +844,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES, - NULL); + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); return(error); @@ -822,7 +906,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * This leaf block cannot have a "remote" value, we only call this routine * if bmap_one_block() says there is only one block (ie: no remote blks). */ -int +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; @@ -853,7 +937,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_da_brelse(args->trans, bp); return(retval); } - args->rename = 1; /* an atomic rename */ + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; @@ -876,7 +960,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) error = xfs_attr_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -898,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the current trans (including the inode) and start * a new one. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); /* @@ -912,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the transaction that added the attr name so that * later routines can manage their own transactions. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); /* @@ -933,7 +1019,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * so that one disappears and one appears atomically. Then we * must remove the "old" attribute/value pair. */ - if (args->rename) { + if (args->op_flags & XFS_DA_OP_RENAME) { /* * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. @@ -977,7 +1063,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1002,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) /* * Commit the remove and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, dp); + error = xfs_trans_roll(&args->trans, dp); } else if (args->rmtblkno > 0) { /* @@ -1055,7 +1140,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1122,19 +1207,19 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) context->cursor->blkno = 0; error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); + return XFS_ERROR(error); ASSERT(bp != NULL); leaf = bp->data; if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) { XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); xfs_da_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); + return XFS_ERROR(EFSCORRUPTED); } - (void)xfs_attr_leaf_list_int(bp, context); + error = xfs_attr_leaf_list_int(bp, context); xfs_da_brelse(NULL, bp); - return(0); + return XFS_ERROR(error); } @@ -1187,7 +1272,7 @@ restart: } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) goto out; - args->rename = 1; /* atomic rename op */ + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; @@ -1210,7 +1295,6 @@ restart: if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1234,7 +1318,8 @@ restart: * Commit the node conversion and start the next * trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; goto restart; @@ -1250,7 +1335,7 @@ restart: error = xfs_da_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1285,7 +1370,8 @@ restart: * Commit the leaf addition or btree split and start the next * trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; /* @@ -1306,7 +1392,7 @@ restart: * so that one disappears and one appears atomically. Then we * must remove the "old" attribute/value pair. */ - if (args->rename) { + if (args->op_flags & XFS_DA_OP_RENAME) { /* * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. @@ -1362,7 +1448,6 @@ restart: if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1386,7 +1471,8 @@ restart: /* * Commit and start the next trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; } else if (args->rmtblkno > 0) { @@ -1497,7 +1583,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) error = xfs_da_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1518,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) /* * Commit the Btree join operation and start a new trans. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; } @@ -1549,7 +1636,6 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1858,8 +1944,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) return(XFS_ERROR(EFSCORRUPTED)); } error = xfs_attr_leaf_list_int(bp, context); - if (error || !leaf->hdr.info.forw) - break; /* not really an error, buffer full or EOF */ + if (error) { + xfs_da_brelse(NULL, bp); + return error; + } + if (context->seen_enough || leaf->hdr.info.forw == 0) + break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_da_brelse(NULL, bp); error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, @@ -1886,7 +1976,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) * Read the value associated with an attribute from the out-of-line buffer * that we stored it in. */ -STATIC int +int xfs_attr_rmtval_get(xfs_da_args_t *args) { xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; @@ -1989,7 +2079,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) args->flist, NULL); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -2016,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) /* * Start the next trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); } @@ -2145,7 +2236,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) NULL, &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -2166,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Close out trans and start the next one in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) + error = xfs_trans_roll(&args->trans, args->dp); + if (error) return (error); } return(0); @@ -2179,23 +2271,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context, (__psunsigned_t)NULL, (__psunsigned_t)NULL, (__psunsigned_t)NULL); @@ -2208,23 +2284,7 @@ void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, struct xfs_da_intnode *node) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context, (__psunsigned_t)be16_to_cpu(node->hdr.count), (__psunsigned_t)be32_to_cpu(node->btree[0].hashval), (__psunsigned_t)be32_to_cpu(node->btree[ @@ -2238,23 +2298,7 @@ void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, struct xfs_da_node_entry *btree) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context, (__psunsigned_t)be32_to_cpu(btree->hashval), (__psunsigned_t)be32_to_cpu(btree->before), (__psunsigned_t)NULL); @@ -2267,23 +2311,7 @@ void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, struct xfs_attr_leafblock *leaf) { - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context, (__psunsigned_t)be16_to_cpu(leaf->hdr.count), (__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval), (__psunsigned_t)be32_to_cpu(leaf->entries[ @@ -2296,337 +2324,24 @@ xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, */ void xfs_attr_trace_enter(int type, char *where, - __psunsigned_t a2, __psunsigned_t a3, - __psunsigned_t a4, __psunsigned_t a5, - __psunsigned_t a6, __psunsigned_t a7, - __psunsigned_t a8, __psunsigned_t a9, - __psunsigned_t a10, __psunsigned_t a11, - __psunsigned_t a12, __psunsigned_t a13, - __psunsigned_t a14, __psunsigned_t a15) + struct xfs_attr_list_context *context, + __psunsigned_t a13, __psunsigned_t a14, + __psunsigned_t a15) { ASSERT(xfs_attr_trace_buf); ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), - (void *)where, - (void *)a2, (void *)a3, (void *)a4, - (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10, - (void *)a11, (void *)a12, (void *)a13, - (void *)a14, (void *)a15); + (void *)((__psunsigned_t)where), + (void *)((__psunsigned_t)context->dp), + (void *)((__psunsigned_t)context->cursor->hashval), + (void *)((__psunsigned_t)context->cursor->blkno), + (void *)((__psunsigned_t)context->cursor->offset), + (void *)((__psunsigned_t)context->alist), + (void *)((__psunsigned_t)context->bufsize), + (void *)((__psunsigned_t)context->count), + (void *)((__psunsigned_t)context->firstu), + NULL, + (void *)((__psunsigned_t)context->dupcnt), + (void *)((__psunsigned_t)context->flags), + (void *)a13, (void *)a14, (void *)a15); } #endif /* XFS_ATTR_TRACE */ - - -/*======================================================================== - * System (pseudo) namespace attribute interface routines. - *========================================================================*/ - -STATIC int -posix_acl_access_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_exists( - bhv_vnode_t *vp) -{ - return xfs_acl_vhasacl_access(vp); -} - -STATIC int -posix_acl_default_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_exists( - bhv_vnode_t *vp) -{ - return xfs_acl_vhasacl_default(vp); -} - -STATIC struct attrnames posix_acl_access = { - .attr_name = "posix_acl_access", - .attr_namelen = sizeof("posix_acl_access") - 1, - .attr_get = posix_acl_access_get, - .attr_set = posix_acl_access_set, - .attr_remove = posix_acl_access_remove, - .attr_exists = posix_acl_access_exists, -}; - -STATIC struct attrnames posix_acl_default = { - .attr_name = "posix_acl_default", - .attr_namelen = sizeof("posix_acl_default") - 1, - .attr_get = posix_acl_default_get, - .attr_set = posix_acl_default_set, - .attr_remove = posix_acl_default_remove, - .attr_exists = posix_acl_default_exists, -}; - -STATIC struct attrnames *attr_system_names[] = - { &posix_acl_access, &posix_acl_default }; - - -/*======================================================================== - * Namespace-prefix-style attribute name interface routines. - *========================================================================*/ - -STATIC int -attr_generic_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return -bhv_vop_attr_set(vp, name, data, size, xflags, NULL); -} - -STATIC int -attr_generic_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - int error, asize = size; - - error = bhv_vop_attr_get(vp, name, data, &asize, xflags, NULL); - if (!error) - return asize; - return -error; -} - -STATIC int -attr_generic_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return -bhv_vop_attr_remove(vp, name, xflags, NULL); -} - -STATIC int -attr_generic_listadd( - attrnames_t *prefix, - attrnames_t *namesp, - void *data, - size_t size, - ssize_t *result) -{ - char *p = data + *result; - - *result += prefix->attr_namelen; - *result += namesp->attr_namelen + 1; - if (!size) - return 0; - if (*result > size) - return -ERANGE; - strcpy(p, prefix->attr_name); - p += prefix->attr_namelen; - strcpy(p, namesp->attr_name); - p += namesp->attr_namelen + 1; - return 0; -} - -STATIC int -attr_system_list( - bhv_vnode_t *vp, - void *data, - size_t size, - ssize_t *result) -{ - attrnames_t *namesp; - int i, error = 0; - - for (i = 0; i < ATTR_SYSCOUNT; i++) { - namesp = attr_system_names[i]; - if (!namesp->attr_exists || !namesp->attr_exists(vp)) - continue; - error = attr_generic_listadd(&attr_system, namesp, - data, size, result); - if (error) - break; - } - return error; -} - -int -attr_generic_list( - bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result) -{ - attrlist_cursor_kern_t cursor = { 0 }; - int error; - - error = bhv_vop_attr_list(vp, data, size, xflags, &cursor, NULL); - if (error > 0) - return -error; - *result = -error; - return attr_system_list(vp, data, size, result); -} - -attrnames_t * -attr_lookup_namespace( - char *name, - struct attrnames **names, - int nnames) -{ - int i; - - for (i = 0; i < nnames; i++) - if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen)) - return names[i]; - return NULL; -} - -/* - * Some checks to prevent people abusing EAs to get over quota: - * - Don't allow modifying user EAs on devices/symlinks; - * - Don't allow modifying user EAs if sticky bit set; - */ -STATIC int -attr_user_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - struct inode *inode = vn_to_inode(vp); - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - return 0; -} - -STATIC int -attr_trusted_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - struct inode *inode = vn_to_inode(vp); - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - return 0; -} - -STATIC int -attr_secure_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - return -ENOSECURITY; -} - -STATIC int -attr_system_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - attrnames_t *namesp; - int error; - - if (xflags & ATTR_CREATE) - return -EINVAL; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - error = namesp->attr_set(vp, name, data, size, xflags); - if (!error) - error = vn_revalidate(vp); - return error; -} - -STATIC int -attr_system_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - attrnames_t *namesp; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - return namesp->attr_get(vp, name, data, size, xflags); -} - -STATIC int -attr_system_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - attrnames_t *namesp; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - return namesp->attr_remove(vp, name, xflags); -} - -struct attrnames attr_system = { - .attr_name = "system.", - .attr_namelen = sizeof("system.") - 1, - .attr_flag = ATTR_SYSTEM, - .attr_get = attr_system_get, - .attr_set = attr_system_set, - .attr_remove = attr_system_remove, - .attr_capable = (attrcapable_t)fs_noerr, -}; - -struct attrnames attr_trusted = { - .attr_name = "trusted.", - .attr_namelen = sizeof("trusted.") - 1, - .attr_flag = ATTR_ROOT, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_trusted_capable, -}; - -struct attrnames attr_secure = { - .attr_name = "security.", - .attr_namelen = sizeof("security.") - 1, - .attr_flag = ATTR_SECURE, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_secure_capable, -}; - -struct attrnames attr_user = { - .attr_name = "user.", - .attr_namelen = sizeof("user.") - 1, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_user_capable, -}; - -struct attrnames *attr_namespaces[] = - { &attr_system, &attr_trusted, &attr_secure, &attr_user }; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 981633f..fb3b2a6 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -18,9 +18,11 @@ #ifndef __XFS_ATTR_H__ #define __XFS_ATTR_H__ +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + /* - * xfs_attr.h - * * Large attribute lists are structured around Btrees where all the data * elements are in the leaf nodes. Attribute names are hashed into an int, * then that int is used as the index into the Btree. Since the hashval @@ -35,35 +37,6 @@ * External interfaces *========================================================================*/ -struct cred; -struct bhv_vnode; - -typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int); -typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int); -typedef int (*attrremove_t)(struct bhv_vnode *, char *, int); -typedef int (*attrexists_t)(struct bhv_vnode *); -typedef int (*attrcapable_t)(struct bhv_vnode *, struct cred *); - -typedef struct attrnames { - char * attr_name; - unsigned int attr_namelen; - unsigned int attr_flag; - attrget_t attr_get; - attrset_t attr_set; - attrremove_t attr_remove; - attrexists_t attr_exists; - attrcapable_t attr_capable; -} attrnames_t; - -#define ATTR_NAMECOUNT 4 -extern struct attrnames attr_user; -extern struct attrnames attr_secure; -extern struct attrnames attr_system; -extern struct attrnames attr_trusted; -extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT]; - -extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int); -extern int attr_generic_list(struct bhv_vnode *, void *, size_t, int, ssize_t *); #define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ @@ -71,16 +44,9 @@ extern int attr_generic_list(struct bhv_vnode *, void *, size_t, int, ssize_t *) #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ #define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ #define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ -#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */ -#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */ #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ -#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */ - -#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */ -#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */ -#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS) /* * The maximum size (into the kernel or returned from the kernel) of an @@ -119,22 +85,6 @@ typedef struct attrlist_ent { /* data from attr_list() */ &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) /* - * Multi-attribute operation vector. - */ -typedef struct attr_multiop { - int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */ - int am_error; /* [out arg] result of this sub-op (an errno) */ - char *am_attrname; /* attribute name to work with */ - char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */ - int am_length; /* [in/out arg] length of value */ - int am_flags; /* bitwise OR of attr API flags defined above */ -} attr_multiop_t; - -#define ATTR_OP_GET 1 /* return the indicated attr's value */ -#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ -#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ - -/* * Kernel-internal version of the attrlist cursor. */ typedef struct attrlist_cursor_kern { @@ -148,25 +98,41 @@ typedef struct attrlist_cursor_kern { /*======================================================================== - * Function prototypes for the kernel. + * Structure used to pass context around among the routines. *========================================================================*/ -struct xfs_inode; -struct attrlist_cursor_kern; -struct xfs_da_args; + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, + char *, int, int, char *); + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ /* * Overall external interface routines. */ -int xfs_attr_get(bhv_desc_t *, const char *, char *, int *, int, struct cred *); -int xfs_attr_set(bhv_desc_t *, const char *, char *, int, int, struct cred *); -int xfs_attr_remove(bhv_desc_t *, const char *, int, struct cred *); -int xfs_attr_list(bhv_desc_t *, char *, int, int, - struct attrlist_cursor_kern *, struct cred *); +int xfs_attr_calc_size(struct xfs_inode *, int, int, int *); int xfs_attr_inactive(struct xfs_inode *dp); - -int xfs_attr_shortform_getvalue(struct xfs_da_args *); -int xfs_attr_fetch(struct xfs_inode *, const char *, int, - char *, int *, int, struct cred *); +int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_list_int(struct xfs_attr_list_context *); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 9455051..79da6b2 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -89,9 +89,20 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, int dst_start, int move_count, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context, - attrnames_t *, char *name, int namelen, - int valuelen); + +/*======================================================================== + * Namespace helper routines + *========================================================================*/ + +/* + * If namespace bits don't match return 0. + * If all match then return 1. + */ +STATIC_INLINE int +xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +{ + return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); +} /*======================================================================== @@ -113,6 +124,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int offset; int minforkoff; /* lower limit on valid forkoff locations */ int maxforkoff; /* upper limit on valid forkoff locations */ + int dsize; xfs_mount_t *mp = dp->i_mount; offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ @@ -128,12 +140,47 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { if (bytes <= XFS_IFORK_ASIZE(dp)) - return mp->m_attroffset >> 3; + return dp->i_d.di_forkoff; return 0; } - /* data fork btree root can have at least this many key/ptr pairs */ - minforkoff = MAX(dp->i_df.if_bytes, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + dsize = dp->i_df.if_bytes; + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* + * If there is no attr fork and the data fork is extents, + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for + * the btree root. + */ + if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset) + dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + break; + + case XFS_DINODE_FMT_BTREE: + /* + * If have data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have + * plenty of room for attrs + */ + if (dp->i_d.di_forkoff) { + if (offset < dp->i_d.di_forkoff) + return 0; + else + return dp->i_d.di_forkoff; + } else + dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + break; + } + + /* + * A data fork btree root must have space for at least + * MINDBTPTRS key/ptr pairs if the data fork is small or empty. + */ + minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ @@ -153,17 +200,15 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) STATIC void xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) { - unsigned long s; - if ((mp->m_flags & XFS_MOUNT_ATTR2) && - !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) { - s = XFS_SB_LOCK(mp); - if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { - XFS_SB_VERSION_ADDATTR2(&mp->m_sb); - XFS_SB_UNLOCK(mp, s); + !(xfs_sb_version_hasattr2(&mp->m_sb))) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr2(&mp->m_sb)) { + xfs_sb_version_addattr2(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); } else - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); } } @@ -228,11 +273,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; ASSERT(0); #endif @@ -246,12 +287,11 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; - sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : - ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); + sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->hdr.count++; - be16_add(&sf->hdr.totsize, size); + be16_add_cpu(&sf->hdr.totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); @@ -282,11 +322,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) continue; if (memcmp(sfe->nameval, args->name, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; break; } @@ -301,14 +337,16 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) if (end != totsize) memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); sf->hdr.count--; - be16_add(&sf->hdr.totsize, -size); + be16_add_cpu(&sf->hdr.totsize, -size); /* * Fix up the start offset of the attribute fork */ totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && - (mp->m_flags & XFS_MOUNT_ATTR2)) { + if (totsize == sizeof(xfs_attr_sf_hdr_t) && + !(args->op_flags & XFS_DA_OP_ADDNAME) && + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { /* * Last attribute now removed, revert to original * inode format making all literal area available @@ -326,8 +364,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_d.di_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || - !(mp->m_flags & XFS_MOUNT_ATTR2)); + ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + (args->op_flags & XFS_DA_OP_ADDNAME) || + !(mp->m_flags & XFS_MOUNT_ATTR2) || + dp->i_d.di_format == XFS_DINODE_FMT_BTREE); dp->i_afp->if_ext_max = XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); dp->i_df.if_ext_max = @@ -363,11 +403,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; return(XFS_ERROR(EEXIST)); } @@ -394,11 +430,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; if (args->flags & ATTR_KERNOVAL) { args->valuelen = sfe->valuelen; @@ -475,7 +507,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; - nargs.oknoent = 1; + nargs.op_flags = XFS_DA_OP_OKNOENT; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; i++) { @@ -485,8 +517,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.valuelen = sfe->valuelen; nargs.hashval = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); - nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : - ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); error = xfs_attr_leaf_add(bp, &nargs); @@ -500,7 +531,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) out: if(bp) xfs_da_buf_done(bp); - kmem_free(tmpbuffer, size); + kmem_free(tmpbuffer); return(error); } @@ -520,6 +551,10 @@ xfs_attr_shortform_compare(const void *a, const void *b) } } + +#define XFS_ISRESET_CURSOR(cursor) \ + (!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) /* * Copy out entries of shortform attribute lists for attr_list(). * Shortform attribute lists are not stored in hashval sorted order. @@ -537,6 +572,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; + int error; ASSERT(context != NULL); dp = context->dp; @@ -552,46 +588,44 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_trace_l_c("sf start", context); /* - * If the buffer is large enough, do not bother with sorting. + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. */ - if ((dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize) { + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - attrnames_t *namesp; + error = context->put_listent(context, + sfe->flags, + (char *)sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + (char*)&sfe->nameval[sfe->namelen]); - if (((context->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - if (((context->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure: - ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - sfe->namelen + 1; - } - else { - if (xfs_attr_put_listent(context, namesp, - (char *)sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen)) - break; - } + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } xfs_attr_trace_l_c("sf big-gulp", context); return(0); } + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + /* * It didn't all fit, so we have to sort everything on hashval. */ @@ -611,21 +645,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe); xfs_attr_trace_l_c("sf corrupted", context); - kmem_free(sbuf, sbsize); + kmem_free(sbuf); return XFS_ERROR(EFSCORRUPTED); } - if (((context->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - if (((context->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } + sbp->entno = i; sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); sbp->name = (char *)sfe->nameval; @@ -660,7 +683,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } } if (i == nsbuf) { - kmem_free(sbuf, sbsize); + kmem_free(sbuf); xfs_attr_trace_l_c("blk end", context); return(0); } @@ -669,30 +692,24 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * Loop putting entries into the user buffer. */ for ( ; i < nsbuf; i++, sbp++) { - attrnames_t *namesp; - - namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure : - ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); - if (cursor->hashval != sbp->hash) { cursor->hashval = sbp->hash; cursor->offset = 0; } - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - sbp->namelen + 1; - } else { - if (xfs_attr_put_listent(context, namesp, - sbp->name, sbp->namelen, - sbp->valuelen)) - break; - } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; cursor->offset++; } - kmem_free(sbuf, sbsize); + kmem_free(sbuf); xfs_attr_trace_l_c("sf E-O-F", context); return(0); } @@ -729,6 +746,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return(-1); return(xfs_attr_shortform_bytesfit(dp, bytes)); @@ -767,6 +785,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); + ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); /* * Last attribute was removed, revert to original @@ -796,7 +815,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; - nargs.oknoent = 1; + nargs.op_flags = XFS_DA_OP_OKNOENT; entry = &leaf->entries[0]; for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) @@ -810,14 +829,13 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) nargs.value = (char *)&name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : - ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); xfs_attr_shortform_add(&nargs, forkoff); } error = 0; out: - kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); + kmem_free(tmpbuffer); return(error); } @@ -1077,7 +1095,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) xfs_da_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } - be16_add(&hdr->count, 1); + be16_add_cpu(&hdr->count, 1); /* * Allocate space for the new string (at the end of the run). @@ -1091,16 +1109,15 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) mp->m_sb.sb_blocksize, NULL)); ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); ASSERT((be16_to_cpu(map->size) & 0x3) == 0); - be16_add(&map->size, + be16_add_cpu(&map->size, -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, mp->m_sb.sb_blocksize, &tmp)); entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + be16_to_cpu(map->size)); entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : - ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); - if (args->rename) { + entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + if (args->op_flags & XFS_DA_OP_RENAME) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { @@ -1159,12 +1176,12 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) map = &hdr->freemap[0]; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { if (be16_to_cpu(map->base) == tmp) { - be16_add(&map->base, sizeof(xfs_attr_leaf_entry_t)); - be16_add(&map->size, + be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t)); + be16_add_cpu(&map->size, -((int)sizeof(xfs_attr_leaf_entry_t))); } } - be16_add(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); + be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); xfs_da_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); return(0); @@ -1216,7 +1233,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) be16_to_cpu(hdr_s->count), mp); xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); - kmem_free(tmpbuffer, XFS_LBSIZE(mp)); + kmem_free(tmpbuffer); } /* @@ -1672,9 +1689,9 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); if (be16_to_cpu(map->base) == tablesize) { - be16_add(&map->base, + be16_add_cpu(&map->base, -((int)sizeof(xfs_attr_leaf_entry_t))); - be16_add(&map->size, sizeof(xfs_attr_leaf_entry_t)); + be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t)); } if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) @@ -1696,19 +1713,19 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) if ((before >= 0) || (after >= 0)) { if ((before >= 0) && (after >= 0)) { map = &hdr->freemap[before]; - be16_add(&map->size, entsize); - be16_add(&map->size, + be16_add_cpu(&map->size, entsize); + be16_add_cpu(&map->size, be16_to_cpu(hdr->freemap[after].size)); hdr->freemap[after].base = 0; hdr->freemap[after].size = 0; } else if (before >= 0) { map = &hdr->freemap[before]; - be16_add(&map->size, entsize); + be16_add_cpu(&map->size, entsize); } else { map = &hdr->freemap[after]; /* both on-disk, don't endian flip twice */ map->base = entry->nameidx; - be16_add(&map->size, entsize); + be16_add_cpu(&map->size, entsize); } } else { /* @@ -1733,7 +1750,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) * Compress the remaining entries and zero out the removed stuff. */ memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize); - be16_add(&hdr->usedbytes, -entsize); + be16_add_cpu(&hdr->usedbytes, -entsize); xfs_da_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), entsize)); @@ -1741,7 +1758,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) tmp = (be16_to_cpu(hdr->count) - args->index) * sizeof(xfs_attr_leaf_entry_t); memmove((char *)entry, (char *)(entry+1), tmp); - be16_add(&hdr->count, -1); + be16_add_cpu(&hdr->count, -1); xfs_da_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); entry = &leaf->entries[be16_to_cpu(hdr->count)]; @@ -1866,7 +1883,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, be16_to_cpu(drop_hdr->count), mp); } memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); - kmem_free(tmpbuffer, state->blocksize); + kmem_free(tmpbuffer); } xfs_da_log_buf(state->args->trans, save_blk->bp, 0, @@ -1926,7 +1943,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) else break; } - ASSERT((probe >= 0) && + ASSERT((probe >= 0) && (!leaf->hdr.count || (probe < be16_to_cpu(leaf->hdr.count)))); ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); @@ -1971,14 +1988,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe); if (name_loc->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_loc->nameval, - args->namelen) != 0) - continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0)) + if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; return(XFS_ERROR(EEXIST)); @@ -1989,11 +2001,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) if (memcmp(args->name, (char *)name_rmt->name, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; args->rmtblkno = be32_to_cpu(name_rmt->valueblk); @@ -2136,15 +2144,15 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, */ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); - be16_add(&hdr_s->usedbytes, -tmp); - be16_add(&hdr_s->count, -1); + be16_add_cpu(&hdr_s->usedbytes, -tmp); + be16_add_cpu(&hdr_s->count, -1); entry_d--; /* to compensate for ++ in loop hdr */ desti--; if ((start_s + i) < offset) result++; /* insertion index adjustment */ } else { #endif /* GROT */ - be16_add(&hdr_d->firstused, -tmp); + be16_add_cpu(&hdr_d->firstused, -tmp); /* both on-disk, don't endian flip twice */ entry_d->hashval = entry_s->hashval; /* both on-disk, don't endian flip twice */ @@ -2157,10 +2165,10 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, ASSERT(be16_to_cpu(entry_s->nameidx) + tmp <= XFS_LBSIZE(mp)); memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); - be16_add(&hdr_s->usedbytes, -tmp); - be16_add(&hdr_d->usedbytes, tmp); - be16_add(&hdr_s->count, -1); - be16_add(&hdr_d->count, 1); + be16_add_cpu(&hdr_s->usedbytes, -tmp); + be16_add_cpu(&hdr_d->usedbytes, tmp); + be16_add_cpu(&hdr_s->count, -1); + be16_add_cpu(&hdr_d->count, 1); tmp = be16_to_cpu(hdr_d->count) * sizeof(xfs_attr_leaf_entry_t) + sizeof(xfs_attr_leaf_hdr_t); @@ -2201,7 +2209,7 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * Fill in the freemap information */ hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - be16_add(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * + be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * sizeof(xfs_attr_leaf_entry_t)); hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - be16_to_cpu(hdr_d->freemap[0].base)); @@ -2312,8 +2320,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; int retval, i; ASSERT(bp != NULL); @@ -2355,10 +2361,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) * We have found our place, start copying out the new attributes. */ retval = 0; - for ( ; (i < be16_to_cpu(leaf->hdr.count)) - && (retval == 0); entry++, i++) { - attrnames_t *namesp; - + for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -2366,115 +2369,64 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* skip incomplete entries */ - if (((context->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) - continue; /* skip non-matching entries */ - if (((context->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) - continue; /* skip non-matching entries */ - - namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure : - ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - (int)name_loc->namelen + 1; - } else { - retval = xfs_attr_put_listent(context, namesp, - (char *)name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen)); - } + xfs_attr_leaf_name_local_t *name_loc = + XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + + retval = context->put_listent(context, + entry->flags, + (char *)name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + (char *)&name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - (int)name_rmt->namelen + 1; + xfs_attr_leaf_name_remote_t *name_rmt = + XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + entry->flags, + (char *)name_rmt->name, + (int)name_rmt->namelen, + valuelen, + (char*)args.value); + kmem_free(args.value); } else { - retval = xfs_attr_put_listent(context, namesp, - (char *)name_rmt->name, - (int)name_rmt->namelen, - be32_to_cpu(name_rmt->valuelen)); + retval = context->put_listent(context, + entry->flags, + (char *)name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); } + if (retval) + return retval; } - if (retval == 0) { - cursor->offset++; - } + if (context->seen_enough) + break; + cursor->offset++; } xfs_attr_trace_l_cl("blk end", context, leaf); return(retval); } -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ - & ~(sizeof(u_int32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -/*ARGSUSED*/ -STATIC int -xfs_attr_put_listent(xfs_attr_list_context_t *context, - attrnames_t *namesp, char *name, int namelen, int valuelen) -{ - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!(context->flags & ATTR_KERNOVAL)); - if (context->flags & ATTR_KERNAMELS) { - char *offset; - - ASSERT(context->count >= 0); - - arraytop = context->count + namesp->attr_namelen + namelen + 1; - if (arraytop > context->firstu) { - context->count = -1; /* insufficient space */ - return(1); - } - offset = (char *)context->alist + context->count; - strncpy(offset, namesp->attr_name, namesp->attr_namelen); - offset += namesp->attr_namelen; - strncpy(offset, name, namelen); /* real name */ - offset += namelen; - *offset = '\0'; - context->count += namesp->attr_namelen + namelen + 1; - return(0); - } - - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*context->alist)); - ASSERT(context->firstu <= context->bufsize); - - arraytop = sizeof(*context->alist) + - context->count * sizeof(context->alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - xfs_attr_trace_l_c("buffer full", context); - context->alist->al_more = 1; - return(1); - } - - aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]); - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[ namelen ] = 0; - context->alist->al_offset[ context->count++ ] = context->firstu; - context->alist->al_count = context->count; - xfs_attr_trace_l_c("add", context); - return(0); -} /*======================================================================== * Manage the INCOMPLETE flag in a leaf entry @@ -2546,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); - - return(error); + return xfs_trans_roll(&args->trans, args->dp); } /* @@ -2595,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); - - return(error); + return xfs_trans_roll(&args->trans, args->dp); } /* @@ -2713,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); + error = xfs_trans_roll(&args->trans, args->dp); return(error); } @@ -2771,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) /* * Commit the invalidate and start the next transaction. */ - error = xfs_attr_rolltrans(trans, dp); + error = xfs_trans_roll(trans, dp); return (error); } @@ -2873,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, /* * Atomically commit the whole invalidate stuff. */ - if ((error = xfs_attr_rolltrans(trans, dp))) + error = xfs_trans_roll(trans, dp); + if (error) return (error); } @@ -2957,7 +2906,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) error = tmp; /* save only the 1st errno */ } - kmem_free((xfs_caddr_t)list, size); + kmem_free((xfs_caddr_t)list); return(error); } @@ -3012,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, /* * Roll to next transaction. */ - if ((error = xfs_attr_rolltrans(trans, dp))) + error = xfs_trans_roll(trans, dp); + if (error) return (error); } @@ -3022,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, return(0); } - - -/* - * Roll from one trans in the sequence of PERMANENT transactions to the next. - */ -int -xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp) -{ - xfs_trans_t *trans; - unsigned int logres, count; - int error; - - /* - * Ensure that the inode is always logged. - */ - trans = *transp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - - /* - * Copy the critical parameters from one trans to the next. - */ - logres = trans->t_log_res; - count = trans->t_log_count; - *transp = xfs_trans_dup(trans); - - /* - * Commit the current transaction. - * If this commit failed, then it'd just unlock those items that - * are not marked ihold. That also means that a filesystem shutdown - * is in progress. The caller takes the responsibility to cancel - * the duplicate transaction that gets returned. - */ - if ((error = xfs_trans_commit(trans, 0, NULL))) - return (error); - - trans = *transp; - - /* - * Reserve space in the log for th next transaction. - * This also pushes items in the "AIL", the list of logged items, - * out to disk if they are taking up space at the tail of the log - * that we want to use. This requires that either nothing be locked - * across this call, or that anything that is locked be logged in - * the prior and the next transactions. - */ - error = xfs_trans_reserve(trans, 0, logres, 0, - XFS_TRANS_PERM_LOG_RES, count); - /* - * Ensure that the inode is in the new transaction and locked. - */ - if (!error) { - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); - } - return (error); - -} diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 51c3ee1..83e9af4 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -30,7 +30,7 @@ struct attrlist; struct attrlist_cursor_kern; -struct attrnames; +struct xfs_attr_list_context; struct xfs_dabuf; struct xfs_da_args; struct xfs_da_state; @@ -130,6 +130,19 @@ typedef struct xfs_attr_leafblock { #define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) /* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. + */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* * Alignment for namelist and valuelist entries (since they are mixed * there can be only one alignment value) */ @@ -191,23 +204,6 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize) return (((bsize) >> 1) + ((bsize) >> 2)); } - -/*======================================================================== - * Structure used to pass context around among the routines. - *========================================================================*/ - -typedef struct xfs_attr_list_context { - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor;/* position in list */ - struct attrlist *alist; /* output buffer */ - int count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize;/* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch;/* T/F: resynch with cursor */ -} xfs_attr_list_context_t; - /* * Used to keep a list of "remote value" extents when unlinking an inode. */ @@ -278,6 +274,4 @@ int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, struct xfs_dabuf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); -int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp); - #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h index f67f917..ea22839 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/xfs_attr_sf.h @@ -97,13 +97,9 @@ void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, struct xfs_attr_leafblock *leaf); void xfs_attr_trace_enter(int type, char *where, - __psunsigned_t a2, __psunsigned_t a3, - __psunsigned_t a4, __psunsigned_t a5, - __psunsigned_t a6, __psunsigned_t a7, - __psunsigned_t a8, __psunsigned_t a9, - __psunsigned_t a10, __psunsigned_t a11, - __psunsigned_t a12, __psunsigned_t a13, - __psunsigned_t a14, __psunsigned_t a15); + struct xfs_attr_list_context *context, + __psunsigned_t a13, __psunsigned_t a14, + __psunsigned_t a15); #else #define xfs_attr_trace_l_c(w,c) #define xfs_attr_trace_l_cn(w,c,n) diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c deleted file mode 100644 index f4fe371..0000000 --- a/fs/xfs/xfs_behavior.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" - -/* - * Source file used to associate/disassociate behaviors with virtualized - * objects. See xfs_behavior.h for more information about behaviors, etc. - * - * The implementation is split between functions in this file and macros - * in xfs_behavior.h. - */ - -/* - * Insert a new behavior descriptor into a behavior chain. - * - * The behavior chain is ordered based on the 'position' number which - * lives in the first field of the ops vector (higher numbers first). - * - * Attempts to insert duplicate ops result in an EINVAL return code. - * Otherwise, return 0 to indicate success. - */ -int -bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp) -{ - bhv_desc_t *curdesc, *prev; - int position; - - /* - * Validate the position value of the new behavior. - */ - position = BHV_POSITION(bdp); - ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP); - - /* - * Find location to insert behavior. Check for duplicates. - */ - prev = NULL; - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - /* Check for duplication. */ - if (curdesc->bd_ops == bdp->bd_ops) { - ASSERT(0); - return EINVAL; - } - - /* Find correct position */ - if (position >= BHV_POSITION(curdesc)) { - ASSERT(position != BHV_POSITION(curdesc)); - break; /* found it */ - } - - prev = curdesc; - } - - if (prev == NULL) { - /* insert at front of chain */ - bdp->bd_next = bhp->bh_first; - bhp->bh_first = bdp; - } else { - /* insert after prev */ - bdp->bd_next = prev->bd_next; - prev->bd_next = bdp; - } - - return 0; -} - -/* - * Remove a behavior descriptor from a position in a behavior chain; - * the position is guaranteed not to be the first position. - * Should only be called by the bhv_remove() macro. - */ -void -bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp) -{ - bhv_desc_t *curdesc, *prev; - - ASSERT(bhp->bh_first != NULL); - ASSERT(bhp->bh_first->bd_next != NULL); - - prev = bhp->bh_first; - for (curdesc = bhp->bh_first->bd_next; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc == bdp) - break; /* found it */ - prev = curdesc; - } - - ASSERT(curdesc == bdp); - prev->bd_next = bdp->bd_next; /* remove from after prev */ -} - -/* - * Look for a specific ops vector on the specified behavior chain. - * Return the associated behavior descriptor. Or NULL, if not found. - */ -bhv_desc_t * -bhv_lookup(bhv_head_t *bhp, void *ops) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc->bd_ops == ops) - return curdesc; - } - - return NULL; -} - -/* - * Looks for the first behavior within a specified range of positions. - * Return the associated behavior descriptor. Or NULL, if none found. - */ -bhv_desc_t * -bhv_lookup_range(bhv_head_t *bhp, int low, int high) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - int position = BHV_POSITION(curdesc); - - if (position <= high) { - if (position >= low) - return curdesc; - return NULL; - } - } - - return NULL; -} - -/* - * Return the base behavior in the chain, or NULL if the chain - * is empty. - * - * The caller has not read locked the behavior chain, so acquire the - * lock before traversing the chain. - */ -bhv_desc_t * -bhv_base(bhv_head_t *bhp) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc->bd_next == NULL) { - return curdesc; - } - } - - return NULL; -} - -void -bhv_head_init( - bhv_head_t *bhp, - char *name) -{ - bhp->bh_first = NULL; -} - -void -bhv_insert_initial( - bhv_head_t *bhp, - bhv_desc_t *bdp) -{ - ASSERT(bhp->bh_first == NULL); - (bhp)->bh_first = bdp; -} - -void -bhv_head_destroy( - bhv_head_t *bhp) -{ - ASSERT(bhp->bh_first == NULL); -} diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h deleted file mode 100644 index 6e6e56f..0000000 --- a/fs/xfs/xfs_behavior.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_BEHAVIOR_H__ -#define __XFS_BEHAVIOR_H__ - -/* - * Header file used to associate behaviors with virtualized objects. - * - * A virtualized object is an internal, virtualized representation of - * OS entities such as persistent files, processes, or sockets. Examples - * of virtualized objects include vnodes, vprocs, and vsockets. Often - * a virtualized object is referred to simply as an "object." - * - * A behavior is essentially an implementation layer associated with - * an object. Multiple behaviors for an object are chained together, - * the order of chaining determining the order of invocation. Each - * behavior of a given object implements the same set of interfaces - * (e.g., the VOP interfaces). - * - * Behaviors may be dynamically inserted into an object's behavior chain, - * such that the addition is transparent to consumers that already have - * references to the object. Typically, a given behavior will be inserted - * at a particular location in the behavior chain. Insertion of new - * behaviors is synchronized with operations-in-progress (oip's) so that - * the oip's always see a consistent view of the chain. - * - * The term "interposition" is used to refer to the act of inserting - * a behavior such that it interposes on (i.e., is inserted in front - * of) a particular other behavior. A key example of this is when a - * system implementing distributed single system image wishes to - * interpose a distribution layer (providing distributed coherency) - * in front of an object that is otherwise only accessed locally. - * - * Note that the traditional vnode/inode combination is simply a virtualized - * object that has exactly one associated behavior. - * - * Behavior synchronization is logic which is necessary under certain - * circumstances that there is no conflict between ongoing operations - * traversing the behavior chain and those dynamically modifying the - * behavior chain. Because behavior synchronization adds extra overhead - * to virtual operation invocation, we want to restrict, as much as - * we can, the requirement for this extra code, to those situations - * in which it is truly necessary. - * - * Behavior synchronization is needed whenever there's at least one class - * of object in the system for which: - * 1) multiple behaviors for a given object are supported, - * -- AND -- - * 2a) insertion of a new behavior can happen dynamically at any time during - * the life of an active object, - * -- AND -- - * 3a) insertion of a new behavior needs to synchronize with existing - * ops-in-progress. - * -- OR -- - * 3b) multiple different behaviors can be dynamically inserted at - * any time during the life of an active object - * -- OR -- - * 3c) removal of a behavior can occur at any time during the life of - * an active object. - * -- OR -- - * 2b) removal of a behavior can occur at any time during the life of an - * active object - * - */ - -/* - * Behavior head. Head of the chain of behaviors. - * Contained within each virtualized object data structure. - */ -typedef struct bhv_head { - struct bhv_desc *bh_first; /* first behavior in chain */ -} bhv_head_t; - -/* - * Behavior descriptor. Descriptor associated with each behavior. - * Contained within the behavior's private data structure. - */ -typedef struct bhv_desc { - void *bd_pdata; /* private data for this behavior */ - void *bd_vobj; /* virtual object associated with */ - void *bd_ops; /* ops for this behavior */ - struct bhv_desc *bd_next; /* next behavior in chain */ -} bhv_desc_t; - -/* - * Behavior identity field. A behavior's identity determines the position - * where it lives within a behavior chain, and it's always the first field - * of the behavior's ops vector. The optional id field further identifies the - * subsystem responsible for the behavior. - */ -typedef struct bhv_identity { - __u16 bi_id; /* owning subsystem id */ - __u16 bi_position; /* position in chain */ -} bhv_identity_t; - -typedef bhv_identity_t bhv_position_t; - -#define BHV_IDENTITY_INIT(id,pos) {id, pos} -#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos) - -/* - * Define boundaries of position values. - */ -#define BHV_POSITION_INVALID 0 /* invalid position number */ -#define BHV_POSITION_BASE 1 /* base (last) implementation layer */ -#define BHV_POSITION_TOP 63 /* top (first) implementation layer */ - -/* - * Plumbing macros. - */ -#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first) -#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next) -#define BHV_NEXTNULL(bdp) ((bdp)->bd_next) -#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj) -#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj) -#define BHV_PDATA(bdp) (bdp)->bd_pdata -#define BHV_OPS(bdp) (bdp)->bd_ops -#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops) -#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position) - -extern void bhv_head_init(bhv_head_t *, char *); -extern void bhv_head_destroy(bhv_head_t *); -extern int bhv_insert(bhv_head_t *, bhv_desc_t *); -extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *); - -/* - * Initialize a new behavior descriptor. - * Arguments: - * bdp - pointer to behavior descriptor - * pdata - pointer to behavior's private data - * vobj - pointer to associated virtual object - * ops - pointer to ops for this behavior - */ -#define bhv_desc_init(bdp, pdata, vobj, ops) \ - { \ - (bdp)->bd_pdata = pdata; \ - (bdp)->bd_vobj = vobj; \ - (bdp)->bd_ops = ops; \ - (bdp)->bd_next = NULL; \ - } - -/* - * Remove a behavior descriptor from a behavior chain. - */ -#define bhv_remove(bhp, bdp) \ - { \ - if ((bhp)->bh_first == (bdp)) { \ - /* \ - * Remove from front of chain. \ - * Atomic wrt oip's. \ - */ \ - (bhp)->bh_first = (bdp)->bd_next; \ - } else { \ - /* remove from non-front of chain */ \ - bhv_remove_not_first(bhp, bdp); \ - } \ - (bdp)->bd_vobj = NULL; \ - } - -/* - * Behavior module prototypes. - */ -extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); -extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops); -extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high); -extern bhv_desc_t * bhv_base(bhv_head_t *bhp); - -/* No bhv locking on Linux */ -#define bhv_lookup_unlocked bhv_lookup -#define bhv_base_unlocked bhv_base - -#endif /* __XFS_BEHAVIOR_H__ */ diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index 43be6a7..4822884 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -25,198 +25,22 @@ * XFS bit manipulation routines, used in non-realtime code. */ -#ifndef HAVE_ARCH_HIGHBIT /* - * Index of high bit number in byte, -1 for none set, 0..7 otherwise. - */ -STATIC const char xfs_highbit[256] = { - -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */ - 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */ - 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */ - 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */ - 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */ - 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */ - 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */ - 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */ - 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */ -}; -#endif - -/* - * Count of bits set in byte, 0..8. - */ -static const char xfs_countbit[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */ - 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */ - 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */ - 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */ - 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */ - 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */ - 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */ -}; - -/* - * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set. - */ -inline int -xfs_highbit32( - __uint32_t v) -{ -#ifdef HAVE_ARCH_HIGHBIT - return highbit32(v); -#else - int i; - - if (v & 0xffff0000) - if (v & 0xff000000) - i = 24; - else - i = 16; - else if (v & 0x0000ffff) - if (v & 0x0000ff00) - i = 8; - else - i = 0; - else - return -1; - return i + xfs_highbit[(v >> i) & 0xff]; -#endif -} - -/* - * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set. - */ -int -xfs_lowbit64( - __uint64_t v) -{ - __uint32_t w = (__uint32_t)v; - int n = 0; - - if (w) { /* lower bits */ - n = ffs(w); - } else { /* upper bits */ - w = (__uint32_t)(v >> 32); - if (w && (n = ffs(w))) - n += 32; - } - return n - 1; -} - -/* - * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set. - */ -int -xfs_highbit64( - __uint64_t v) -{ - __uint32_t h = (__uint32_t)(v >> 32); - - if (h) - return xfs_highbit32(h) + 32; - return xfs_highbit32((__uint32_t)v); -} - - -/* - * Count the number of bits set in the bitmap starting with bit - * start_bit. Size is the size of the bitmap in words. - * - * Do the counting by mapping a byte value to the number of set - * bits for that value using the xfs_countbit array, i.e. - * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1, - * xfs_countbit[3] == 2, etc. + * Return whether bitmap is empty. + * Size is number of words in the bitmap, which is padded to word boundary + * Returns 1 for empty, 0 for non-empty. */ int -xfs_count_bits(uint *map, uint size, uint start_bit) +xfs_bitmap_empty(uint *map, uint size) { - register int bits; - register unsigned char *bytep; - register unsigned char *end_map; - int byte_bit; - - bits = 0; - end_map = (char*)(map + size); - bytep = (char*)(map + (start_bit & ~0x7)); - byte_bit = start_bit & 0x7; - - /* - * If the caller fell off the end of the map, return 0. - */ - if (bytep >= end_map) { - return (0); - } - - /* - * If start_bit is not byte aligned, then process the - * first byte separately. - */ - if (byte_bit != 0) { - /* - * Shift off the bits we don't want to look at, - * before indexing into xfs_countbit. - */ - bits += xfs_countbit[(*bytep >> byte_bit)]; - bytep++; - } + uint i; + uint ret = 0; - /* - * Count the bits in each byte until the end of the bitmap. - */ - while (bytep < end_map) { - bits += xfs_countbit[*bytep]; - bytep++; + for (i = 0; i < size; i++) { + ret |= map[i]; } - return (bits); + return (ret == 0); } /* diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index 0bbe568..8e0e463 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -47,16 +47,42 @@ static inline __uint64_t xfs_mask64lo(int n) } /* Get high bit set out of 32-bit argument, -1 if none set */ -extern int xfs_highbit32(__uint32_t v); +static inline int xfs_highbit32(__uint32_t v) +{ + return fls(v) - 1; +} + +/* Get high bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_highbit64(__uint64_t v) +{ + return fls64(v) - 1; +} + +/* Get low bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_lowbit32(__uint32_t v) +{ + unsigned long t = v; + return (v) ? find_first_bit(&t, 32) : -1; +} /* Get low bit set out of 64-bit argument, -1 if none set */ -extern int xfs_lowbit64(__uint64_t v); +static inline int xfs_lowbit64(__uint64_t v) +{ + __uint32_t w = (__uint32_t)v; + int n = 0; -/* Get high bit set out of 64-bit argument, -1 if none set */ -extern int xfs_highbit64(__uint64_t); + if (w) { /* lower bits */ + n = ffs(w); + } else { /* upper bits */ + w = (__uint32_t)(v >> 32); + if (w && (n = ffs(w))) + n += 32; + } + return n - 1; +} -/* Count set bits in map starting with start_bit */ -extern int xfs_count_bits(uint *map, uint size, uint start_bit); +/* Return whether bitmap is empty (1 == empty) */ +extern int xfs_bitmap_empty(uint *map, uint size); /* Count continuous one bits in map starting with start_bit */ extern int xfs_contig_bits(uint *map, uint size, uint start_bit); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index bf46fae..a1aab92 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -52,6 +52,8 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" #ifdef DEBUG @@ -130,7 +132,6 @@ STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp,/* inode logging flags */ xfs_extdelta_t *delta, /* Change made to incore extents */ @@ -185,16 +186,6 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork); /* data or attr fork */ -#ifdef DEBUG -/* - * Check that the extents list for the inode ip is in the right order. - */ -STATIC void -xfs_bmap_check_extents( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork); /* data or attr fork */ -#endif - /* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). @@ -258,7 +249,7 @@ xfs_bmap_local_to_extents( * Else, *lastxp will be set to the index of the found * entry; *gotp will contain the entry. */ -STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_extents( xfs_inode_t *ip, /* incore inode pointer */ xfs_fileoff_t bno, /* block number searched for */ @@ -283,26 +274,11 @@ xfs_bmap_isaeof( #ifdef XFS_BMAP_TRACE /* - * Add a bmap trace buffer entry. Base routine for the others. - */ -STATIC void -xfs_bmap_trace_addentry( - int opcode, /* operation */ - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(ies) */ - xfs_extnum_t cnt, /* count of entries, 1 or 2 */ - xfs_bmbt_rec_t *r1, /* first record */ - xfs_bmbt_rec_t *r2, /* second record or null */ - int whichfork); /* data or attr fork */ - -/* * Add bmap trace entry prior to a call to xfs_iext_remove. */ STATIC void xfs_bmap_trace_delete( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) deleted */ @@ -315,7 +291,7 @@ xfs_bmap_trace_delete( */ STATIC void xfs_bmap_trace_insert( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) inserted */ @@ -329,7 +305,7 @@ xfs_bmap_trace_insert( */ STATIC void xfs_bmap_trace_post_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry updated */ @@ -340,17 +316,25 @@ xfs_bmap_trace_post_update( */ STATIC void xfs_bmap_trace_pre_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry to be updated */ int whichfork); /* data or attr fork */ +#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \ + xfs_bmap_trace_delete(__func__,d,ip,i,c,w) +#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \ + xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w) +#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \ + xfs_bmap_trace_post_update(__func__,d,ip,i,w) +#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \ + xfs_bmap_trace_pre_update(__func__,d,ip,i,w) #else -#define xfs_bmap_trace_delete(f,d,ip,i,c,w) -#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w) -#define xfs_bmap_trace_post_update(f,d,ip,i,w) -#define xfs_bmap_trace_pre_update(f,d,ip,i,w) +#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) +#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) +#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) +#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) #endif /* XFS_BMAP_TRACE */ /* @@ -400,17 +384,15 @@ xfs_bmap_count_tree( int levelin, int *count); -STATIC int +STATIC void xfs_bmap_count_leaves( xfs_ifork_t *ifp, xfs_extnum_t idx, int numrecs, int *count); -STATIC int +STATIC void xfs_bmap_disk_count_leaves( - xfs_ifork_t *ifp, - xfs_mount_t *mp, xfs_extnum_t idx, xfs_bmbt_block_t *block, int numrecs, @@ -446,7 +428,8 @@ xfs_bmap_add_attrfork_btree( cur->bc_private.b.firstblock = *firstblock; if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; - ASSERT(stat == 1); /* must be at least one entry */ + /* must be at least one entry */ + XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); if ((error = xfs_bmbt_newroot(cur, flags, &stat))) goto error0; if (stat == 0) { @@ -544,9 +527,6 @@ xfs_bmap_add_extent( xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent"; -#endif xfs_ifork_t *ifp; /* inode fork ptr */ int logflags; /* returned value */ xfs_extnum_t nextents; /* number of extents in file now */ @@ -564,8 +544,8 @@ xfs_bmap_add_extent( * already extents in the list. */ if (nextents == 0) { - xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new, - NULL, whichfork); + XFS_BMAP_TRACE_INSERT("insert empty", ip, 0, 1, new, NULL, + whichfork); xfs_iext_insert(ifp, 0, 1, new); ASSERT(cur == NULL); ifp->if_lastex = 0; @@ -591,7 +571,7 @@ xfs_bmap_add_extent( if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new, + if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, &logflags, delta, rsvd))) goto done; } @@ -684,7 +664,7 @@ xfs_bmap_add_extent( ASSERT(nblks <= da_old); if (nblks < da_old) xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, - (int)(da_old - nblks), rsvd); + (int64_t)(da_old - nblks), rsvd); } /* * Clear out the allocated field, done with it now in any case. @@ -721,11 +701,8 @@ xfs_bmap_add_extent_delay_real( { xfs_btree_cur_t *cur; /* btree cursor */ int diff; /* temp value */ - xfs_bmbt_rec_t *ep; /* extent entry for idx */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_delay_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ @@ -821,15 +798,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 2); ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents--; @@ -841,13 +817,13 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -868,15 +844,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; - xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 1); if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -886,7 +861,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startblock, LEFT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -905,16 +880,13 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; - xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx + 1, 1); if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -924,7 +896,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + @@ -944,11 +916,9 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startblock(ep, new->br_startblock); - xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) @@ -959,11 +929,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } *dnew = 0; /* DELTA: The in-core extent described by new changed type. */ @@ -976,17 +946,14 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); ip->i_df.if_lastex = idx - 1; if (cur == NULL) @@ -997,7 +964,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startblock, LEFT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1008,8 +975,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), STARTBLOCKVAL(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ temp = LEFT.br_startoff; @@ -1022,11 +988,11 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL, XFS_DATA_FORK); xfs_iext_insert(ifp, idx, 1, new); ip->i_df.if_lastex = idx; @@ -1039,11 +1005,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { @@ -1059,8 +1025,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx + 1); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1073,17 +1038,14 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -1093,7 +1055,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -1104,8 +1066,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), STARTBLOCKVAL(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ temp = PREV.br_startoff; @@ -1119,10 +1080,10 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, - new, NULL, XFS_DATA_FORK); + XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL, + XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 1, new); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; @@ -1134,11 +1095,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { @@ -1154,7 +1115,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1168,13 +1129,15 @@ xfs_bmap_add_extent_delay_real( * This case is avoided almost all the time. */ temp = new->br_startoff - PREV.br_startoff; - xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); r[0] = *new; + r[1].br_state = PREV.br_state; + r[1].br_startblock = 0; r[1].br_startoff = new_endoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_blockcount = temp2; - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1], XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 2, &r[0]); ip->i_df.if_lastex = idx + 1; @@ -1187,11 +1150,11 @@ xfs_bmap_add_extent_delay_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { @@ -1207,7 +1170,7 @@ xfs_bmap_add_extent_delay_real( diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) { + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { /* * Ick gross gag me with a spoon. */ @@ -1218,7 +1181,7 @@ xfs_bmap_add_extent_delay_real( diff--; if (!diff || !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -diff, rsvd)) + XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) break; } if (temp2) { @@ -1226,20 +1189,18 @@ xfs_bmap_add_extent_delay_real( diff--; if (!diff || !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -diff, rsvd)) + XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) break; } } } ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), NULLSTARTBLOCK((int)temp2)); - xfs_bmap_trace_post_update(fname, "0", ip, idx + 2, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); *dnew = temp + temp2; /* DELTA: One in-core extent is split in three. */ temp = PREV.br_startoff; @@ -1296,11 +1257,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_extdelta_t *delta) /* Change made to incore extents */ { xfs_btree_cur_t *cur; /* btree cursor */ - xfs_bmbt_rec_t *ep; /* extent entry for idx */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_unwritten_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ @@ -1401,15 +1359,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 2); ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents -= 2; @@ -1421,19 +1378,19 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + @@ -1452,15 +1409,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; - xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 1); ip->i_d.di_nextents--; if (cur == NULL) @@ -1471,13 +1427,13 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount, @@ -1495,16 +1451,15 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); xfs_bmbt_set_state(ep, newext); - xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; - xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx + 1, 1); ip->i_d.di_nextents--; if (cur == NULL) @@ -1515,13 +1470,13 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, @@ -1540,10 +1495,10 @@ xfs_bmap_add_extent_unwritten_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_state(ep, newext); - xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; if (cur == NULL) @@ -1554,7 +1509,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount, newext))) @@ -1570,21 +1525,21 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; if (cur == NULL) @@ -1595,7 +1550,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -1621,15 +1576,15 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK); ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); xfs_bmbt_set_startoff(ep, new_endoff); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL, XFS_DATA_FORK); xfs_iext_insert(ifp, idx, 1, new); ip->i_df.if_lastex = idx; @@ -1642,7 +1597,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -1652,7 +1607,7 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = *new; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1664,18 +1619,18 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx + 1; if (cur == NULL) @@ -1686,7 +1641,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -1711,12 +1666,12 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, - new, NULL, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL, + XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 1, new); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; @@ -1728,7 +1683,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -1738,11 +1693,11 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1755,17 +1710,17 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. * One extent becomes three extents. */ - xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, new->br_startoff - PREV.br_startoff); - xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); r[0] = *new; r[1].br_startoff = new_endoff; r[1].br_blockcount = PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = oldext; - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1], XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 2, &r[0]); ip->i_df.if_lastex = idx + 1; @@ -1778,27 +1733,34 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); /* new right extent - oldext */ if ((error = xfs_bmbt_update(cur, r[1].br_startoff, r[1].br_startblock, r[1].br_blockcount, r[1].br_state))) goto done; /* new left extent - oldext */ - PREV.br_blockcount = - new->br_startoff - PREV.br_startoff; cur->bc_rec.b = PREV; + cur->bc_rec.b.br_blockcount = + new->br_startoff - PREV.br_startoff; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_increment(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); /* new middle extent - newext */ - cur->bc_rec.b = *new; + cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: One in-core extent is split in three. */ temp = PREV.br_startoff; @@ -1850,16 +1812,12 @@ STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd) /* OK to allocate reserved blocks */ { - xfs_bmbt_rec_t *ep; /* extent record for idx */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_hole_delay"; -#endif + xfs_bmbt_rec_host_t *ep; /* extent record for idx */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_filblks_t newlen=0; /* new indirect size */ @@ -1931,7 +1889,7 @@ xfs_bmap_add_extent_hole_delay( */ temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = STARTBLOCKVAL(left.br_startblock) + @@ -1940,10 +1898,9 @@ xfs_bmap_add_extent_hole_delay( newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), NULLSTARTBLOCK((int)newlen)); - xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1, + XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 1); ip->i_df.if_lastex = idx - 1; /* DELTA: Two in-core extents were replaced by one. */ @@ -1958,7 +1915,7 @@ xfs_bmap_add_extent_hole_delay( * Merge the new allocation with the left neighbor. */ temp = left.br_blockcount + new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = STARTBLOCKVAL(left.br_startblock) + @@ -1966,7 +1923,7 @@ xfs_bmap_add_extent_hole_delay( newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), NULLSTARTBLOCK((int)newlen)); - xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; /* DELTA: One in-core extent grew into a hole. */ @@ -1980,14 +1937,14 @@ xfs_bmap_add_extent_hole_delay( * on the right. * Merge the new allocation with the right neighbor. */ - xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK); temp = new->br_blockcount + right.br_blockcount; oldlen = STARTBLOCKVAL(new->br_startblock) + STARTBLOCKVAL(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_allf(ep, new->br_startoff, NULLSTARTBLOCK((int)newlen), temp, right.br_state); - xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; /* DELTA: One in-core extent grew into a hole. */ temp2 = temp; @@ -2001,7 +1958,7 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, + XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, XFS_DATA_FORK); xfs_iext_insert(ifp, idx, 1, new); ip->i_df.if_lastex = idx; @@ -2013,7 +1970,7 @@ xfs_bmap_add_extent_hole_delay( if (oldlen != newlen) { ASSERT(oldlen > newlen); xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, - (int)(oldlen - newlen), rsvd); + (int64_t)(oldlen - newlen), rsvd); /* * Nothing to do for disk quota accounting here. */ @@ -2049,11 +2006,8 @@ xfs_bmap_add_extent_hole_real( xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */ + xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_hole_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ @@ -2130,15 +2084,14 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, whichfork); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount + right.br_blockcount); - xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, whichfork); - xfs_bmap_trace_delete(fname, "LC|RC", ip, - idx, 1, whichfork); + XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, whichfork); xfs_iext_remove(ifp, idx, 1); ifp->if_lastex = idx - 1; XFS_IFORK_NEXT_SET(ip, whichfork, @@ -2152,13 +2105,13 @@ xfs_bmap_add_extent_hole_real( right.br_startblock, right.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -2180,10 +2133,10 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, whichfork); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); ifp->if_lastex = idx - 1; if (cur == NULL) { rval = XFS_ILOG_FEXT(whichfork); @@ -2194,7 +2147,7 @@ xfs_bmap_add_extent_hole_real( left.br_startblock, left.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -2214,11 +2167,11 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, whichfork); xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); ifp->if_lastex = idx; if (cur == NULL) { rval = XFS_ILOG_FEXT(whichfork); @@ -2229,7 +2182,7 @@ xfs_bmap_add_extent_hole_real( right.br_startblock, right.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -2249,8 +2202,7 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, - whichfork); + XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, whichfork); xfs_iext_insert(ifp, idx, 1, new); ifp->if_lastex = idx; XFS_IFORK_NEXT_SET(ip, whichfork, @@ -2264,11 +2216,11 @@ xfs_bmap_add_extent_hole_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_bmbt_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } /* DELTA: A new extent was added in a hole. */ temp = new->br_startoff; @@ -2458,7 +2410,7 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -STATIC int +STATIC void xfs_bmap_adjacent( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ { @@ -2604,7 +2556,6 @@ xfs_bmap_adjacent( ap->rval = gotbno; } #undef ISVALID - return 0; } STATIC int @@ -2617,12 +2568,10 @@ xfs_bmap_rtalloc( xfs_extlen_t prod = 0; /* product factor for allocators */ xfs_extlen_t ralen = 0; /* realtime allocation length */ xfs_extlen_t align; /* minimum allocation alignment */ - xfs_rtblock_t rtx; /* realtime extent number */ xfs_rtblock_t rtb; mp = ap->ip->i_mount; - align = ap->ip->i_d.di_extsize ? - ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize; + align = xfs_get_extsz_hint(ap->ip); prod = align / mp->m_sb.sb_rextsize; error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, align, 1, ap->eof, 0, @@ -2656,6 +2605,8 @@ xfs_bmap_rtalloc( * pick an extent that will space things out in the rt area. */ if (ap->eof && ap->off == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; @@ -2727,9 +2678,7 @@ xfs_bmap_btalloc( int error; mp = ap->ip->i_mount; - align = (ap->userdata && ap->ip->i_d.di_extsize && - (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ? - ap->ip->i_d.di_extsize : 0; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, align, 0, ap->eof, 0, ap->conv, @@ -2739,9 +2688,15 @@ xfs_bmap_btalloc( } nullfb = ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); - if (nullfb) - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); - else + if (nullfb) { + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + ag = xfs_filestream_lookup_ag(ap->ip); + ag = (ag != NULLAGNUMBER) ? ag : 0; + ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + } else { + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + } + } else ap->rval = ap->firstblock; xfs_bmap_adjacent(ap); @@ -2765,13 +2720,22 @@ xfs_bmap_btalloc( args.firstblock = ap->firstblock; blen = 0; if (nullfb) { - args.type = XFS_ALLOCTYPE_START_BNO; + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_NEAR_BNO; + else + args.type = XFS_ALLOCTYPE_START_BNO; args.total = ap->total; + /* - * Find the longest available space. - * We're going to try for the whole allocation at once. + * Search for an allocation group with a single extent + * large enough for the request. + * + * If one isn't found, then adjust the minimum allocation + * size to the largest space found. */ startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; notinit = 0; down_read(&mp->m_peraglock); while (blen < ap->alen) { @@ -2797,6 +2761,35 @@ xfs_bmap_btalloc( blen = longest; } else notinit = 1; + + if (xfs_inode_is_filestream(ap->ip)) { + if (blen >= ap->alen) + break; + + if (ap->userdata) { + /* + * If startag is an invalid AG, we've + * come here once before and + * xfs_filestream_new_ag picked the + * best currently available. + * + * Don't continue looping, since we + * could loop forever. + */ + if (startag == NULLAGNUMBER) + break; + + error = xfs_filestream_new_ag(ap, &ag); + if (error) { + up_read(&mp->m_peraglock); + return error; + } + + /* loop again to set 'blen'*/ + startag = NULLAGNUMBER; + continue; + } + } if (++ag == mp->m_sb.sb_agcount) ag = 0; if (ag == startag) @@ -2821,24 +2814,34 @@ xfs_bmap_btalloc( */ else args.minlen = ap->alen; + + /* + * set the failure fallback case to look in the selected + * AG as the stream may have moved. + */ + if (xfs_inode_is_filestream(ap->ip)) + ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0); } else if (ap->low) { - args.type = XFS_ALLOCTYPE_START_BNO; + if (xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_START_BNO; args.total = args.minlen = ap->minlen; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.total = ap->total; args.minlen = ap->minlen; } - if (unlikely(ap->userdata && ap->ip->i_d.di_extsize && - (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) { - args.prod = ap->ip->i_d.di_extsize; + /* apply extent size hints if obtained earlier */ + if (unlikely(align)) { + args.prod = align; if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); - } else if (mp->m_sb.sb_blocksize >= NBPP) { + } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { args.prod = 1; args.mod = 0; } else { - args.prod = NBPP >> mp->m_sb.sb_blocklog; + args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } @@ -2973,7 +2976,7 @@ STATIC int xfs_bmap_alloc( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ { - if ((ap->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && ap->userdata) + if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) return xfs_bmap_rtalloc(ap); return xfs_bmap_btalloc(ap); } @@ -2999,7 +3002,7 @@ xfs_bmap_btree_to_extents( int error; /* error return value */ xfs_ifork_t *ifp; /* inode fork data */ xfs_mount_t *mp; /* mount point structure */ - xfs_bmbt_ptr_t *pp; /* ptr to block address */ + __be64 *pp; /* ptr to block address */ xfs_bmbt_block_t *rblock;/* root btree block */ ifp = XFS_IFORK_PTR(ip, whichfork); @@ -3011,12 +3014,12 @@ xfs_bmap_btree_to_extents( ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); mp = ip->i_mount; pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); *logflagsp = 0; #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1))) + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) return error; #endif - cbno = INT_GET(*pp, ARCH_CONVERT); if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF))) return error; @@ -3060,12 +3063,9 @@ xfs_bmap_del_extent( xfs_fileoff_t del_endoff; /* first offset past del */ int delay; /* current block is delayed allocated */ int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_t *ep; /* current extent entry pointer */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ int error; /* error return value */ int flags; /* inode logging flags */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_del_extent"; -#endif xfs_bmbt_irec_t got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -3103,8 +3103,7 @@ xfs_bmap_del_extent( /* * Realtime allocation. Free it and record di_nblocks update. */ - if (whichfork == XFS_DATA_FORK && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { xfs_fsblock_t bno; xfs_filblks_t len; @@ -3140,7 +3139,7 @@ xfs_bmap_del_extent( got.br_startblock, got.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } da_old = da_new = 0; } else { @@ -3159,7 +3158,7 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ - xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork); + XFS_BMAP_TRACE_DELETE("3", ip, idx, 1, whichfork); xfs_iext_remove(ifp, idx, 1); ifp->if_lastex = idx; if (delay) @@ -3173,14 +3172,14 @@ xfs_bmap_del_extent( } if ((error = xfs_bmbt_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); break; case 2: /* * Deleting the first part of the extent. */ - xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("2", ip, idx, whichfork); xfs_bmbt_set_startoff(ep, del_endoff); temp = got.br_blockcount - del->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); @@ -3189,13 +3188,13 @@ xfs_bmap_del_extent( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "2", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); da_new = temp; break; } xfs_bmbt_set_startblock(ep, del_endblock); - xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); if (!cur) { flags |= XFS_ILOG_FEXT(whichfork); break; @@ -3211,19 +3210,19 @@ xfs_bmap_del_extent( * Deleting the last part of the extent. */ temp = got.br_blockcount - del->br_blockcount; - xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("1", ip, idx, whichfork); xfs_bmbt_set_blockcount(ep, temp); ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "1", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); da_new = temp; break; } - xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); if (!cur) { flags |= XFS_ILOG_FEXT(whichfork); break; @@ -3240,7 +3239,7 @@ xfs_bmap_del_extent( * Deleting the middle of the extent. */ temp = del->br_startoff - got.br_startoff; - xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, whichfork); xfs_bmbt_set_blockcount(ep, temp); new.br_startoff = del_endoff; temp2 = got_endoff - del_endoff; @@ -3277,7 +3276,7 @@ xfs_bmap_del_extent( got.br_startblock, temp, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); /* * Update the btree record back * to the original value. @@ -3298,7 +3297,7 @@ xfs_bmap_del_extent( error = XFS_ERROR(ENOSPC); goto done; } - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } else flags |= XFS_ILOG_FEXT(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -3327,8 +3326,8 @@ xfs_bmap_del_extent( } } } - xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork); - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL, + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, whichfork); + XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 1, &new, NULL, whichfork); xfs_iext_insert(ifp, idx + 1, 1, &new); ifp->if_lastex = idx + 1; @@ -3357,7 +3356,7 @@ xfs_bmap_del_extent( */ ASSERT(da_old >= da_new); if (da_old > da_new) - xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new), + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), rsvd); if (delta) { /* DELTA: report the original extent. */ @@ -3411,7 +3410,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_rec_t *arp; /* child record pointer */ xfs_bmbt_block_t *block; /* btree root block */ xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t i, cnt; /* extent record index */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -3500,8 +3499,8 @@ xfs_bmap_extents_to_btree( for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) { - arp->l0 = INT_GET(ep->l0, ARCH_CONVERT); - arp->l1 = INT_GET(ep->l1, ARCH_CONVERT); + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); arp++; cnt++; } } @@ -3512,9 +3511,9 @@ xfs_bmap_extents_to_btree( */ kp = XFS_BMAP_KEY_IADDR(block, 1, cur); arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); - INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp)); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); pp = XFS_BMAP_PTR_IADDR(block, 1, cur); - INT_SET(*pp, ARCH_CONVERT, args.fsbno); + *pp = cpu_to_be64(args.fsbno); /* * Do all this logging at the end so that * the root is at the right level. @@ -3541,6 +3540,7 @@ xfs_bmap_forkoff_reset( if (whichfork == XFS_ATTR_FORK && (ip->i_d.di_format != XFS_DINODE_FMT_DEV) && (ip->i_d.di_format != XFS_DINODE_FMT_UUID) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) { ip->i_d.di_forkoff = mp->m_attroffset >> 3; ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / @@ -3567,9 +3567,6 @@ xfs_bmap_local_to_extents( { int error; /* error return value */ int flags; /* logging flags returned */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_local_to_extents"; -#endif xfs_ifork_t *ifp; /* inode fork pointer */ /* @@ -3585,7 +3582,7 @@ xfs_bmap_local_to_extents( if (ifp->if_bytes) { xfs_alloc_arg_t args; /* allocation arguments */ xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ + xfs_bmbt_rec_host_t *ep;/* extent record pointer */ args.tp = tp; args.mp = ip->i_mount; @@ -3624,7 +3621,7 @@ xfs_bmap_local_to_extents( xfs_iext_add(ifp, 0, 1); ep = xfs_iext_get_ext(ifp, 0); xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, 1); ip->i_d.di_nblocks = 1; XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, @@ -3650,7 +3647,7 @@ done: * entry (null if none). Else, *lastxp will be set to the index * of the found entry; *gotp will contain the entry. */ -xfs_bmbt_rec_t * /* pointer to found extent entry */ +xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_multi_extents( xfs_ifork_t *ifp, /* inode fork pointer */ xfs_fileoff_t bno, /* block number searched for */ @@ -3659,7 +3656,7 @@ xfs_bmap_search_multi_extents( xfs_bmbt_irec_t *gotp, /* out: extent entry found */ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ { - xfs_bmbt_rec_t *ep; /* extent record pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ xfs_extnum_t lastx; /* last extent index */ /* @@ -3701,37 +3698,40 @@ xfs_bmap_search_multi_extents( * Else, *lastxp will be set to the index of the found * entry; *gotp will contain the entry. */ -STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_extents( xfs_inode_t *ip, /* incore inode pointer */ xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ + int fork, /* data or attr fork */ int *eofp, /* out: end of file found */ xfs_extnum_t *lastxp, /* out: last extent index */ xfs_bmbt_irec_t *gotp, /* out: extent entry found */ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ { xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - int rt; /* realtime flag */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ XFS_STATS_INC(xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, fork); ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); - if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) { - cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, (long long)ip->i_ino, + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x\n", + (unsigned long long)ip->i_ino, (unsigned long long)gotp->br_startblock, (unsigned long long)gotp->br_startoff, (unsigned long long)gotp->br_blockcount, - gotp->br_state); - } - return ep; + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; } @@ -3744,16 +3744,16 @@ ktrace_t *xfs_bmap_trace_buf; STATIC void xfs_bmap_trace_addentry( int opcode, /* operation */ - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(ies) */ xfs_extnum_t cnt, /* count of entries, 1 or 2 */ - xfs_bmbt_rec_t *r1, /* first record */ - xfs_bmbt_rec_t *r2, /* second record or null */ + xfs_bmbt_rec_host_t *r1, /* first record */ + xfs_bmbt_rec_host_t *r2, /* second record or null */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_t tr2; + xfs_bmbt_rec_host_t tr2; ASSERT(cnt == 1 || cnt == 2); ASSERT(r1 != NULL); @@ -3803,7 +3803,7 @@ xfs_bmap_trace_addentry( */ STATIC void xfs_bmap_trace_delete( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) deleted */ @@ -3825,7 +3825,7 @@ xfs_bmap_trace_delete( */ STATIC void xfs_bmap_trace_insert( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) inserted */ @@ -3834,8 +3834,8 @@ xfs_bmap_trace_insert( xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_t tr1; /* compressed record 1 */ - xfs_bmbt_rec_t tr2; /* compressed record 2 if needed */ + xfs_bmbt_rec_host_t tr1; /* compressed record 1 */ + xfs_bmbt_rec_host_t tr2; /* compressed record 2 if needed */ xfs_bmbt_set_all(&tr1, r1); if (cnt == 2) { @@ -3854,7 +3854,7 @@ xfs_bmap_trace_insert( */ STATIC void xfs_bmap_trace_post_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry updated */ @@ -3872,7 +3872,7 @@ xfs_bmap_trace_post_update( */ STATIC void xfs_bmap_trace_pre_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry to be updated */ @@ -3962,7 +3962,6 @@ xfs_bmap_add_attrfork( xfs_bmap_free_t flist; /* freed extent records */ xfs_mount_t *mp; /* mount structure */ xfs_trans_t *tp; /* transaction pointer */ - unsigned long s; /* spinlock spl value */ int blks; /* space reservation */ int version = 1; /* superblock attr version */ int committed; /* xaction was committed */ @@ -4001,7 +4000,7 @@ xfs_bmap_add_attrfork( ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } ASSERT(ip->i_d.di_anextents == 0); - VN_HOLD(XFS_ITOV(ip)); + IHOLD(ip); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -4055,35 +4054,34 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) goto error2; - if (!XFS_SB_VERSION_HASATTR(&mp->m_sb) || - (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2)) { + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { __int64_t sbfields = 0; - s = XFS_SB_LOCK(mp); - if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) { - XFS_SB_VERSION_ADDATTR(&mp->m_sb); + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); sbfields |= XFS_SB_VERSIONNUM; } - if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2) { - XFS_SB_VERSION_ADDATTR2(&mp->m_sb); + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); } if (sbfields) { - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, sbfields); } else - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); } - if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) + if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_df.if_ext_max == XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; error2: xfs_bmap_cancel(&flist); error1: - ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE)); xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); @@ -4162,16 +4160,21 @@ xfs_bmap_compute_maxlevels( * number of leaf entries, is controlled by the type of di_nextents * (a signed 32-bit number, xfs_extnum_t), or by di_anextents * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be (m_attroffset >> 3) + * because we could have mounted with ATTR2 and then mounted back + * with ATTR1, keeping the di_forkoff's fixed but probably at + * various positions. Therefore, for both ATTR1 and ATTR2 + * we have to assume the worst case scenario of a minimum size + * available. */ if (whichfork == XFS_DATA_FORK) { maxleafents = MAXEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? - XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); } else { maxleafents = MAXAEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? - XFS_BMDR_SPACE_CALC(MINABTPTRS) : - mp->m_sb.sb_inodesize - mp->m_attroffset; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); } maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); minleafrecs = mp->m_bmap_dmnr[0]; @@ -4207,7 +4210,6 @@ int /* error */ xfs_bmap_finish( xfs_trans_t **tp, /* transaction pointer addr */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_fsblock_t firstblock, /* controlled ag for allocs */ int *committed) /* xact committed or not */ { xfs_efd_log_item_t *efd; /* extent free data */ @@ -4233,7 +4235,7 @@ xfs_bmap_finish( logres = ntp->t_log_res; logcount = ntp->t_log_count; ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0, NULL); + error = xfs_trans_commit(*tp, 0); *tp = ntp; *committed = 1; /* @@ -4309,7 +4311,6 @@ xfs_bmap_first_unused( xfs_fileoff_t *first_unused, /* unused block */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_t *ep; /* pointer to an extent entry */ int error; /* error return value */ int idx; /* extent record index */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -4333,7 +4334,7 @@ xfs_bmap_first_unused( lowest = *first_unused; nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - ep = xfs_iext_get_ext(ifp, idx); + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); off = xfs_bmbt_get_startoff(ep); /* * See if the hole before this extent will work. @@ -4364,7 +4365,7 @@ xfs_bmap_last_before( { xfs_fileoff_t bno; /* input file offset */ int eof; /* hit end of file */ - xfs_bmbt_rec_t *ep; /* pointer to last extent */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ int error; /* error return value */ xfs_bmbt_irec_t got; /* current extent value */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -4410,7 +4411,7 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, /* last block */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_t *ep; /* pointer to last extent */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ int error; /* error return value */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_extnum_t nextents; /* number of extent entries */ @@ -4447,14 +4448,17 @@ xfs_bmap_one_block( xfs_inode_t *ip, /* incore inode */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_t *ep; /* ptr to fork's extent */ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ xfs_ifork_t *ifp; /* inode fork pointer */ int rval; /* return value */ xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize; + if (whichfork == XFS_DATA_FORK) { + return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? + (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : + (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + } #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4466,7 +4470,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -4487,14 +4491,11 @@ xfs_bmap_read_extents( xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_read_extents"; -#endif xfs_extnum_t i, j; /* index into the extents list */ xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ + __be64 *pp; /* pointer to block address */ /* REFERENCED */ xfs_extnum_t room; /* number of entries there's room for */ @@ -4510,10 +4511,10 @@ xfs_bmap_read_extents( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. @@ -4528,12 +4529,9 @@ xfs_bmap_read_extents( error0); if (level == 0) break; - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, - 1, mp->m_bmap_dmxr[1]); - XFS_WANT_CORRUPTED_GOTO( - XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), - error0); - bno = INT_GET(*pp, ARCH_CONVERT); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); } /* @@ -4545,7 +4543,7 @@ xfs_bmap_read_extents( * Loop over all leaf nodes. Copy information to the extent records. */ for (;;) { - xfs_bmbt_rec_t *frp, *trp; + xfs_bmbt_rec_t *frp; xfs_fsblock_t nextbno; xfs_extnum_t num_recs; xfs_extnum_t start; @@ -4574,13 +4572,12 @@ xfs_bmap_read_extents( /* * Copy records into the extent records. */ - frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, 1, mp->m_bmap_dmxr[0]); + frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); start = i; for (j = 0; j < num_recs; j++, i++, frp++) { - trp = xfs_iext_get_ext(ifp, i); - trp->l0 = INT_GET(frp->l0, ARCH_CONVERT); - trp->l1 = INT_GET(frp->l1, ARCH_CONVERT); + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); } if (exntf == XFS_EXTFMT_NOSTATE) { /* @@ -4610,7 +4607,7 @@ xfs_bmap_read_extents( } ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); - xfs_bmap_trace_exlist(fname, ip, i, whichfork); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: xfs_trans_brelse(tp, bp); @@ -4623,12 +4620,12 @@ error0: */ void xfs_bmap_trace_exlist( - char *fname, /* function name */ + const char *fname, /* function name */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_t *ep; /* current extent record */ + xfs_bmbt_rec_host_t *ep; /* current extent record */ xfs_extnum_t idx; /* extent record index */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t s; /* file extent record */ @@ -4638,7 +4635,7 @@ xfs_bmap_trace_exlist( for (idx = 0; idx < cnt; idx++) { ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &s); - xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL, + XFS_BMAP_TRACE_INSERT("exlist", ip, idx, 1, &s, NULL, whichfork); } } @@ -4724,7 +4721,7 @@ xfs_bmapi( xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* we've hit the end of extents */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return */ xfs_bmbt_irec_t got; /* current file extent record */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -4878,12 +4875,7 @@ xfs_bmapi( xfs_extlen_t extsz; /* Figure out the extent size, adjust alen */ - if (rt) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } + extsz = xfs_get_extsz_hint(ip); if (extsz) { error = xfs_bmap_extsize_align(mp, &got, &prev, extsz, @@ -4926,28 +4918,28 @@ xfs_bmapi( if (rt) { error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -(extsz), (flags & + -((int64_t)extsz), (flags & XFS_BMAPI_RSVBLOCKS)); } else { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - -(alen), (flags & + -((int64_t)alen), (flags & XFS_BMAPI_RSVBLOCKS)); } if (!error) { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - -(indlen), (flags & + -((int64_t)indlen), (flags & XFS_BMAPI_RSVBLOCKS)); if (error && rt) xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - extsz, (flags & + (int64_t)extsz, (flags & XFS_BMAPI_RSVBLOCKS)); else if (error) xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - alen, (flags & + (int64_t)alen, (flags & XFS_BMAPI_RSVBLOCKS)); } @@ -5062,7 +5054,7 @@ xfs_bmapi( * A wasdelay extent has been initialized, so * shouldn't be flagged as unwritten. */ - if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) got.br_state = XFS_EXT_UNWRITTEN; } @@ -5229,10 +5221,10 @@ xfs_bmapi( * Else go on to the next record. */ ep = xfs_iext_get_ext(ifp, ++lastx); - if (lastx >= nextents) { + prev = got; + if (lastx >= nextents) eof = 1; - prev = got; - } else + else xfs_bmbt_get_all(ep, &got); } ifp->if_lastex = lastx; @@ -5380,7 +5372,7 @@ xfs_bunmapi( xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ int eof; /* is deleting at eof */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ xfs_bmbt_irec_t got; /* current extent record */ @@ -5502,7 +5494,7 @@ xfs_bunmapi( * get rid of part of a realtime extent. */ if (del.br_state == XFS_EXT_UNWRITTEN || - !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { /* * This piece is unwritten, or we're not * using unwritten extents. Skip over it. @@ -5554,7 +5546,7 @@ xfs_bunmapi( } else if ((del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || xfs_trans_get_block_res(tp) == 0)) || - !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { /* * Can't make it unwritten. There isn't * a full extent here so just skip it. @@ -5613,13 +5605,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int)rtexts, rsvd); + (int64_t)rtexts, rsvd); (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - (int)del.br_blockcount, rsvd); + (int64_t)del.br_blockcount, rsvd); (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5745,11 +5737,44 @@ error0: } /* + * returns 1 for success, 0 if we failed to map the extent. + */ +STATIC int +xfs_getbmapx_fix_eof_hole( + xfs_inode_t *ip, /* xfs incore inode pointer */ + struct getbmap *out, /* output structure */ + int prealloced, /* this is a file with + * preallocated data space */ + __int64_t end, /* last block requested */ + xfs_fsblock_t startblock) +{ + __int64_t fixlen; + xfs_mount_t *mp; /* file system mount point */ + + if (startblock == HOLESTARTBLOCK) { + mp = ip->i_mount; + out->bmv_block = -1; + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); + fixlen -= out->bmv_offset; + if (prealloced && out->bmv_offset + out->bmv_length == end) { + /* Came to hole at EOF. Trim it. */ + if (fixlen <= 0) + return 0; + out->bmv_length = fixlen; + } + } else { + out->bmv_block = XFS_FSB_TO_DB(ip, startblock); + } + + return 1; +} + +/* * Fcntl interface to xfs_bmapi. */ int /* error code */ xfs_getbmap( - bhv_desc_t *bdp, /* XFS behavior descriptor*/ + xfs_inode_t *ip, struct getbmap *bmv, /* user bmap structure */ void __user *ap, /* pointer to user's array */ int interface) /* interface flags */ @@ -5758,8 +5783,6 @@ xfs_getbmap( int error; /* return value */ __int64_t fixlen; /* length for -1 case */ int i; /* extent number */ - xfs_inode_t *ip; /* xfs incore inode pointer */ - bhv_vnode_t *vp; /* corresponding vnode */ int lock; /* lock state */ xfs_bmbt_irec_t *map; /* buffer for user's data */ xfs_mount_t *mp; /* file system mount point */ @@ -5776,8 +5799,6 @@ xfs_getbmap( int bmapi_flags; /* flags for xfs_bmapi */ __int32_t oflags; /* getbmapx bmv_oflags field */ - vp = BHV_TO_VNODE(bdp); - ip = XFS_BHVTOI(bdp); mp = ip->i_mount; whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; @@ -5796,11 +5817,10 @@ xfs_getbmap( * could misinterpret holes in a DMAPI file as true holes, * when in fact they may represent offline user data. */ - if ( (interface & BMV_IF_NO_DMAPI_READ) == 0 - && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) - && whichfork == XFS_DATA_FORK) { - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); + if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && + DM_EVENT_ENABLED(ip, DM_EVENT_READ) && + whichfork == XFS_DATA_FORK) { + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); if (error) return XFS_ERROR(error); } @@ -5823,14 +5843,13 @@ xfs_getbmap( ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) return XFS_ERROR(EINVAL); if (whichfork == XFS_DATA_FORK) { - if ((ip->i_d.di_extsize && (ip->i_d.di_flags & - (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) || + if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = ip->i_d.di_size; + fixlen = ip->i_size; } } else { prealloced = 0; @@ -5854,9 +5873,15 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) { + if (whichfork == XFS_DATA_FORK && + (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ - error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF); + error = xfs_flush_pages(ip, (xfs_off_t)0, + -1, 0, FI_REMAPF); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return error; + } } ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); @@ -5906,18 +5931,15 @@ xfs_getbmap( out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); if (map[i].br_startblock == HOLESTARTBLOCK && - ((prealloced && out.bmv_offset + out.bmv_length == bmvend) || - whichfork == XFS_ATTR_FORK )) { - /* - * came to hole at end of file or the end of - attribute fork - */ + whichfork == XFS_ATTR_FORK) { + /* came to the end of attribute fork */ goto unlock_and_return; } else { - out.bmv_block = - (map[i].br_startblock == HOLESTARTBLOCK) ? - -1 : - XFS_FSB_TO_DB(ip, map[i].br_startblock); + if (!xfs_getbmapx_fix_eof_hole(ip, &out, + prealloced, bmvend, + map[i].br_startblock)) { + goto unlock_and_return; + } /* return either getbmap/getbmapx structure. */ if (interface & BMV_IF_EXTENDED) { @@ -5956,7 +5978,7 @@ unlock_and_return: xfs_iunlock_map_shared(ip, lock); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - kmem_free(map, subnex * sizeof(*map)); + kmem_free(map); return error; } @@ -5976,7 +5998,7 @@ xfs_bmap_isaeof( { int error; /* error return value */ xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_t *lastrec; /* extent record pointer */ + xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ xfs_extnum_t nextents; /* number of file extents */ xfs_bmbt_irec_t s; /* expanded extent record */ @@ -6020,7 +6042,7 @@ xfs_bmap_eof( xfs_fsblock_t blockcount; /* extent block count */ int error; /* error return value */ xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_t *lastrec; /* extent record pointer */ + xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ xfs_extnum_t nextents; /* number of file extents */ xfs_fileoff_t startoff; /* extent starting file offset */ @@ -6045,32 +6067,6 @@ xfs_bmap_eof( } #ifdef DEBUG -/* - * Check that the extents list for the inode ip is in the right order. - */ -STATIC void -xfs_bmap_check_extents( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t *ep; /* current extent entry */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extents in list */ - xfs_bmbt_rec_t *nextp; /* next extent entry */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ep = xfs_iext_get_ext(ifp, 0); - for (idx = 0; idx < nextents - 1; idx++) { - nextp = xfs_iext_get_ext(ifp, idx + 1); - xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, - (void *)(nextp)); - ep = nextp; - } -} - STATIC xfs_buf_t * xfs_bmap_get_bp( @@ -6100,7 +6096,7 @@ xfs_bmap_get_bp( tp = cur->bc_tp; licp = &tp->t_items; while (!bp && licp != NULL) { - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { licp = licp->lic_next; continue; } @@ -6110,11 +6106,11 @@ xfs_bmap_get_bp( xfs_buf_log_item_t *bip; xfs_buf_t *lbp; - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); lip = lidp->lid_item; if (lip->li_type != XFS_LI_BUF) continue; @@ -6141,7 +6137,7 @@ xfs_check_block( short sz) { int i, j, dmxr; - xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */ + __be64 *pp, *thispa; /* pointer to block address */ xfs_bmbt_key_t *prevp, *keyp; ASSERT(be16_to_cpu(block->bb_level) > 0); @@ -6153,8 +6149,7 @@ xfs_check_block( if (root) { keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz); } else { - keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, i, dmxr); + keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i); } if (prevp) { @@ -6169,23 +6164,21 @@ xfs_check_block( if (root) { pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz); } else { - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, i, dmxr); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr); } for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { if (root) { thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); } else { - thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, j, dmxr); + thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j, + dmxr); } - if (INT_GET(*thispa, ARCH_CONVERT) == - INT_GET(*pp, ARCH_CONVERT)) { + if (*thispa == *pp) { cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", - __FUNCTION__, j, i, - INT_GET(*thispa, ARCH_CONVERT)); + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); panic("%s: ptrs are equal in node\n", - __FUNCTION__); + __func__); } } } @@ -6210,9 +6203,9 @@ xfs_bmap_check_leaf_extents( xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ + __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ - xfs_bmbt_rec_t *lastp; /* pointer to previous extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ int bp_release = 0; @@ -6231,10 +6224,12 @@ xfs_bmap_check_leaf_extents( ASSERT(level > 0); xfs_check_block(block, mp, 1, ifp->if_broot_bytes); pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. @@ -6263,10 +6258,9 @@ xfs_bmap_check_leaf_extents( */ xfs_check_block(block, mp, 0, 0); - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, - 1, mp->m_bmap_dmxr[1]); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0); - bno = INT_GET(*pp, ARCH_CONVERT); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -6281,7 +6275,6 @@ xfs_bmap_check_leaf_extents( /* * Loop over all leaf nodes checking that all extents are in the right order. */ - lastp = NULL; for (;;) { xfs_fsblock_t nextbno; xfs_extnum_t num_recs; @@ -6301,21 +6294,17 @@ xfs_bmap_check_leaf_extents( * conform with the first entry in this one. */ - ep = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, 1, mp->m_bmap_dmxr[0]); + ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); + if (i) { + xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); + } for (j = 1; j < num_recs; j++) { - nextp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, j + 1, mp->m_bmap_dmxr[0]); - if (lastp) { - xfs_btree_check_rec(XFS_BTNUM_BMAP, - (void *)lastp, (void *)ep); - } - xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, - (void *)(nextp)); - lastp = ep; + nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); + xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); ep = nextp; } + last = *ep; i += num_recs; if (bp_release) { bp_release = 0; @@ -6346,13 +6335,13 @@ xfs_bmap_check_leaf_extents( return; error0: - cmn_err(CE_WARN, "%s: at error0", __FUNCTION__); + cmn_err(CE_WARN, "%s: at error0", __func__); if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", - __FUNCTION__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__); + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); return; } #endif @@ -6372,19 +6361,15 @@ xfs_bmap_count_blocks( xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ + __be64 *pp; /* pointer to block address */ bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - if (unlikely(xfs_bmap_count_leaves(ifp, 0, + xfs_bmap_count_leaves(ifp, 0, ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } + count); return 0; } @@ -6395,10 +6380,10 @@ xfs_bmap_count_blocks( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, @@ -6413,7 +6398,7 @@ xfs_bmap_count_blocks( * Recursively walks each level of a btree * to count total fsblocks is use. */ -int /* error */ +STATIC int /* error */ xfs_bmap_count_tree( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ @@ -6425,7 +6410,7 @@ xfs_bmap_count_tree( int error; xfs_buf_t *bp, *nbp; int level = levelin; - xfs_bmbt_ptr_t *pp; + __be64 *pp; xfs_fsblock_t bno = blockno; xfs_fsblock_t nextbno; xfs_bmbt_block_t *block, *nextblock; @@ -6450,9 +6435,8 @@ xfs_bmap_count_tree( } /* Dive to the next level */ - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); - bno = INT_GET(*pp, ARCH_CONVERT); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); if (unlikely((error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { xfs_trans_brelse(tp, bp); @@ -6466,13 +6450,7 @@ xfs_bmap_count_tree( for (;;) { nextbno = be64_to_cpu(block->bb_rightsib); numrecs = be16_to_cpu(block->bb_numrecs); - if (unlikely(xfs_bmap_disk_count_leaves(ifp, mp, - 0, block, numrecs, count) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(2)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } + xfs_bmap_disk_count_leaves(0, block, numrecs, count); xfs_trans_brelse(tp, bp); if (nextbno == NULLFSBLOCK) break; @@ -6490,7 +6468,7 @@ xfs_bmap_count_tree( /* * Count leaf blocks given a range of extent records. */ -int +STATIC void xfs_bmap_count_leaves( xfs_ifork_t *ifp, xfs_extnum_t idx, @@ -6498,23 +6476,19 @@ xfs_bmap_count_leaves( int *count) { int b; - xfs_bmbt_rec_t *frp; for (b = 0; b < numrecs; b++) { - frp = xfs_iext_get_ext(ifp, idx + b); + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); *count += xfs_bmbt_get_blockcount(frp); } - return 0; } /* * Count leaf blocks given a range of extent records originally * in btree format. */ -int +STATIC void xfs_bmap_disk_count_leaves( - xfs_ifork_t *ifp, - xfs_mount_t *mp, xfs_extnum_t idx, xfs_bmbt_block_t *block, int numrecs, @@ -6524,9 +6498,7 @@ xfs_bmap_disk_count_leaves( xfs_bmbt_rec_t *frp; for (b = 1; b <= numrecs; b++) { - frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, idx + b, mp->m_bmap_dmxr[0]); + frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); *count += xfs_bmbt_disk_get_blockcount(frp); } - return 0; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 80e9340..9f3e3a8 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -25,6 +25,8 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; +extern kmem_zone_t *xfs_bmap_free_item_zone; + /* * DELTA: describe a change to the in-core extent list. * @@ -52,12 +54,23 @@ typedef struct xfs_bmap_free_item /* * Header for free extent list. + * + * xbf_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. */ typedef struct xfs_bmap_free { xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ int xbf_count; /* count of items on list */ - int xbf_low; /* kludge: alloc in low mode */ + int xbf_low; /* alloc in low mode */ } xfs_bmap_free_t; #define XFS_BMAP_MAX_NMAP 4 @@ -144,12 +157,14 @@ extern ktrace_t *xfs_bmap_trace_buf; */ void xfs_bmap_trace_exlist( - char *fname, /* function name */ + const char *fname, /* function name */ struct xfs_inode *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in list */ int whichfork); /* data or attr fork */ +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ + xfs_bmap_trace_exlist(__func__,ip,c,w) #else -#define xfs_bmap_trace_exlist(f,ip,c,w) +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif /* @@ -202,7 +217,6 @@ int /* error */ xfs_bmap_finish( struct xfs_trans **tp, /* transaction pointer addr */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_fsblock_t firstblock, /* controlled a.g. for allocs */ int *committed); /* xact committed or not */ /* @@ -334,7 +348,7 @@ xfs_bunmapi( */ int /* error code */ xfs_getbmap( - bhv_desc_t *bdp, /* XFS behavior descriptor*/ + xfs_inode_t *ip, struct getbmap *bmv, /* user bmap structure */ void __user *ap, /* pointer to user's array */ int iflags); /* interface flags */ @@ -377,7 +391,7 @@ xfs_check_nostate_extents( * entry (null if none). Else, *lastxp will be set to the index * of the found entry; *gotp will contain the entry. */ -xfs_bmbt_rec_t * +xfs_bmbt_rec_host_t * xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 18fb738..23efad2 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -58,7 +58,7 @@ STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *); STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *); STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *, - xfs_bmbt_key_t *, xfs_btree_cur_t **, int *); + __uint64_t *, xfs_btree_cur_t **, int *); STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int); @@ -76,7 +76,7 @@ static char EXIT[] = "exit"; */ STATIC void xfs_bmbt_trace_enter( - char *func, + const char *func, xfs_btree_cur_t *cur, char *s, int type, @@ -117,7 +117,7 @@ xfs_bmbt_trace_enter( */ STATIC void xfs_bmbt_trace_argbi( - char *func, + const char *func, xfs_btree_cur_t *cur, xfs_buf_t *b, int i, @@ -134,7 +134,7 @@ xfs_bmbt_trace_argbi( */ STATIC void xfs_bmbt_trace_argbii( - char *func, + const char *func, xfs_btree_cur_t *cur, xfs_buf_t *b, int i0, @@ -153,7 +153,7 @@ xfs_bmbt_trace_argbii( */ STATIC void xfs_bmbt_trace_argfffi( - char *func, + const char *func, xfs_btree_cur_t *cur, xfs_dfiloff_t o, xfs_dfsbno_t b, @@ -172,7 +172,7 @@ xfs_bmbt_trace_argfffi( */ STATIC void xfs_bmbt_trace_argi( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, int line) @@ -188,20 +188,15 @@ xfs_bmbt_trace_argi( */ STATIC void xfs_bmbt_trace_argifk( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, xfs_fsblock_t f, - xfs_bmbt_key_t *k, + xfs_dfiloff_t o, int line) { - xfs_dfsbno_t d; - xfs_dfiloff_t o; - - d = (xfs_dfsbno_t)f; - o = INT_GET(k->br_startoff, ARCH_CONVERT); xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, - i, d >> 32, (int)d, o >> 32, + i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32, (int)o, 0, 0, 0, 0, 0, 0); } @@ -211,7 +206,7 @@ xfs_bmbt_trace_argifk( */ STATIC void xfs_bmbt_trace_argifr( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, xfs_fsblock_t f, @@ -240,7 +235,7 @@ xfs_bmbt_trace_argifr( */ STATIC void xfs_bmbt_trace_argik( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, xfs_bmbt_key_t *k, @@ -248,7 +243,7 @@ xfs_bmbt_trace_argik( { xfs_dfiloff_t o; - o = INT_GET(k->br_startoff, ARCH_CONVERT); + o = be64_to_cpu(k->br_startoff); xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, i, o >> 32, (int)o, 0, 0, 0, 0, 0, @@ -260,18 +255,19 @@ xfs_bmbt_trace_argik( */ STATIC void xfs_bmbt_trace_cursor( - char *func, + const char *func, xfs_btree_cur_t *cur, char *s, int line) { - xfs_bmbt_rec_t r; + xfs_bmbt_rec_host_t r; xfs_bmbt_set_all(&r, &cur->bc_rec.b); xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line, (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) | cur->bc_private.b.allocated, - INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT), + r.l0 >> 32, (int)r.l0, + r.l1 >> 32, (int)r.l1, (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1], (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3], (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], @@ -279,27 +275,27 @@ xfs_bmbt_trace_cursor( } #define XFS_BMBT_TRACE_ARGBI(c,b,i) \ - xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__) + xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__) #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ - xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__) + xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__) #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ - xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__) + xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__) #define XFS_BMBT_TRACE_ARGI(c,i) \ - xfs_bmbt_trace_argi(fname, c, i, __LINE__) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \ - xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__) + xfs_bmbt_trace_argi(__func__, c, i, __LINE__) +#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ + xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__) #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ - xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__) + xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__) #define XFS_BMBT_TRACE_ARGIK(c,i,k) \ - xfs_bmbt_trace_argik(fname, c, i, k, __LINE__) + xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__) #define XFS_BMBT_TRACE_CURSOR(c,s) \ - xfs_bmbt_trace_cursor(fname, c, s, __LINE__) + xfs_bmbt_trace_cursor(__func__, c, s, __LINE__) #else #define XFS_BMBT_TRACE_ARGBI(c,b,i) #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) #define XFS_BMBT_TRACE_ARGI(c,i) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) +#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) #define XFS_BMBT_TRACE_ARGIK(c,i,k) #define XFS_BMBT_TRACE_CURSOR(c,s) @@ -323,9 +319,6 @@ xfs_bmbt_delrec( xfs_fsblock_t bno; /* fs-relative block number */ xfs_buf_t *bp; /* buffer for block */ int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_delrec"; -#endif int i; /* loop counter */ int j; /* temp state */ xfs_bmbt_key_t key; /* bmap btree key */ @@ -357,7 +350,7 @@ xfs_bmbt_delrec( XFS_BMBT_TRACE_CURSOR(cur, ENTRY); XFS_BMBT_TRACE_ARGI(cur, level); ptr = cur->bc_ptrs[level]; - tcur = (xfs_btree_cur_t *)0; + tcur = NULL; if (ptr == 0) { XFS_BMBT_TRACE_CURSOR(cur, EXIT); *stat = 0; @@ -382,7 +375,7 @@ xfs_bmbt_delrec( pp = XFS_BMAP_PTR_IADDR(block, 1, cur); #ifdef DEBUG for (i = ptr; i < numrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); goto error0; } @@ -391,7 +384,7 @@ xfs_bmbt_delrec( if (ptr < numrecs) { memmove(&kp[ptr - 1], &kp[ptr], (numrecs - ptr) * sizeof(*kp)); - memmove(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */ + memmove(&pp[ptr - 1], &pp[ptr], (numrecs - ptr) * sizeof(*pp)); xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1); xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1); @@ -404,7 +397,8 @@ xfs_bmbt_delrec( xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1); } if (ptr == 1) { - INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp)); + key.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(rp)); kp = &key; } } @@ -621,7 +615,7 @@ xfs_bmbt_delrec( rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG for (i = 0; i < numrrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); goto error0; } @@ -637,7 +631,7 @@ xfs_bmbt_delrec( memcpy(lrp, rrp, numrrecs * sizeof(*lrp)); xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); } - be16_add(&left->bb_numrecs, numrrecs); + be16_add_cpu(&left->bb_numrecs, numrrecs); left->bb_rightsib = right->bb_rightsib; xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS); if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) { @@ -682,47 +676,6 @@ error0: return error; } -#ifdef DEBUG -/* - * Get the data from the pointed-to record. - */ -int -xfs_bmbt_get_rec( - xfs_btree_cur_t *cur, - xfs_fileoff_t *off, - xfs_fsblock_t *bno, - xfs_filblks_t *len, - xfs_exntst_t *state, - int *stat) -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; -#ifdef DEBUG - int error; -#endif - int ptr; - xfs_bmbt_rec_t *rp; - - block = xfs_bmbt_get_block(cur, 0, &bp); - ptr = cur->bc_ptrs[0]; -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) - return error; -#endif - if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) { - *stat = 0; - return 0; - } - rp = XFS_BMAP_REC_IADDR(block, ptr, cur); - *off = xfs_bmbt_disk_get_startoff(rp); - *bno = xfs_bmbt_disk_get_startblock(rp); - *len = xfs_bmbt_disk_get_blockcount(rp); - *state = xfs_bmbt_disk_get_state(rp); - *stat = 1; - return 0; -} -#endif - /* * Insert one record/level. Return information to the caller * allowing the next level up to proceed if necessary. @@ -739,16 +692,13 @@ xfs_bmbt_insrec( xfs_bmbt_block_t *block; /* bmap btree block */ xfs_buf_t *bp; /* buffer for block */ int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_insrec"; -#endif int i; /* loop index */ xfs_bmbt_key_t key; /* bmap btree key */ xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ int logflags; /* inode logging flags */ xfs_fsblock_t nbno; /* new block number */ struct xfs_btree_cur *ncur; /* new btree cursor */ - xfs_bmbt_key_t nkey; /* new btree key value */ + __uint64_t startoff; /* new btree key value */ xfs_bmbt_rec_t nrec; /* new record count */ int optr; /* old key/record index */ xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ @@ -759,9 +709,8 @@ xfs_bmbt_insrec( ASSERT(level < cur->bc_nlevels); XFS_BMBT_TRACE_CURSOR(cur, ENTRY); XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp); - ncur = (xfs_btree_cur_t *)0; - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(recp)); + ncur = NULL; + key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp)); optr = ptr = cur->bc_ptrs[level]; if (ptr == 0) { XFS_BMBT_TRACE_CURSOR(cur, EXIT); @@ -820,7 +769,7 @@ xfs_bmbt_insrec( optr = ptr = cur->bc_ptrs[level]; } else { if ((error = xfs_bmbt_split(cur, level, - &nbno, &nkey, &ncur, + &nbno, &startoff, &ncur, &i))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); @@ -840,7 +789,7 @@ xfs_bmbt_insrec( #endif ptr = cur->bc_ptrs[level]; xfs_bmbt_disk_set_allf(&nrec, - nkey.br_startoff, 0, 0, + startoff, 0, 0, XFS_EXT_NORM); } else { XFS_BMBT_TRACE_CURSOR(cur, @@ -858,7 +807,7 @@ xfs_bmbt_insrec( pp = XFS_BMAP_PTR_IADDR(block, 1, cur); #ifdef DEBUG for (i = numrecs; i >= ptr; i--) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), + if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; @@ -867,17 +816,16 @@ xfs_bmbt_insrec( #endif memmove(&kp[ptr], &kp[ptr - 1], (numrecs - ptr + 1) * sizeof(*kp)); - memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */ + memmove(&pp[ptr], &pp[ptr - 1], (numrecs - ptr + 1) * sizeof(*pp)); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop, - level))) { + if ((error = xfs_btree_check_lptr(cur, *bnop, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif kp[ptr - 1] = key; - INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); + pp[ptr - 1] = cpu_to_be64(*bnop); numrecs++; block->bb_numrecs = cpu_to_be16(numrecs); xfs_bmbt_log_keys(cur, bp, ptr, numrecs); @@ -928,9 +876,6 @@ xfs_bmbt_killroot( #ifdef DEBUG int error; #endif -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_killroot"; -#endif int i; xfs_bmbt_key_t *kp; xfs_inode_t *ip; @@ -979,7 +924,7 @@ xfs_bmbt_killroot( xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork); block = ifp->if_broot; } - be16_add(&block->bb_numrecs, i); + be16_add_cpu(&block->bb_numrecs, i); ASSERT(block->bb_numrecs == cblock->bb_numrecs); kp = XFS_BMAP_KEY_IADDR(block, 1, cur); ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); @@ -988,7 +933,7 @@ xfs_bmbt_killroot( cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); #ifdef DEBUG for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) { + if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1002,7 +947,7 @@ xfs_bmbt_killroot( XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(cur->bc_tp, cbp); cur->bc_bufs[level - 1] = NULL; - be16_add(&block->bb_level, -1); + be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); cur->bc_nlevels--; @@ -1020,9 +965,6 @@ xfs_bmbt_log_keys( int kfirst, int klast) { -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_keys"; -#endif xfs_trans_t *tp; XFS_BMBT_TRACE_CURSOR(cur, ENTRY); @@ -1059,9 +1001,6 @@ xfs_bmbt_log_ptrs( int pfirst, int plast) { -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_ptrs"; -#endif xfs_trans_t *tp; XFS_BMBT_TRACE_CURSOR(cur, ENTRY); @@ -1102,9 +1041,6 @@ xfs_bmbt_lookup( xfs_daddr_t d; xfs_sfiloff_t diff; int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_lookup"; -#endif xfs_fsblock_t fsbno=0; int high; int i; @@ -1132,7 +1068,7 @@ xfs_bmbt_lookup( d = XFS_FSB_TO_DADDR(mp, fsbno); bp = cur->bc_bufs[level]; if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; + bp = NULL; if (!bp) { if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, XFS_BMAP_BTREE_REF))) { @@ -1170,7 +1106,7 @@ xfs_bmbt_lookup( keyno = (low + high) >> 1; if (level > 0) { kkp = kkbase + keyno - 1; - startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT); + startoff = be64_to_cpu(kkp->br_startoff); } else { krp = krbase + keyno - 1; startoff = xfs_bmbt_disk_get_startoff(krp); @@ -1189,13 +1125,13 @@ xfs_bmbt_lookup( if (diff > 0 && --keyno < 1) keyno = 1; pp = XFS_BMAP_PTR_IADDR(block, keyno, cur); + fsbno = be64_to_cpu(*pp); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr(cur, fsbno, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif - fsbno = INT_GET(*pp, ARCH_CONVERT); cur->bc_ptrs[level] = keyno; } } @@ -1242,9 +1178,6 @@ xfs_bmbt_lshift( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_lshift"; -#endif #ifdef DEBUG int i; /* loop counter */ #endif @@ -1313,12 +1246,12 @@ xfs_bmbt_lshift( lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur); rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif - *lpp = *rpp; /* INT_: direct copy */ + *lpp = *rpp; xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs); } else { lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur); @@ -1340,7 +1273,7 @@ xfs_bmbt_lshift( if (level > 0) { #ifdef DEBUG for (i = 0; i < rrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), + if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; @@ -1354,8 +1287,7 @@ xfs_bmbt_lshift( } else { memmove(rrp, rrp + 1, rrecs * sizeof(*rrp)); xfs_bmbt_log_recs(cur, rbp, 1, rrecs); - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(rrp)); + key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp)); rkp = &key; } if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) { @@ -1379,9 +1311,6 @@ xfs_bmbt_rshift( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_rshift"; -#endif int i; /* loop counter */ xfs_bmbt_key_t key; /* bmap btree key */ xfs_buf_t *lbp; /* left buffer pointer */ @@ -1445,7 +1374,7 @@ xfs_bmbt_rshift( rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1454,13 +1383,13 @@ xfs_bmbt_rshift( memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif *rkp = *lkp; - *rpp = *lpp; /* INT_: direct copy */ + *rpp = *lpp; xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); } else { @@ -1469,13 +1398,12 @@ xfs_bmbt_rshift( memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); *rrp = *lrp; xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(rrp)); + key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp)); rkp = &key; } - be16_add(&left->bb_numrecs, -1); + be16_add_cpu(&left->bb_numrecs, -1); xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, 1); + be16_add_cpu(&right->bb_numrecs, 1); #ifdef DEBUG if (level > 0) xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1); @@ -1535,15 +1463,12 @@ xfs_bmbt_split( xfs_btree_cur_t *cur, int level, xfs_fsblock_t *bnop, - xfs_bmbt_key_t *keyp, + __uint64_t *startoff, xfs_btree_cur_t **curp, int *stat) /* success/failure */ { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_split"; -#endif int i; /* loop counter */ xfs_fsblock_t lbno; /* left sibling block number */ xfs_buf_t *lbp; /* left buffer pointer */ @@ -1560,7 +1485,7 @@ xfs_bmbt_split( xfs_bmbt_rec_t *rrp; /* right record pointer */ XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp); + XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff); args.tp = cur->bc_tp; args.mp = cur->bc_mp; lbp = cur->bc_bufs[level]; @@ -1568,12 +1493,27 @@ xfs_bmbt_split( left = XFS_BUF_TO_BMBT_BLOCK(lbp); args.fsbno = cur->bc_private.b.firstblock; args.firstblock = args.fsbno; + args.minleft = 0; if (args.fsbno == NULLFSBLOCK) { args.fsbno = lbno; args.type = XFS_ALLOCTYPE_START_BNO; - } else + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. If + * we are converting the middle part of an extent then + * we may need space for two tree splits. + * + * We are relying on the caller to make the correct block + * reservation for this operation to succeed. If the + * reservation amount is insufficient then we may fail a + * block allocation here and corrupt the filesystem. + */ + args.minleft = xfs_trans_get_block_res(args.tp); + } else if (cur->bc_private.b.flist->xbf_low) + args.type = XFS_ALLOCTYPE_START_BNO; + else args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.minleft = args.alignment = args.total = args.isfl = + args.mod = args.alignment = args.total = args.isfl = args.userdata = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; @@ -1585,6 +1525,21 @@ xfs_bmbt_split( XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } + if (args.fsbno == NULLFSBLOCK && args.minleft) { + /* + * Could not find an AG with enough free space to satisfy + * a full btree split. Try again without minleft and if + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_private.b.flist->xbf_low = 1; + } if (args.fsbno == NULLFSBLOCK) { XFS_BMBT_TRACE_CURSOR(cur, EXIT); *stat = 0; @@ -1610,7 +1565,7 @@ xfs_bmbt_split( right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2); if ((be16_to_cpu(left->bb_numrecs) & 1) && cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add(&right->bb_numrecs, 1); + be16_add_cpu(&right->bb_numrecs, 1); i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; if (level > 0) { lkp = XFS_BMAP_KEY_IADDR(left, i, cur); @@ -1619,7 +1574,7 @@ xfs_bmbt_split( rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1629,15 +1584,15 @@ xfs_bmbt_split( memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT); + *startoff = be64_to_cpu(rkp->br_startoff); } else { lrp = XFS_BMAP_REC_IADDR(left, i, cur); rrp = XFS_BMAP_REC_IADDR(right, 1, cur); memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp); + *startoff = xfs_bmbt_disk_get_startoff(rrp); } - be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); + be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); right->bb_rightsib = left->bb_rightsib; left->bb_rightsib = cpu_to_be64(args.fsbno); right->bb_leftsib = cpu_to_be64(lbno); @@ -1690,9 +1645,6 @@ xfs_bmbt_updkey( #ifdef DEBUG int error; #endif -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_updkey"; -#endif xfs_bmbt_key_t *kp; int ptr; @@ -1728,9 +1680,9 @@ xfs_bmdr_to_bmbt( { int dmxr; xfs_bmbt_key_t *fkp; - xfs_bmbt_ptr_t *fpp; + __be64 *fpp; xfs_bmbt_key_t *tkp; - xfs_bmbt_ptr_t *tpp; + __be64 *tpp; rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); rblock->bb_level = dblock->bb_level; @@ -1739,13 +1691,13 @@ xfs_bmdr_to_bmbt( rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO); rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO); dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); - fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); - memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* @@ -1761,9 +1713,6 @@ xfs_bmbt_decrement( xfs_bmbt_block_t *block; xfs_buf_t *bp; int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_decrement"; -#endif xfs_fsblock_t fsbno; int lev; xfs_mount_t *mp; @@ -1805,7 +1754,7 @@ xfs_bmbt_decrement( tp = cur->bc_tp; mp = cur->bc_mp; for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, XFS_BMAP_BTREE_REF))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); @@ -1834,9 +1783,6 @@ xfs_bmbt_delete( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_delete"; -#endif int i; int level; @@ -1870,7 +1816,7 @@ xfs_bmbt_delete( * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. */ -STATIC __inline__ void +STATIC_INLINE void __xfs_bmbt_get_all( __uint64_t l0, __uint64_t l1, @@ -1911,7 +1857,7 @@ __xfs_bmbt_get_all( void xfs_bmbt_get_all( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s) { __xfs_bmbt_get_all(r->l0, r->l1, s); @@ -1947,7 +1893,7 @@ xfs_bmbt_get_block( */ xfs_filblks_t xfs_bmbt_get_blockcount( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21)); } @@ -1957,7 +1903,7 @@ xfs_bmbt_get_blockcount( */ xfs_fsblock_t xfs_bmbt_get_startblock( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { #if XFS_BIG_BLKNOS return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) | @@ -1981,7 +1927,7 @@ xfs_bmbt_get_startblock( */ xfs_fileoff_t xfs_bmbt_get_startoff( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { return ((xfs_fileoff_t)r->l0 & XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; @@ -1989,7 +1935,7 @@ xfs_bmbt_get_startoff( xfs_exntst_t xfs_bmbt_get_state( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { int ext_flag; @@ -1998,19 +1944,13 @@ xfs_bmbt_get_state( ext_flag); } -#ifndef XFS_NATIVE_HOST /* Endian flipping versions of the bmbt extraction functions */ void xfs_bmbt_disk_get_all( xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s) { - __uint64_t l0, l1; - - l0 = INT_GET(r->l0, ARCH_CONVERT); - l1 = INT_GET(r->l1, ARCH_CONVERT); - - __xfs_bmbt_get_all(l0, l1, s); + __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s); } /* @@ -2020,31 +1960,7 @@ xfs_filblks_t xfs_bmbt_disk_get_blockcount( xfs_bmbt_rec_t *r) { - return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21)); -} - -/* - * Extract the startblock field from an on disk bmap extent record. - */ -xfs_fsblock_t -xfs_bmbt_disk_get_startblock( - xfs_bmbt_rec_t *r) -{ -#if XFS_BIG_BLKNOS - return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | - (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); -#else -#ifdef DEBUG - xfs_dfsbno_t b; - - b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | - (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); - return (xfs_fsblock_t)b; -#else /* !DEBUG */ - return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); -#endif /* DEBUG */ -#endif /* XFS_BIG_BLKNOS */ + return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21)); } /* @@ -2054,23 +1970,10 @@ xfs_fileoff_t xfs_bmbt_disk_get_startoff( xfs_bmbt_rec_t *r) { - return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) & + return ((xfs_fileoff_t)be64_to_cpu(r->l0) & XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; } -xfs_exntst_t -xfs_bmbt_disk_get_state( - xfs_bmbt_rec_t *r) -{ - int ext_flag; - - ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN)); - return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r), - ext_flag); -} -#endif /* XFS_NATIVE_HOST */ - - /* * Increment cursor by one record at the level. * For nonzero levels the leaf-ward information is untouched. @@ -2084,9 +1987,6 @@ xfs_bmbt_increment( xfs_bmbt_block_t *block; xfs_buf_t *bp; int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_increment"; -#endif xfs_fsblock_t fsbno; int lev; xfs_mount_t *mp; @@ -2135,7 +2035,7 @@ xfs_bmbt_increment( tp = cur->bc_tp; mp = cur->bc_mp; for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, XFS_BMAP_BTREE_REF))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); @@ -2157,6 +2057,10 @@ xfs_bmbt_increment( /* * Insert the current record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. */ int /* error */ xfs_bmbt_insert( @@ -2164,9 +2068,6 @@ xfs_bmbt_insert( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_insert"; -#endif int i; int level; xfs_fsblock_t nbno; @@ -2178,7 +2079,7 @@ xfs_bmbt_insert( level = 0; nbno = NULLFSBLOCK; xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; pcur = cur; do { if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur, @@ -2195,8 +2096,7 @@ xfs_bmbt_insert( pcur->bc_private.b.allocated; pcur->bc_private.b.allocated = 0; ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) || - (cur->bc_private.b.ip->i_d.di_flags & - XFS_DIFLAG_REALTIME)); + XFS_IS_REALTIME_INODE(cur->bc_private.b.ip)); cur->bc_private.b.firstblock = pcur->bc_private.b.firstblock; ASSERT(cur->bc_private.b.flist == @@ -2205,7 +2105,7 @@ xfs_bmbt_insert( } if (ncur) { pcur = ncur; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; } } while (nbno != NULLFSBLOCK); XFS_BMBT_TRACE_CURSOR(cur, EXIT); @@ -2226,9 +2126,6 @@ xfs_bmbt_log_block( int fields) { int first; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_block"; -#endif int last; xfs_trans_t *tp; static const short offsets[] = { @@ -2265,9 +2162,6 @@ xfs_bmbt_log_recs( { xfs_bmbt_block_t *block; int first; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_recs"; -#endif int last; xfs_bmbt_rec_t *rp; xfs_trans_t *tp; @@ -2329,9 +2223,6 @@ xfs_bmbt_newroot( xfs_bmbt_key_t *ckp; /* child key pointer */ xfs_bmbt_ptr_t *cpp; /* child ptr pointer */ int error; /* error return code */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_newroot"; -#endif #ifdef DEBUG int i; /* loop counter */ #endif @@ -2356,14 +2247,16 @@ xfs_bmbt_newroot( args.firstblock = args.fsbno; if (args.fsbno == NULLFSBLOCK) { #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif - args.fsbno = INT_GET(*pp, ARCH_CONVERT); + args.fsbno = be64_to_cpu(*pp); args.type = XFS_ALLOCTYPE_START_BNO; - } else + } else if (cur->bc_private.b.flist->xbf_low) + args.type = XFS_ALLOCTYPE_START_BNO; + else args.type = XFS_ALLOCTYPE_NEAR_BNO; if ((error = xfs_alloc_vextent(&args))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); @@ -2383,7 +2276,7 @@ xfs_bmbt_newroot( bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0); cblock = XFS_BUF_TO_BMBT_BLOCK(bp); *cblock = *block; - be16_add(&block->bb_level, 1); + be16_add_cpu(&block->bb_level, 1); block->bb_numrecs = cpu_to_be16(1); cur->bc_nlevels++; cur->bc_ptrs[level + 1] = 1; @@ -2393,7 +2286,7 @@ xfs_bmbt_newroot( cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); #ifdef DEBUG for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -2401,13 +2294,12 @@ xfs_bmbt_newroot( #endif memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp)); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno, - level))) { + if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif - INT_SET(*pp, ARCH_CONVERT, args.fsbno); + *pp = cpu_to_be64(args.fsbno); xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs), cur->bc_private.b.whichfork); xfs_btree_setbuf(cur, level, bp); @@ -2426,185 +2318,131 @@ xfs_bmbt_newroot( } /* - * Set all the fields in a bmap extent record from the uncompressed form. - */ -void -xfs_bmbt_set_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - int extent_flag; - - ASSERT((s->br_state == XFS_EXT_NORM) || - (s->br_state == XFS_EXT_UNWRITTEN)); - extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1; - ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0); - ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0); -#if XFS_BIG_BLKNOS - ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0); - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - ((xfs_bmbt_rec_base_t)s->br_startblock >> 43); - r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); -#else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(s->br_startblock)) { - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); - r->l1 = XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); - } else { - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9); - r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); - } -#endif /* XFS_BIG_BLKNOS */ -} - -/* * Set all the fields in a bmap extent record from the arguments. */ void xfs_bmbt_set_allf( - xfs_bmbt_rec_t *r, - xfs_fileoff_t o, - xfs_fsblock_t b, - xfs_filblks_t c, - xfs_exntst_t v) + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) { - int extent_flag; + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN)); - extent_flag = (v == XFS_EXT_NORM) ? 0 : 1; - ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); #if XFS_BIG_BLKNOS - ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); + ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | - ((xfs_bmbt_rec_base_t)b >> 43); - r->l1 = ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(b)) { + if (ISNULLSTARTBLOCK(startblock)) { r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | + ((xfs_bmbt_rec_base_t)startoff << 9) | (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); r->l1 = XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); } else { r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9); - r->l1 = ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & + ((xfs_bmbt_rec_base_t)startoff << 9); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); } #endif /* XFS_BIG_BLKNOS */ } -#ifndef XFS_NATIVE_HOST /* * Set all the fields in a bmap extent record from the uncompressed form. */ void -xfs_bmbt_disk_set_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) +xfs_bmbt_set_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) { - int extent_flag; - - ASSERT((s->br_state == XFS_EXT_NORM) || - (s->br_state == XFS_EXT_UNWRITTEN)); - extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1; - ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0); - ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0); -#if XFS_BIG_BLKNOS - ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0); - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - ((xfs_bmbt_rec_base_t)s->br_startblock >> 43)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); -#else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(s->br_startblock)) { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); - INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); - } else { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); - } -#endif /* XFS_BIG_BLKNOS */ + xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); } + /* * Set all the fields in a disk format bmap extent record from the arguments. */ void xfs_bmbt_disk_set_allf( - xfs_bmbt_rec_t *r, - xfs_fileoff_t o, - xfs_fsblock_t b, - xfs_filblks_t c, - xfs_exntst_t v) + xfs_bmbt_rec_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) { - int extent_flag; + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN)); - extent_flag = (v == XFS_EXT_NORM) ? 0 : 1; - ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); #if XFS_BIG_BLKNOS - ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | - ((xfs_bmbt_rec_base_t)b >> 43)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(b)) { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | - (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); - INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & + if (ISNULLSTARTBLOCK(startblock)) { + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); + r->l1 = cpu_to_be64(XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); } else { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); } #endif /* XFS_BIG_BLKNOS */ } -#endif /* XFS_NATIVE_HOST */ + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_disk_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} /* * Set the blockcount field in a bmap extent record. */ void xfs_bmbt_set_blockcount( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_filblks_t v) { ASSERT((v & XFS_MASK64HI(43)) == 0); @@ -2617,7 +2455,7 @@ xfs_bmbt_set_blockcount( */ void xfs_bmbt_set_startblock( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_fsblock_t v) { #if XFS_BIG_BLKNOS @@ -2645,7 +2483,7 @@ xfs_bmbt_set_startblock( */ void xfs_bmbt_set_startoff( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_fileoff_t v) { ASSERT((v & XFS_MASK64HI(9)) == 0); @@ -2659,7 +2497,7 @@ xfs_bmbt_set_startoff( */ void xfs_bmbt_set_state( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_exntst_t v) { ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); @@ -2681,9 +2519,9 @@ xfs_bmbt_to_bmdr( { int dmxr; xfs_bmbt_key_t *fkp; - xfs_bmbt_ptr_t *fpp; + __be64 *fpp; xfs_bmbt_key_t *tkp; - xfs_bmbt_ptr_t *tpp; + __be64 *tpp; ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO); @@ -2693,12 +2531,12 @@ xfs_bmbt_to_bmdr( dblock->bb_numrecs = rblock->bb_numrecs; dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); - tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); - memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* @@ -2715,9 +2553,6 @@ xfs_bmbt_update( xfs_bmbt_block_t *block; xfs_buf_t *bp; int error; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_update"; -#endif xfs_bmbt_key_t key; int ptr; xfs_bmbt_rec_t *rp; @@ -2740,7 +2575,7 @@ xfs_bmbt_update( XFS_BMBT_TRACE_CURSOR(cur, EXIT); return 0; } - INT_SET(key.br_startoff, ARCH_CONVERT, off); + key.br_startoff = cpu_to_be64(off); if ((error = xfs_bmbt_updkey(cur, &key, 1))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; @@ -2763,10 +2598,8 @@ xfs_check_nostate_extents( xfs_extnum_t idx, xfs_extnum_t num) { - xfs_bmbt_rec_t *ep; - for (; num > 0; num--, idx++) { - ep = xfs_iext_get_ext(ifp, idx); + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); if ((ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { ASSERT(0); diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 6478cfa..cd0d4b4 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -35,45 +35,16 @@ typedef struct xfs_bmdr_block { /* * Bmap btree record and extent descriptor. - * For 32-bit kernels, - * l0:31 is an extent flag (value 1 indicates non-normal). - * l0:0-30 and l1:9-31 are startoff. - * l1:0-8, l2:0-31, and l3:21-31 are startblock. - * l3:0-20 are blockcount. - * For 64-bit kernels, * l0:63 is an extent flag (value 1 indicates non-normal). * l0:9-62 are startoff. * l0:0-8 and l1:21-63 are startblock. * l1:0-20 are blockcount. */ - -#ifndef XFS_NATIVE_HOST - -#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ -#define BMBT_EXNTFLAG_BITOFF 0 -#define BMBT_EXNTFLAG_BITLEN 1 -#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN) -#define BMBT_STARTOFF_BITLEN 54 -#define BMBT_STARTBLOCK_BITOFF (BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN) -#define BMBT_STARTBLOCK_BITLEN 52 -#define BMBT_BLOCKCOUNT_BITOFF \ - (BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN) -#define BMBT_BLOCKCOUNT_BITLEN (BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF) - -#else - -#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ -#define BMBT_EXNTFLAG_BITOFF 63 #define BMBT_EXNTFLAG_BITLEN 1 -#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN) #define BMBT_STARTOFF_BITLEN 54 -#define BMBT_STARTBLOCK_BITOFF 85 /* 128 - 43 (other 9 is in first word) */ #define BMBT_STARTBLOCK_BITLEN 52 -#define BMBT_BLOCKCOUNT_BITOFF 64 /* Start of second 64 bit container */ #define BMBT_BLOCKCOUNT_BITLEN 21 -#endif /* XFS_NATIVE_HOST */ - #define BMBT_USE_64 1 @@ -83,12 +54,16 @@ typedef struct xfs_bmbt_rec_32 } xfs_bmbt_rec_32_t; typedef struct xfs_bmbt_rec_64 { - __uint64_t l0, l1; + __be64 l0, l1; } xfs_bmbt_rec_64_t; typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + /* * Values and macros for delayed-allocation startblock fields. */ @@ -145,7 +120,7 @@ typedef enum { * Extent state and extent format macros. */ #define XFS_EXTFMT_INODE(x) \ - (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \ + (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) #define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) @@ -163,30 +138,23 @@ typedef struct xfs_bmbt_irec /* * Key structure for non-leaf levels of the tree. */ -typedef struct xfs_bmbt_key -{ - xfs_dfiloff_t br_startoff; /* starting file offset */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ } xfs_bmbt_key_t, xfs_bmdr_key_t; -typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */ - /* btree block header type */ +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + +/* btree block header type */ typedef struct xfs_btree_lblock xfs_bmbt_block_t; #define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) -#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) #define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize) #define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \ ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \ (cur)->bc_private.b.whichfork)->if_broot_bytes) -#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BMAP_RBLOCK_DSIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur))) -#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BMAP_RBLOCK_ISIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur))) - #define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \ (((lev) == (cur)->bc_nlevels - 1 ? \ XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ @@ -209,37 +177,21 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t; xfs_bmbt, (lev) == 0) : \ ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) -#define XFS_BMAP_REC_DADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) -#define XFS_BMAP_REC_IADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) +#define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) + +#define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) #define XFS_BMAP_KEY_DADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) + (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) + #define XFS_BMAP_KEY_IADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) + (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) #define XFS_BMAP_PTR_DADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ + (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ be16_to_cpu((bb)->bb_level), cur))) #define XFS_BMAP_PTR_IADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ + (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ be16_to_cpu((bb)->bb_level), cur))) /* @@ -247,11 +199,11 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t; * we don't have a cursor. */ #define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \ - (XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) + (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i)) #define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \ - (XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) + (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i)) #define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \ - (XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) + (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) #define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs) #define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0) @@ -304,27 +256,17 @@ extern ktrace_t *xfs_bmbt_trace_buf; extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int); extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *); extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *); -extern void xfs_bmbt_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); +extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur, int, struct xfs_buf **bpp); -extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_t *r); -extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_t *r); -extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_t *r); +extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); +extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); +extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); +extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); -#ifndef XFS_NATIVE_HOST extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); -extern xfs_exntst_t xfs_bmbt_disk_get_state(xfs_bmbt_rec_t *r); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fsblock_t xfs_bmbt_disk_get_startblock(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); -#else -#define xfs_bmbt_disk_get_all(r, s) xfs_bmbt_get_all(r, s) -#define xfs_bmbt_disk_get_state(r) xfs_bmbt_get_state(r) -#define xfs_bmbt_disk_get_blockcount(r) xfs_bmbt_get_blockcount(r) -#define xfs_bmbt_disk_get_startblock(r) xfs_bmbt_get_blockcount(r) -#define xfs_bmbt_disk_get_startoff(r) xfs_bmbt_get_startoff(r) -#endif /* XFS_NATIVE_HOST */ extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *); extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *); @@ -342,36 +284,22 @@ extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t, */ extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat); -extern void xfs_bmbt_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); -extern void xfs_bmbt_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, +extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); -extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_t *r, xfs_filblks_t v); -extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_t *r, xfs_fsblock_t v); -extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_t *r, xfs_fileoff_t v); -extern void xfs_bmbt_set_state(xfs_bmbt_rec_t *r, xfs_exntst_t v); +extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v); +extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); +extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); +extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); -#ifndef XFS_NATIVE_HOST extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); -#else -#define xfs_bmbt_disk_set_all(r, s) xfs_bmbt_set_all(r, s) -#define xfs_bmbt_disk_set_allf(r, o, b, c, v) xfs_bmbt_set_allf(r, o, b, c, v) -#endif /* XFS_NATIVE_HOST */ extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int); extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t, xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t); -#ifdef DEBUG -/* - * Get the data from the pointed-to record. - */ -extern int xfs_bmbt_get_rec(struct xfs_btree_cur *, xfs_fileoff_t *, - xfs_fsblock_t *, xfs_filblks_t *, - xfs_exntst_t *, int *); -#endif - #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index ee2255b..cc593a8 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -46,38 +46,11 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = -{ +const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }; /* - * Prototypes for internal routines. - */ - -/* - * Checking routine: return maxrecs for the block. - */ -STATIC int /* number of records fitting in block */ -xfs_btree_maxrecs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block);/* generic btree block pointer */ - -/* - * Internal routines. - */ - -/* - * Retrieve the block pointer from the cursor at the given level. - * This may be a bmap btree root or from a buffer. - */ -STATIC xfs_btree_block_t * /* generic btree block pointer */ -xfs_btree_get_block( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree */ - struct xfs_buf **bpp); /* buffer containing the block */ - -/* * Checking routine: return maxrecs for the block. */ STATIC int /* number of records fitting in block */ @@ -161,7 +134,7 @@ xfs_btree_check_key( k1 = ak1; k2 = ak2; - ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT)); + ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff)); break; } case XFS_BTNUM_INO: { @@ -170,7 +143,7 @@ xfs_btree_check_key( k1 = ak1; k2 = ak2; - ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT)); + ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino)); break; } default: @@ -285,8 +258,8 @@ xfs_btree_check_rec( r1 = ar1; r2 = ar2; - ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <= - INT_GET(r2->ir_startino, ARCH_CONVERT)); + ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->ir_startino)); break; } default: @@ -457,35 +430,6 @@ xfs_btree_dup_cursor( } /* - * Change the cursor to point to the first record at the given level. - * Other levels are unaffected. - */ -int /* success=1, failure=0 */ -xfs_btree_firstrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level) /* level to change */ -{ - xfs_btree_block_t *block; /* generic btree block pointer */ - xfs_buf_t *bp; /* buffer containing block */ - - /* - * Get the block pointer for this level. - */ - block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); - /* - * It's empty, there is no such record. - */ - if (!block->bb_h.bb_numrecs) - return 0; - /* - * Set the ptr value to 1, that's the first record/key. - */ - cur->bc_ptrs[level] = 1; - return 1; -} - -/* * Retrieve the block pointer from the cursor at the given level. * This may be a bmap btree root or from a buffer. */ @@ -626,6 +570,13 @@ xfs_btree_init_cursor( cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; break; + case XFS_BTNUM_INO: + /* + * Inode allocation btree fields. + */ + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + break; case XFS_BTNUM_BMAP: /* * Bmap btree fields. @@ -638,13 +589,6 @@ xfs_btree_init_cursor( cur->bc_private.b.flags = 0; cur->bc_private.b.whichfork = whichfork; break; - case XFS_BTNUM_INO: - /* - * Inode allocation btree fields. - */ - cur->bc_private.i.agbp = agbp; - cur->bc_private.i.agno = agno; - break; default: ASSERT(0); } @@ -671,6 +615,35 @@ xfs_btree_islastblock( } /* + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + xfs_btree_block_t *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_h.bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; +} + +/* * Change the cursor to point to the last record in the current block * at the given level. Other levels are unaffected. */ @@ -890,12 +863,12 @@ xfs_btree_readahead_core( case XFS_BTNUM_INO: i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, be32_to_cpu(i->bb_leftsib), 1); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, be32_to_cpu(i->bb_rightsib), 1); rval++; } diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 44f1bd9..1f528a2 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -24,6 +24,8 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; +extern kmem_zone_t *xfs_btree_cur_zone; + /* * This nonsense is to make -wlint happy. */ @@ -122,13 +124,13 @@ extern const __uint32_t xfs_magics[]; * Given block size, type prefix, block pointer, and index of requested entry * (first entry numbered 1). */ -#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \ +#define XFS_BTREE_REC_ADDR(t,bb,i) \ ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ ((i) - 1) * sizeof(t ## _rec_t))) -#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \ +#define XFS_BTREE_KEY_ADDR(t,bb,i) \ ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ ((i) - 1) * sizeof(t ## _key_t))) -#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \ +#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \ ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) @@ -145,7 +147,7 @@ typedef struct xfs_btree_cur union { xfs_alloc_rec_incore_t a; xfs_bmbt_irec_t b; - xfs_inobt_rec_t i; + xfs_inobt_rec_incore_t i; } bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ @@ -156,8 +158,8 @@ typedef struct xfs_btree_cur __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ union { - struct { /* needed for BNO, CNT */ - struct xfs_buf *agbp; /* agf buffer pointer */ + struct { /* needed for BNO, CNT, INO */ + struct xfs_buf *agbp; /* agf/agi buffer pointer */ xfs_agnumber_t agno; /* ag number */ } a; struct { /* needed for BMAP */ @@ -170,10 +172,6 @@ typedef struct xfs_btree_cur char flags; /* flags */ #define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ } b; - struct { /* needed for INO */ - struct xfs_buf *agbp; /* agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - } i; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; @@ -243,6 +241,9 @@ xfs_btree_check_lptr( xfs_dfsbno_t ptr, /* btree block disk address */ int level); /* btree block level */ +#define xfs_btree_check_lptr_disk(cur, ptr, level) \ + xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level) + /* * Checking routine: check that short form block header is ok. */ @@ -441,30 +442,14 @@ xfs_btree_setbuf( /* * Min and max functions for extlen, agblock, fileoff, and filblks types. */ -#define XFS_EXTLEN_MIN(a,b) \ - ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \ - (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) -#define XFS_EXTLEN_MAX(a,b) \ - ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \ - (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) -#define XFS_AGBLOCK_MIN(a,b) \ - ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \ - (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) -#define XFS_AGBLOCK_MAX(a,b) \ - ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \ - (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) -#define XFS_FILEOFF_MIN(a,b) \ - ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \ - (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) -#define XFS_FILEOFF_MAX(a,b) \ - ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \ - (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) -#define XFS_FILBLKS_MIN(a,b) \ - ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \ - (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) -#define XFS_FILBLKS_MAX(a,b) \ - ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \ - (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) +#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) +#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) +#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) +#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) +#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) +#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a4aa539..002fc26 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -23,6 +23,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_buf_item.h" @@ -234,7 +235,6 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); bp = bip->bli_buf; - ASSERT(XFS_BUF_BP_ISMAPPED(bp)); vecp = log_vector; /* @@ -378,7 +378,6 @@ xfs_buf_item_unpin( xfs_mount_t *mp; xfs_buf_t *bp; int freed; - SPLDECL(s); bp = bip->bli_buf; ASSERT(bp != NULL); @@ -409,8 +408,8 @@ xfs_buf_item_unpin( XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); } else { - AIL_LOCK(mp,s); - xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); + spin_lock(&mp->m_ail_lock); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); xfs_buf_item_relse(bp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); } @@ -581,8 +580,8 @@ xfs_buf_item_unlock( * If the buf item isn't tracking any data, free it. * Otherwise, if XFS_BLI_HOLD is set clear it. */ - if (xfs_count_bits(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0) == 0) { + if (xfs_bitmap_empty(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size)) { xfs_buf_item_relse(bp); } else if (hold) { bip->bli_flags &= ~XFS_BLI_HOLD; @@ -628,25 +627,6 @@ xfs_buf_item_committed( } /* - * This is called when the transaction holding the buffer is aborted. - * Just behave as if the transaction had been cancelled. If we're shutting down - * and have aborted this transaction, we'll trap this buffer when it tries to - * get written out. - */ -STATIC void -xfs_buf_item_abort( - xfs_buf_log_item_t *bip) -{ - xfs_buf_t *bp; - - bp = bip->bli_buf; - xfs_buftrace("XFS_ABORT", bp); - XFS_BUF_SUPER_STALE(bp); - xfs_buf_item_unlock(bip); - return; -} - -/* * This is called to asynchronously write the buffer associated with this * buf log item out to disk. The buffer will already have been locked by * a successful call to xfs_buf_item_trylock(). If the buffer still has @@ -665,7 +645,12 @@ xfs_buf_item_push( bp = bip->bli_buf; if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_bawrite(bip->bli_item.li_mountp, bp); + int error; + error = xfs_bawrite(bip->bli_item.li_mountp, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, + "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", + error, bip, bp); } else { xfs_buf_relse(bp); } @@ -680,7 +665,7 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) /* * This is the ops vector shared by all buf log items. */ -STATIC struct xfs_item_ops xfs_buf_item_ops = { +static struct xfs_item_ops xfs_buf_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_buf_item_format, @@ -693,7 +678,6 @@ STATIC struct xfs_item_ops xfs_buf_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committing @@ -748,12 +732,13 @@ xfs_buf_item_init( bip->bli_item.li_ops = &xfs_buf_item_ops; bip->bli_item.li_mountp = mp; bip->bli_buf = bp; + xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); bip->bli_format.blf_map_size = map_size; #ifdef XFS_BLI_TRACE - bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); + bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS); #endif #ifdef XFS_TRANS_DEBUG @@ -883,6 +868,21 @@ xfs_buf_item_dirty( return (bip->bli_flags & XFS_BLI_DIRTY); } +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + /* * This is called when the buf log item is no longer needed. It should * free the buf log item associated with the given buffer and clear @@ -901,21 +901,10 @@ xfs_buf_item_relse( XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && (XFS_BUF_IODONE_FUNC(bp) != NULL)) { - ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0); XFS_BUF_CLR_IODONE_FUNC(bp); } - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_rele(bp); + xfs_buf_item_free(bip); } @@ -1073,7 +1062,7 @@ xfs_buf_iodone_callbacks( anyway. */ XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); XFS_BUF_DONE(bp); - XFS_BUF_V_IODONESEMA(bp); + XFS_BUF_FINISH_IOWAIT(bp); } return; } @@ -1134,10 +1123,10 @@ xfs_buf_iodone( xfs_buf_log_item_t *bip) { struct xfs_mount *mp; - SPLDECL(s); ASSERT(bip->bli_buf == bp); + xfs_buf_rele(bp); mp = bip->bli_item.li_mountp; /* @@ -1149,23 +1138,12 @@ xfs_buf_iodone( * * Either way, AIL is useless if we're forcing a shutdown. */ - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); /* * xfs_trans_delete_ail() drops the AIL lock. */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); + xfs_buf_item_free(bip); } #if defined(XFS_BLI_TRACE) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 07c708c..5a41c34 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -18,26 +18,12 @@ #ifndef __XFS_BUF_ITEM_H__ #define __XFS_BUF_ITEM_H__ +extern kmem_zone_t *xfs_buf_item_zone; + /* * This is the structure used to lay out a buf log item in the * log. The data map describes which 128 byte chunks of the buffer - * have been logged. This structure works only on buffers that - * reside up to the first TB in the filesystem. These buffers are - * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF. - */ -typedef struct xfs_buf_log_format_v1 { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - __int32_t blf_blkno; /* starting blkno of this buf */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - unsigned int blf_map_size; /* size of data bitmap in words */ - unsigned int blf_data_map[1];/* variable size bitmap of */ - /* regions of buffer in this item */ -} xfs_buf_log_format_v1_t; - -/* - * This is a form of the above structure with a 64 bit blkno field. + * have been logged. * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. */ typedef struct xfs_buf_log_format_t { diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h deleted file mode 100644 index 7a0e482..0000000 --- a/fs/xfs/xfs_cap.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CAP_H__ -#define __XFS_CAP_H__ - -/* - * Capabilities - */ -typedef __uint64_t xfs_cap_value_t; - -typedef struct xfs_cap_set { - xfs_cap_value_t cap_effective; /* use in capability checks */ - xfs_cap_value_t cap_permitted; /* combined with file attrs */ - xfs_cap_value_t cap_inheritable;/* pass through exec */ -} xfs_cap_set_t; - -/* On-disk XFS extended attribute names */ -#define SGI_CAP_FILE "SGI_CAP_FILE" -#define SGI_CAP_FILE_SIZE (sizeof(SGI_CAP_FILE)-1) -#define SGI_CAP_LINUX "SGI_CAP_LINUX" -#define SGI_CAP_LINUX_SIZE (sizeof(SGI_CAP_LINUX)-1) - -/* - * For Linux, we take the bitfields directly from capability.h - * and no longer attempt to keep this attribute ondisk compatible - * with IRIX. Since this attribute is only set on executables, - * it just doesn't make much sense to try. We do use a different - * named attribute though, to avoid confusion. - */ - -#ifdef __KERNEL__ - -#ifdef CONFIG_FS_POSIX_CAP - -#include <linux/posix_cap_xattr.h> - -struct bhv_vnode; - -extern int xfs_cap_vhascap(struct bhv_vnode *); -extern int xfs_cap_vset(struct bhv_vnode *, void *, size_t); -extern int xfs_cap_vget(struct bhv_vnode *, void *, size_t); -extern int xfs_cap_vremove(struct bhv_vnode *); - -#define _CAP_EXISTS xfs_cap_vhascap - -#else -#define xfs_cap_vset(v,p,sz) (-EOPNOTSUPP) -#define xfs_cap_vget(v,p,sz) (-EOPNOTSUPP) -#define xfs_cap_vremove(v) (-EOPNOTSUPP) -#define _CAP_EXISTS (NULL) -#endif - -#endif /* __KERNEL__ */ - -#endif /* __XFS_CAP_H__ */ diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 5b7eb81..d2ce5dd 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -78,6 +78,7 @@ struct xfs_mount_args { #define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */ #define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */ /* (osyncisdsync is default) */ +#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */ #define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32 * bits of address space */ #define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */ @@ -86,10 +87,9 @@ struct xfs_mount_args { #define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */ #define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */ #define XFSMNT_BARRIER 0x04000000 /* use write barriers */ -#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */ +#define XFSMNT_IKEEP 0x08000000 /* inode cluster delete */ #define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width * allocation */ -#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */ #define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename * symlink,mkdir,rmdir,mknod */ #define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */ @@ -99,5 +99,7 @@ struct xfs_mount_args { */ #define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred * I/O size in stat(2) */ +#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams + * allocator */ #endif /* __XFS_CLNT_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 32ab61d..a11a839 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -511,12 +511,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Move the req'd B-tree elements from high in node1 to * low in node2. */ - be16_add(&node2->hdr.count, count); + be16_add_cpu(&node2->hdr.count, count); tmp = count * (uint)sizeof(xfs_da_node_entry_t); btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; btree_d = &node2->btree[0]; memcpy(btree_d, btree_s, tmp); - be16_add(&node1->hdr.count, -count); + be16_add_cpu(&node1->hdr.count, -count); } else { /* * Move the req'd B-tree elements from low in node2 to @@ -527,7 +527,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, btree_s = &node2->btree[0]; btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; memcpy(btree_d, btree_s, tmp); - be16_add(&node1->hdr.count, count); + be16_add_cpu(&node1->hdr.count, count); xfs_da_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); @@ -539,7 +539,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, btree_s = &node2->btree[count]; btree_d = &node2->btree[0]; memmove(btree_d, btree_s, tmp); - be16_add(&node2->hdr.count, -count); + be16_add_cpu(&node2->hdr.count, -count); } /* @@ -604,7 +604,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, btree->before = cpu_to_be32(newblk->blkno); xfs_da_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); - be16_add(&node->hdr.count, 1); + be16_add_cpu(&node->hdr.count, 1); xfs_da_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); @@ -959,7 +959,7 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); xfs_da_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); - be16_add(&node->hdr.count, -1); + be16_add_cpu(&node->hdr.count, -1); xfs_da_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); @@ -1018,7 +1018,7 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, */ tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); memcpy(btree, &drop_node->btree[0], tmp); - be16_add(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); xfs_da_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, @@ -1054,7 +1054,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) xfs_da_node_entry_t *btree; xfs_dablk_t blkno; int probe, span, max, error, retval; - xfs_dahash_t hashval; + xfs_dahash_t hashval, btreehashval; xfs_da_args_t *args; args = state->args; @@ -1079,30 +1079,31 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) return(error); } curr = blk->bp->data; - ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC || - be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC || - be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC); + blk->magic = be16_to_cpu(curr->magic); + ASSERT(blk->magic == XFS_DA_NODE_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_ATTR_LEAF_MAGIC); /* * Search an intermediate node for a match. */ - blk->magic = be16_to_cpu(curr->magic); if (blk->magic == XFS_DA_NODE_MAGIC) { node = blk->bp->data; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + max = be16_to_cpu(node->hdr.count); + blk->hashval = be32_to_cpu(node->btree[max-1].hashval); /* * Binary search. (note: small blocks will skip loop) */ - max = be16_to_cpu(node->hdr.count); probe = span = max / 2; hashval = args->hashval; for (btree = &node->btree[probe]; span > 4; btree = &node->btree[probe]) { span /= 2; - if (be32_to_cpu(btree->hashval) < hashval) + btreehashval = be32_to_cpu(btree->hashval); + if (btreehashval < hashval) probe += span; - else if (be32_to_cpu(btree->hashval) > hashval) + else if (btreehashval > hashval) probe -= span; else break; @@ -1133,10 +1134,10 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) blk->index = probe; blkno = be32_to_cpu(btree->before); } - } else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) { + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); break; - } else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) { + } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); break; } @@ -1152,11 +1153,13 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { retval = xfs_dir2_leafn_lookup_int(blk->bp, args, &blk->index, state); - } - else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { retval = xfs_attr_leaf_lookup_int(blk->bp, args); blk->index = args->index; args->blkno = blk->blkno; + } else { + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); } if (((retval == ENOENT) || (retval == ENOATTR)) && (blk->hashval == args->hashval)) { @@ -1166,8 +1169,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) return(error); if (retval == 0) { continue; - } - else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { /* path_shift() gives ENOENT */ retval = XFS_ERROR(ENOATTR); } @@ -1429,7 +1431,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, } if (level < 0) { *result = XFS_ERROR(ENOENT); /* we're out of our tree */ - ASSERT(args->oknoent); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); return(0); } @@ -1528,6 +1530,28 @@ xfs_da_hashname(const uchar_t *name, int namelen) } } +enum xfs_dacmp +xfs_da_compname( + struct xfs_da_args *args, + const char *name, + int len) +{ + return (args->namelen == len && memcmp(args->name, name, len) == 0) ? + XFS_CMP_EXACT : XFS_CMP_DIFFERENT; +} + +static xfs_dahash_t +xfs_default_hashname( + struct xfs_name *name) +{ + return xfs_da_hashname(name->name, name->len); +} + +const struct xfs_nameops xfs_default_nameops = { + .hashname = xfs_default_hashname, + .compname = xfs_da_compname +}; + /* * Add a block to the btree ahead of the file. * Return the new block number to the caller. @@ -1542,11 +1566,14 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) int nmap, error, w, count, c, got, i, mapi; xfs_trans_t *tp; xfs_mount_t *mp; + xfs_drfsbno_t nblks; dp = args->dp; mp = dp->i_mount; w = args->whichfork; tp = args->trans; + nblks = dp->i_d.di_nblocks; + /* * For new directories adjust the file offset and block count. */ @@ -1596,7 +1623,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) args->firstblock, args->total, &mapp[mapi], &nmap, args->flist, NULL))) { - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return error; } if (nmap < 1) @@ -1618,11 +1645,13 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != bno + count) { if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return XFS_ERROR(ENOSPC); } if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; *new_blkno = (xfs_dablk_t)bno; return 0; } @@ -1973,7 +2002,6 @@ xfs_da_do_buf( error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); if (unlikely(error == EFSCORRUPTED)) { if (xfs_error_level >= XFS_ERRLEVEL_LOW) { - int i; cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", (long long)bno); cmn_err(CE_ALERT, "dir: inode %lld\n", @@ -2089,10 +2117,10 @@ xfs_da_do_buf( } } if (bplist) { - kmem_free(bplist, sizeof(*bplist) * nmap); + kmem_free(bplist); } if (mapp != &map) { - kmem_free(mapp, sizeof(*mapp) * nfsb); + kmem_free(mapp); } if (bpp) *bpp = rbp; @@ -2101,11 +2129,11 @@ exit1: if (bplist) { for (i = 0; i < nbplist; i++) xfs_trans_brelse(trans, bplist[i]); - kmem_free(bplist, sizeof(*bplist) * nmap); + kmem_free(bplist); } exit0: if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * nfsb); + kmem_free(mapp); if (bpp) *bpp = NULL; return error; @@ -2163,21 +2191,6 @@ xfs_da_reada_buf( return rval; } -/* - * Calculate the number of bits needed to hold i different values. - */ -uint -xfs_da_log2_roundup(uint i) -{ - uint rval; - - for (rval = 0; rval < NBBY * sizeof(i); rval++) { - if ((1 << rval) >= i) - break; - } - return(rval); -} - kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ @@ -2232,7 +2245,7 @@ xfs_da_state_free(xfs_da_state_t *state) #ifdef XFS_DABUF_DEBUG xfs_dabuf_t *xfs_dabuf_global_list; -lock_t xfs_dabuf_global_lock; +static DEFINE_SPINLOCK(xfs_dabuf_global_lock); #endif /* @@ -2278,10 +2291,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) } #ifdef XFS_DABUF_DEBUG { - SPLDECL(s); xfs_dabuf_t *p; - s = mutex_spinlock(&xfs_dabuf_global_lock); + spin_lock(&xfs_dabuf_global_lock); for (p = xfs_dabuf_global_list; p; p = p->next) { ASSERT(p->blkno != dabuf->blkno || p->target != dabuf->target); @@ -2291,7 +2303,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) xfs_dabuf_global_list->prev = dabuf; dabuf->next = xfs_dabuf_global_list; xfs_dabuf_global_list = dabuf; - mutex_spinunlock(&xfs_dabuf_global_lock, s); + spin_unlock(&xfs_dabuf_global_lock); } #endif return dabuf; @@ -2330,26 +2342,24 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf) if (dabuf->dirty) xfs_da_buf_clean(dabuf); if (dabuf->nbuf > 1) - kmem_free(dabuf->data, BBTOB(dabuf->bbcount)); + kmem_free(dabuf->data); #ifdef XFS_DABUF_DEBUG { - SPLDECL(s); - - s = mutex_spinlock(&xfs_dabuf_global_lock); + spin_lock(&xfs_dabuf_global_lock); if (dabuf->prev) dabuf->prev->next = dabuf->next; else xfs_dabuf_global_list = dabuf->next; if (dabuf->next) dabuf->next->prev = dabuf->prev; - mutex_spinunlock(&xfs_dabuf_global_lock, s); + spin_unlock(&xfs_dabuf_global_lock); } memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf)); #endif if (dabuf->nbuf == 1) kmem_zone_free(xfs_dabuf_zone, dabuf); else - kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf)); + kmem_free(dabuf); } /* @@ -2420,7 +2430,7 @@ xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) for (i = 0; i < nbuf; i++) xfs_trans_brelse(tp, bplist[i]); if (bplist != &bp) - kmem_free(bplist, nbuf * sizeof(*bplist)); + kmem_free(bplist); } /* @@ -2446,7 +2456,7 @@ xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) for (i = 0; i < nbuf; i++) xfs_trans_binval(tp, bplist[i]); if (bplist != &bp) - kmem_free(bplist, nbuf * sizeof(*bplist)); + kmem_free(bplist); } /* diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 4ab865e..8be0b00 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -99,6 +99,15 @@ typedef struct xfs_da_node_entry xfs_da_node_entry_t; *========================================================================*/ /* + * Search comparison results + */ +enum xfs_dacmp { + XFS_CMP_DIFFERENT, /* names are completely different */ + XFS_CMP_EXACT, /* names are exactly the same */ + XFS_CMP_CASE /* names are same but differ in case */ +}; + +/* * Structure to ease passing around component names. */ typedef struct xfs_da_args { @@ -123,13 +132,20 @@ typedef struct xfs_da_args { int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ - unsigned char justcheck; /* T/F: check for ok with no space */ - unsigned char rename; /* T/F: this is an atomic rename op */ - unsigned char addname; /* T/F: this is an add operation */ - unsigned char oknoent; /* T/F: ok to return ENOENT, else die */ + int op_flags; /* operation flags */ + enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* + * Operation flags: + */ +#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ +#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ +#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ +#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ + +/* * Structure to describe buffer(s) for a block. * This is needed in the directory version 2 format case, when * multiple non-contiguous fsblocks might be needed to cover one @@ -201,6 +217,14 @@ typedef struct xfs_da_state { (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) +/* + * Name ops for directory and/or attr name operations + */ +struct xfs_nameops { + xfs_dahash_t (*hashname)(struct xfs_name *); + enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); +}; + #ifdef __KERNEL__ /*======================================================================== @@ -249,7 +273,10 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, xfs_dabuf_t *dead_buf); uint xfs_da_hashname(const uchar_t *name_string, int name_length); -uint xfs_da_log2_roundup(uint i); +enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, + const char *name, int len); + + xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); @@ -261,6 +288,7 @@ void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); extern struct kmem_zone *xfs_da_state_zone; +extern struct kmem_zone *xfs_dabuf_zone; #endif /* __KERNEL__ */ #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 80562b6..75b0cd4 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -41,8 +41,8 @@ #include "xfs_itable.h" #include "xfs_dfrag.h" #include "xfs_error.h" -#include "xfs_mac.h" #include "xfs_rw.h" +#include "xfs_vnodeops.h" /* * Syssgi interface for swapext @@ -52,76 +52,72 @@ xfs_swapext( xfs_swapext_t __user *sxu) { xfs_swapext_t *sxp; - xfs_inode_t *ip=NULL, *tip=NULL; - xfs_mount_t *mp; - struct file *fp = NULL, *tfp = NULL; - bhv_vnode_t *vp, *tvp; + xfs_inode_t *ip, *tip; + struct file *file, *target_file; int error = 0; sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL); if (!sxp) { error = XFS_ERROR(ENOMEM); - goto error0; + goto out; } if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) { error = XFS_ERROR(EFAULT); - goto error0; + goto out_free_sxp; } /* Pull information for the target fd */ - if (((fp = fget((int)sxp->sx_fdtarget)) == NULL) || - ((vp = vn_from_inode(fp->f_dentry->d_inode)) == NULL)) { + file = fget((int)sxp->sx_fdtarget); + if (!file) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_free_sxp; } - ip = xfs_vtoi(vp); - if (ip == NULL) { + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); - goto error0; + goto out_put_file; } - if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) || - ((tvp = vn_from_inode(tfp->f_dentry->d_inode)) == NULL)) { + target_file = fget((int)sxp->sx_fdtmp); + if (!target_file) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_put_file; } - tip = xfs_vtoi(tvp); - if (tip == NULL) { + if (!(target_file->f_mode & FMODE_WRITE) || + (target_file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); - goto error0; + goto out_put_target_file; } + ip = XFS_I(file->f_path.dentry->d_inode); + tip = XFS_I(target_file->f_path.dentry->d_inode); + if (ip->i_mount != tip->i_mount) { - error = XFS_ERROR(EINVAL); - goto error0; + error = XFS_ERROR(EINVAL); + goto out_put_target_file; } if (ip->i_ino == tip->i_ino) { - error = XFS_ERROR(EINVAL); - goto error0; + error = XFS_ERROR(EINVAL); + goto out_put_target_file; } - mp = ip->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) { - error = XFS_ERROR(EIO); - goto error0; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = XFS_ERROR(EIO); + goto out_put_target_file; } - error = XFS_SWAP_EXTENTS(mp, &ip->i_iocore, &tip->i_iocore, sxp); - - error0: - if (fp != NULL) - fput(fp); - if (tfp != NULL) - fput(tfp); - - if (sxp != NULL) - kmem_free(sxp, sizeof(xfs_swapext_t)); + error = xfs_swap_extents(ip, tip, sxp); + out_put_target_file: + fput(target_file); + out_put_file: + fput(file); + out_free_sxp: + kmem_free(sxp); + out: return error; } @@ -132,10 +128,8 @@ xfs_swap_extents( xfs_swapext_t *sxp) { xfs_mount_t *mp; - xfs_inode_t *ips[2]; xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; - bhv_vnode_t *vp, *tvp; xfs_ifork_t *tempifp, *ifp, *tifp; int ilf_fields, tilf_fields; static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; @@ -154,30 +148,17 @@ xfs_swap_extents( } sbp = &sxp->sx_stat; - vp = XFS_ITOV(ip); - tvp = XFS_ITOV(tip); - - /* Lock in i_ino order */ - if (ip->i_ino < tip->i_ino) { - ips[0] = ip; - ips[1] = tip; - } else { - ips[0] = tip; - ips[1] = ip; - } - xfs_lock_inodes(ips, 2, 0, lock_flags); + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); locked = 1; - /* Check permissions */ - error = xfs_iaccess(ip, S_IWUSR, NULL); - if (error) - goto error0; - - error = xfs_iaccess(tip, S_IWUSR, NULL); - if (error) - goto error0; - /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { error = XFS_ERROR(EINVAL); @@ -185,8 +166,7 @@ xfs_swap_extents( } /* Verify both files are either real-time or non-realtime */ - if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != - (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { error = XFS_ERROR(EINVAL); goto error0; } @@ -198,13 +178,16 @@ xfs_swap_extents( goto error0; } - if (VN_CACHED(tvp) != 0) { - xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1); - bhv_vop_flushinval_pages(tvp, 0, -1, FI_REMAPF_LOCKED); + if (VN_CACHED(VFS_I(tip)) != 0) { + xfs_inval_cached_trace(tip, 0, -1, 0, -1); + error = xfs_flushinval_pages(tip, 0, -1, + FI_REMAPF_LOCKED); + if (error) + goto error0; } /* Verify O_DIRECT for ftmp */ - if (VN_CACHED(tvp) != 0) { + if (VN_CACHED(VFS_I(tip)) != 0) { error = XFS_ERROR(EINVAL); goto error0; } @@ -248,7 +231,7 @@ xfs_swap_extents( * vop_read (or write in the case of autogrow) they block on the iolock * until we have switched the extents. */ - if (VN_MAPPED(vp)) { + if (VN_MAPPED(VFS_I(ip))) { error = XFS_ERROR(EBUSY); goto error0; } @@ -264,7 +247,7 @@ xfs_swap_extents( * fields change. */ - bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF); + xfs_tosspages(ip, 0, -1, FI_REMAPF); tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); if ((error = xfs_trans_reserve(tp, 0, @@ -276,7 +259,7 @@ xfs_swap_extents( locked = 0; goto error0; } - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); /* * Count the number of extended attribute blocks @@ -361,15 +344,11 @@ xfs_swap_extents( break; } - /* - * Increment vnode ref counts since xfs_trans_commit & - * xfs_trans_cancel will both unlock the inodes and - * decrement the associated ref counts. - */ - VN_HOLD(vp); - VN_HOLD(tvp); + IHOLD(ip); xfs_trans_ijoin(tp, ip, lock_flags); + + IHOLD(tip); xfs_trans_ijoin(tp, tip, lock_flags); xfs_trans_log_inode(tp, ip, ilf_fields); @@ -383,7 +362,7 @@ xfs_swap_extents( xfs_trans_set_sync(tp); } - error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); locked = 0; error0: @@ -392,6 +371,6 @@ xfs_swap_extents( xfs_iunlock(tip, lock_flags); } if (tempifp != NULL) - kmem_free(tempifp, sizeof(xfs_ifork_t)); + kmem_free(tempifp); return error; } diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index b338269..c9065ea 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -34,41 +34,41 @@ struct xfs_mount; * because we only need the core part in the in-core inode. */ typedef struct xfs_timestamp { - __int32_t t_sec; /* timestamp seconds */ - __int32_t t_nsec; /* timestamp nanoseconds */ + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ } xfs_timestamp_t; /* * Note: Coordinate changes to this structure with the XFS_DI_* #defines - * below and the offsets table in xfs_ialloc_log_di(). + * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode + * in xfs_inode.h. */ -typedef struct xfs_dinode_core -{ - __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __uint16_t di_mode; /* mode and type of file */ - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint16_t di_onlink; /* old number of links to file */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint32_t di_nlink; /* number of links to file */ - __uint16_t di_projid; /* owner's project id */ - __uint8_t di_pad[8]; /* unused, zeroed space */ - __uint16_t di_flushiter; /* incremented on flush */ +typedef struct xfs_dinode_core { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid; /* owner's project id */ + __u8 di_pad[8]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ - xfs_fsize_t di_size; /* number of bytes in file */ - xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ - xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - __uint32_t di_gen; /* generation number */ + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... */ + __be32 di_gen; /* generation number */ } xfs_dinode_core_t; #define DI_MAX_FLUSH 0xffff @@ -81,13 +81,13 @@ typedef struct xfs_dinode * sure to update the macros like XFS_LITINO below and * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h. */ - xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ union { xfs_bmdr_block_t di_bmbt; /* btree root block */ xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */ xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */ char di_c[1]; /* local contents */ - xfs_dev_t di_dev; /* device for S_IFCHR/S_IFBLK */ + __be32 di_dev; /* device for S_IFCHR/S_IFBLK */ uuid_t di_muuid; /* mount point value */ char di_symlink[1]; /* local symbolic link */ } di_u; @@ -171,70 +171,35 @@ typedef enum xfs_dinode_fmt /* * Inode data & attribute fork sizes, per inode. */ -#define XFS_CFORK_Q(dcp) ((dcp)->di_forkoff != 0) -#define XFS_CFORK_Q_DISK(dcp) ((dcp)->di_forkoff != 0) - -#define XFS_CFORK_BOFF(dcp) ((int)((dcp)->di_forkoff << 3)) -#define XFS_CFORK_BOFF_DISK(dcp) \ - ((int)(INT_GET((dcp)->di_forkoff, ARCH_CONVERT) << 3)) - -#define XFS_CFORK_DSIZE_DISK(dcp,mp) \ - (XFS_CFORK_Q_DISK(dcp) ? XFS_CFORK_BOFF_DISK(dcp) : XFS_LITINO(mp)) -#define XFS_CFORK_DSIZE(dcp,mp) \ - (XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp)) - -#define XFS_CFORK_ASIZE_DISK(dcp,mp) \ - (XFS_CFORK_Q_DISK(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_DISK(dcp) : 0) -#define XFS_CFORK_ASIZE(dcp,mp) \ - (XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0) - -#define XFS_CFORK_SIZE_DISK(dcp,mp,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_CFORK_DSIZE_DISK(dcp, mp) : \ - XFS_CFORK_ASIZE_DISK(dcp, mp)) -#define XFS_CFORK_SIZE(dcp,mp,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp)) +#define XFS_DFORK_Q(dip) ((dip)->di_core.di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_core.di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp) -#define XFS_DFORK_DSIZE_HOST(dip,mp) \ - XFS_CFORK_DSIZE(&(dip)->di_core, mp) + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp) -#define XFS_DFORK_ASIZE_HOST(dip,mp) \ - XFS_CFORK_ASIZE(&(dip)->di_core, mp) -#define XFS_DFORK_SIZE(dip,mp,w) \ - XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w) -#define XFS_DFORK_SIZE_HOST(dip,mp,w) \ - XFS_CFORK_SIZE(&(dip)->di_core, mp, w) + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) -#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core) -#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core) -#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) -#define XFS_DFORK_APTR(dip) \ +#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) +#define XFS_DFORK_APTR(dip) \ ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) -#define XFS_DFORK_PTR(dip,w) \ +#define XFS_DFORK_PTR(dip,w) \ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) -#define XFS_CFORK_FORMAT(dcp,w) \ - ((w) == XFS_DATA_FORK ? (dcp)->di_format : (dcp)->di_aformat) -#define XFS_CFORK_FMT_SET(dcp,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((dcp)->di_format = (n)) : ((dcp)->di_aformat = (n))) -#define XFS_DFORK_FORMAT(dip,w) XFS_CFORK_FORMAT(&(dip)->di_core, w) - -#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \ +#define XFS_DFORK_FORMAT(dip,w) \ ((w) == XFS_DATA_FORK ? \ - INT_GET((dcp)->di_nextents, ARCH_CONVERT) : \ - INT_GET((dcp)->di_anextents, ARCH_CONVERT)) -#define XFS_CFORK_NEXTENTS(dcp,w) \ - ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents) -#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w) -#define XFS_DFORK_NEXTENTS_HOST(dip,w) XFS_CFORK_NEXTENTS(&(dip)->di_core, w) - -#define XFS_CFORK_NEXT_SET(dcp,w,n) \ + (dip)->di_core.di_format : \ + (dip)->di_core.di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ ((w) == XFS_DATA_FORK ? \ - ((dcp)->di_nextents = (n)) : ((dcp)->di_anextents = (n))) + be32_to_cpu((dip)->di_core.di_nextents) : \ + be16_to_cpu((dip)->di_core.di_anextents)) #define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) @@ -257,6 +222,7 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) @@ -271,12 +237,19 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif #define XFS_DIFLAG_ANY \ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ - XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG) + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) #endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 8edbe1a..1afb122 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -42,22 +42,70 @@ #include "xfs_dir2_node.h" #include "xfs_dir2_trace.h" #include "xfs_error.h" +#include "xfs_vnodeops.h" -static int xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa); -static int xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa); +struct xfs_name xfs_name_dotdot = {"..", 2}; + +extern const struct xfs_nameops xfs_default_nameops; + +/* + * ASCII case-insensitive (ie. A-Z) support for directories that was + * used in IRIX. + */ +STATIC xfs_dahash_t +xfs_ascii_ci_hashname( + struct xfs_name *name) +{ + xfs_dahash_t hash; + int i; + + for (i = 0, hash = 0; i < name->len; i++) + hash = tolower(name->name[i]) ^ rol32(hash, 7); + + return hash; +} + +STATIC enum xfs_dacmp +xfs_ascii_ci_compname( + struct xfs_da_args *args, + const char *name, + int len) +{ + enum xfs_dacmp result; + int i; + + if (args->namelen != len) + return XFS_CMP_DIFFERENT; + + result = XFS_CMP_EXACT; + for (i = 0; i < len; i++) { + if (args->name[i] == name[i]) + continue; + if (tolower(args->name[i]) != tolower(name[i])) + return XFS_CMP_DIFFERENT; + result = XFS_CMP_CASE; + } + + return result; +} + +static struct xfs_nameops xfs_ascii_ci_nameops = { + .hashname = xfs_ascii_ci_hashname, + .compname = xfs_ascii_ci_compname, +}; void xfs_dir_mount( xfs_mount_t *mp) { - ASSERT(XFS_SB_VERSION_HASDIRV2(&mp->m_sb)); + ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= XFS_MAX_BLOCKSIZE); mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; - mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp)); - mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); - mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp)); + mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); + mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); + mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / (uint)sizeof(xfs_da_node_entry_t); @@ -65,6 +113,10 @@ xfs_dir_mount( (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / (uint)sizeof(xfs_da_node_entry_t); mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; + if (xfs_sb_version_hasasciici(&mp->m_sb)) + mp->m_dirnameops = &xfs_ascii_ci_nameops; + else + mp->m_dirnameops = &xfs_default_nameops; } /* @@ -147,8 +199,7 @@ int xfs_dir_createname( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, + struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ @@ -163,9 +214,10 @@ xfs_dir_createname( return rval; XFS_STATS_INC(xs_dir_create); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = inum; args.dp = dp; args.firstblock = first; @@ -173,8 +225,7 @@ xfs_dir_createname( args.total = total; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = 0; - args.addname = args.oknoent = 1; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_addname(&args); @@ -192,15 +243,43 @@ xfs_dir_createname( } /* + * If doing a CI lookup and case-insensitive match, dup actual name into + * args.value. Return EEXIST for success (ie. name found) or an error. + */ +int +xfs_dir_cilookup_result( + struct xfs_da_args *args, + const char *name, + int len) +{ + if (args->cmpresult == XFS_CMP_DIFFERENT) + return ENOENT; + if (args->cmpresult != XFS_CMP_CASE || + !(args->op_flags & XFS_DA_OP_CILOOKUP)) + return EEXIST; + + args->value = kmem_alloc(len, KM_MAYFAIL); + if (!args->value) + return ENOMEM; + + memcpy(args->value, name, len); + args->valuelen = len; + return EEXIST; +} + +/* * Lookup a name in a directory, give back the inode number. + * If ci_name is not NULL, returns the actual name in ci_name if it differs + * to name, or ci_name->name is set to NULL for an exact match. */ + int xfs_dir_lookup( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, - xfs_ino_t *inum) /* out: inode number */ + struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { xfs_da_args_t args; int rval; @@ -209,18 +288,16 @@ xfs_dir_lookup( ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_lookup); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = 0; + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.dp = dp; - args.firstblock = NULL; - args.flist = NULL; - args.total = 0; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = 0; - args.oknoent = 1; + args.op_flags = XFS_DA_OP_OKNOENT; + if (ci_name) + args.op_flags |= XFS_DA_OP_CILOOKUP; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_lookup(&args); @@ -236,8 +313,13 @@ xfs_dir_lookup( rval = xfs_dir2_node_lookup(&args); if (rval == EEXIST) rval = 0; - if (rval == 0) + if (!rval) { *inum = args.inumber; + if (ci_name) { + ci_name->name = args.value; + ci_name->len = args.valuelen; + } + } return rval; } @@ -248,8 +330,7 @@ int xfs_dir_removename( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, + struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ @@ -262,9 +343,10 @@ xfs_dir_removename( ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_remove); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = ino; args.dp = dp; args.firstblock = first; @@ -272,7 +354,6 @@ xfs_dir_removename( args.total = total; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = args.oknoent = 0; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_removename(&args); @@ -293,47 +374,33 @@ xfs_dir_removename( * Read a directory. */ int -xfs_dir_getdents( - xfs_trans_t *tp, +xfs_readdir( xfs_inode_t *dp, - uio_t *uio, /* caller's buffer control */ - int *eofp) /* out: eof reached */ + void *dirent, + size_t bufsize, + xfs_off_t *offset, + filldir_t filldir) { - int alignment; /* alignment required for ABI */ - xfs_dirent_t *dbp; /* malloc'ed buffer */ - xfs_dir2_put_t put; /* entry formatting routine */ int rval; /* return value */ int v; /* type-checking value */ + xfs_itrace_entry(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_getdents); - /* - * If our caller has given us a single contiguous aligned memory buffer, - * just work directly within that buffer. If it's in user memory, - * lock it down first. - */ - alignment = sizeof(xfs_off_t) - 1; - if ((uio->uio_iovcnt == 1) && - (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) && - ((uio->uio_iov[0].iov_len & alignment) == 0)) { - dbp = NULL; - put = xfs_dir2_put_dirent64_direct; - } else { - dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP); - put = xfs_dir2_put_dirent64_uio; - } - *eofp = 0; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); + else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) ; else if (v) - rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put); + rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); else - rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put); - if (dbp != NULL) - kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN); + rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, + filldir); return rval; } @@ -344,8 +411,7 @@ int xfs_dir_replace( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, /* name of entry to replace */ - int namelen, + struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ @@ -360,9 +426,10 @@ xfs_dir_replace( if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = inum; args.dp = dp; args.firstblock = first; @@ -370,7 +437,6 @@ xfs_dir_replace( args.total = total; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = args.oknoent = 0; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_replace(&args); @@ -389,31 +455,33 @@ xfs_dir_replace( /* * See if this entry can be added to the directory without allocating space. + * First checks that the caller couldn't reserve enough space (resblks = 0). */ int xfs_dir_canenter( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, /* name of entry to add */ - int namelen) + struct xfs_name *name, /* name of entry to add */ + uint resblks) { xfs_da_args_t args; int rval; int v; /* type-checking value */ + if (resblks) + return 0; + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = 0; + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.dp = dp; - args.firstblock = NULL; - args.flist = NULL; - args.total = 0; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = args.oknoent = 1; + args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + XFS_DA_OP_OKNOENT; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_addname(&args); @@ -457,11 +525,13 @@ xfs_dir2_grow_inode( xfs_mount_t *mp; int nmap; /* number of bmap entries */ xfs_trans_t *tp; + xfs_drfsbno_t nblks; xfs_dir2_trace_args_s("grow_inode", args, space); dp = args->dp; tp = args->trans; mp = dp->i_mount; + nblks = dp->i_d.di_nblocks; /* * Set lowest possible block in the space requested. */ @@ -514,7 +584,7 @@ xfs_dir2_grow_inode( args->firstblock, args->total, &mapp[mapi], &nmap, args->flist, NULL))) { - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return error; } if (nmap < 1) @@ -546,15 +616,19 @@ xfs_dir2_grow_inode( mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != bno + count) { if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); + kmem_free(mapp); return XFS_ERROR(ENOSPC); } /* * Done with the temporary mapping table. */ if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); - *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno); + kmem_free(mapp); + + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); + /* * Update file's size if this is the data space and it grew. */ @@ -613,77 +687,6 @@ xfs_dir2_isleaf( } /* - * Getdents put routine for 64-bit ABI, direct form. - */ -static int -xfs_dir2_put_dirent64_direct( - xfs_dir2_put_args_t *pa) -{ - xfs_dirent_t *idbp; /* dirent pointer */ - iovec_t *iovp; /* io vector */ - int namelen; /* entry name length */ - int reclen; /* entry total length */ - uio_t *uio; /* I/O control */ - - namelen = pa->namelen; - reclen = DIRENTSIZE(namelen); - uio = pa->uio; - /* - * Won't fit in the remaining space. - */ - if (reclen > uio->uio_resid) { - pa->done = 0; - return 0; - } - iovp = uio->uio_iov; - idbp = (xfs_dirent_t *)iovp->iov_base; - iovp->iov_base = (char *)idbp + reclen; - iovp->iov_len -= reclen; - uio->uio_resid -= reclen; - idbp->d_reclen = reclen; - idbp->d_ino = pa->ino; - idbp->d_off = pa->cook; - idbp->d_name[namelen] = '\0'; - pa->done = 1; - memcpy(idbp->d_name, pa->name, namelen); - return 0; -} - -/* - * Getdents put routine for 64-bit ABI, uio form. - */ -static int -xfs_dir2_put_dirent64_uio( - xfs_dir2_put_args_t *pa) -{ - xfs_dirent_t *idbp; /* dirent pointer */ - int namelen; /* entry name length */ - int reclen; /* entry total length */ - int rval; /* return value */ - uio_t *uio; /* I/O control */ - - namelen = pa->namelen; - reclen = DIRENTSIZE(namelen); - uio = pa->uio; - /* - * Won't fit in the remaining space. - */ - if (reclen > uio->uio_resid) { - pa->done = 0; - return 0; - } - idbp = pa->dbp; - idbp->d_reclen = reclen; - idbp->d_ino = pa->ino; - idbp->d_off = pa->cook; - idbp->d_name[namelen] = '\0'; - memcpy(idbp->d_name, pa->name, namelen); - rval = uio_read((caddr_t)idbp, reclen, uio); - pa->done = (rval == 0); - return rval; -} - -/* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. @@ -706,7 +709,7 @@ xfs_dir2_shrink_inode( dp = args->dp; mp = dp->i_mount; tp = args->trans; - da = XFS_DIR2_DB_TO_DA(mp, db); + da = xfs_dir2_db_to_da(mp, db); /* * Unmap the fsblock(s). */ @@ -742,7 +745,7 @@ xfs_dir2_shrink_inode( /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0)) + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 86560b6..1d9ef96 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -59,20 +59,7 @@ typedef __uint32_t xfs_dir2_db_t; */ typedef xfs_off_t xfs_dir2_off_t; -/* - * For getdents, argument struct for put routines. - */ -typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa); -typedef struct xfs_dir2_put_args { - xfs_off_t cook; /* cookie of (next) entry */ - xfs_intino_t ino; /* inode number */ - xfs_dirent_t *dbp; /* buffer pointer */ - char *name; /* directory entry name */ - int namelen; /* length of name */ - int done; /* output: set if value was stored */ - xfs_dir2_put_t put; /* put function ptr (i/o) */ - struct uio *uio; /* uio control structure */ -} xfs_dir2_put_args_t; +extern struct xfs_name xfs_name_dotdot; /* * Generic directory interface routines @@ -83,23 +70,22 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t inum, + struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t *inum); + struct xfs_name *name, xfs_ino_t *inum, + struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t ino, + struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); -extern int xfs_dir_getdents(struct xfs_trans *tp, struct xfs_inode *dp, - uio_t *uio, int *eofp); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t inum, + struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen); + struct xfs_name *name, uint resblks); extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); /* @@ -114,4 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_dabuf *bp); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, + int len); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 9d7438b..e1f0a06 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -22,6 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -115,13 +116,13 @@ xfs_dir2_block_addname( xfs_da_brelse(tp, bp); return XFS_ERROR(EFSCORRUPTED); } - len = XFS_DIR2_DATA_ENTSIZE(args->namelen); + len = xfs_dir2_data_entsize(args->namelen); /* * Set up pointers to parts of the block. */ bf = block->hdr.bestfree; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * No stale entries? Need space for entry and new leaf. */ @@ -214,7 +215,7 @@ xfs_dir2_block_addname( /* * If this isn't a real add, we're done with the buffer. */ - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) xfs_da_brelse(tp, bp); /* * If we don't have space for the new entry & leaf ... @@ -224,7 +225,7 @@ xfs_dir2_block_addname( * Not trying to actually do anything, or don't have * a space reservation: return no-space. */ - if (args->justcheck || args->total == 0) + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. @@ -239,7 +240,7 @@ xfs_dir2_block_addname( /* * Just checking, and it would work, so say so. */ - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; needlog = needscan = 0; /* @@ -270,7 +271,7 @@ xfs_dir2_block_addname( } lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); lfloghigh -= be32_to_cpu(btp->stale) - 1; - be32_add(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); xfs_dir2_data_make_free(tp, bp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), @@ -282,8 +283,7 @@ xfs_dir2_block_addname( * This needs to happen before the next call to use_free. */ if (needscan) { - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, - &needlog, NULL); + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); needscan = 0; } } @@ -326,14 +326,14 @@ xfs_dir2_block_addname( /* * Update the tail (entry count). */ - be32_add(&btp->count, 1); + be32_add_cpu(&btp->count, 1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) { xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, - &needlog, NULL); + &needlog); needscan = 0; } /* @@ -387,7 +387,7 @@ xfs_dir2_block_addname( lfloglow = MIN(mid, lfloglow); lfloghigh = MAX(highstale, lfloghigh); } - be32_add(&btp->stale, -1); + be32_add_cpu(&btp->stale, -1); } /* * Point to the new data entry. @@ -397,7 +397,7 @@ xfs_dir2_block_addname( * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); - blp[mid].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* @@ -412,14 +412,13 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); /* * Clean up the bestfree array and log the header, tail, and entry. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); if (needlog) xfs_dir2_data_log_header(tp, bp); xfs_dir2_block_log_tail(tp, bp); @@ -434,12 +433,10 @@ xfs_dir2_block_addname( */ int /* error */ xfs_dir2_block_getdents( - xfs_trans_t *tp, /* transaction (NULL) */ xfs_inode_t *dp, /* incore inode */ - uio_t *uio, /* caller's buffer control */ - int *eofp, /* eof reached? (out) */ - xfs_dirent_t *dbp, /* caller's buffer */ - xfs_dir2_put_t put) /* abi's formatting function */ + void *dirent, + xfs_off_t *offset, + filldir_t filldir) { xfs_dir2_block_t *block; /* directory block structure */ xfs_dabuf_t *bp; /* buffer for block */ @@ -449,42 +446,41 @@ xfs_dir2_block_getdents( char *endptr; /* end of the data entries */ int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_put_args_t p; /* arg package for put rtn */ char *ptr; /* current data entry */ int wantoff; /* starting block offset */ + xfs_ino_t ino; + xfs_off_t cook; mp = dp->i_mount; /* * If the block number in the offset is out of range, we're done. */ - if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) { - *eofp = 1; + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { return 0; } /* * Can't read the block, give up, else get dabuf in bp. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, + &bp, XFS_DATA_FORK); + if (error) return error; - } + ASSERT(bp != NULL); /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ - wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset); + wantoff = xfs_dir2_dataptr_to_off(mp, *offset); block = bp->data; xfs_dir2_data_check(dp, bp); /* * Set up values for the loop. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); ptr = (char *)block->u; - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); - p.dbp = dbp; - p.put = put; - p.uio = uio; + endptr = (char *)xfs_dir2_block_leaf_p(btp); + /* * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). @@ -504,39 +500,28 @@ xfs_dir2_block_getdents( /* * Bump pointer for the next iteration. */ - ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + ptr += xfs_dir2_data_entsize(dep->namelen); /* * The entry is before the desired starting point, skip it. */ if ((char *)dep - (char *)block < wantoff) continue; - /* - * Set up argument structure for put routine. - */ - p.namelen = dep->namelen; - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - ptr - (char *)block); - p.ino = be64_to_cpu(dep->inumber); + cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + (char *)dep - (char *)block); + ino = be64_to_cpu(dep->inumber); #if XFS_BIG_INUMS - p.ino += mp->m_inoadd; + ino += mp->m_inoadd; #endif - p.name = (char *)dep->name; - - /* - * Put the entry in the caller's buffer. - */ - error = p.put(&p); /* * If it didn't fit, set the final offset to here & return. */ - if (!p.done) { - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - (char *)dep - (char *)block); - xfs_da_brelse(tp, bp); - return error; + if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, + ino, DT_UNKNOWN)) { + *offset = cook & 0x7fffffff; + xfs_da_brelse(NULL, bp); + return 0; } } @@ -544,13 +529,9 @@ xfs_dir2_block_getdents( * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ - *eofp = 1; - - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); - - xfs_da_brelse(tp, bp); - + *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; + xfs_da_brelse(NULL, bp); return 0; } @@ -571,8 +552,8 @@ xfs_dir2_block_log_leaf( mp = tp->t_mountp; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), (uint)((char *)&blp[last + 1] - (char *)block - 1)); } @@ -591,7 +572,7 @@ xfs_dir2_block_log_tail( mp = tp->t_mountp; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), (uint)((char *)(btp + 1) - (char *)block - 1)); } @@ -625,19 +606,20 @@ xfs_dir2_block_lookup( mp = dp->i_mount; block = bp->data; xfs_dir2_data_check(dp, bp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)block + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); /* - * Fill in inode number, release the block. + * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_da_brelse(args->trans, bp); - return XFS_ERROR(EEXIST); + return XFS_ERROR(error); } /* @@ -663,6 +645,7 @@ xfs_dir2_block_lookup_int( int mid; /* binary search current idx */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ dp = args->dp; tp = args->trans; @@ -677,8 +660,8 @@ xfs_dir2_block_lookup_int( ASSERT(bp != NULL); block = bp->data; xfs_dir2_data_check(dp, bp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. * Find our entry, ENOENT if it's not there. @@ -693,7 +676,7 @@ xfs_dir2_block_lookup_int( else high = mid - 1; if (low > high) { - ASSERT(args->oknoent); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); xfs_da_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -715,22 +698,33 @@ xfs_dir2_block_lookup_int( * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); /* - * Compare, if it's right give back buffer & entry number. + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; *bpp = bp; *entno = mid; - return 0; + if (cmp == XFS_CMP_EXACT) + return 0; } - } while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash); + } while (++mid < be32_to_cpu(btp->count) && + be32_to_cpu(blp[mid].hashval) == hash); + + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was found earlier, return success. + */ + if (args->cmpresult == XFS_CMP_CASE) + return 0; /* * No match, release the buffer and return ENOENT. */ - ASSERT(args->oknoent); xfs_da_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -770,24 +764,24 @@ xfs_dir2_block_removename( tp = args->trans; mp = dp->i_mount; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; xfs_dir2_data_make_free(tp, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * Fix up the block tail. */ - be32_add(&btp->stale, 1); + be32_add_cpu(&btp->stale, 1); xfs_dir2_block_log_tail(tp, bp); /* * Remove the leaf entry by marking it stale. @@ -798,8 +792,7 @@ xfs_dir2_block_removename( * Fix up bestfree, log the header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); if (needlog) xfs_dir2_data_log_header(tp, bp); xfs_dir2_data_check(dp, bp); @@ -846,13 +839,13 @@ xfs_dir2_block_replace( dp = args->dp; mp = dp->i_mount; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. @@ -915,7 +908,7 @@ xfs_dir2_leaf_to_block( mp = dp->i_mount; leaf = lbp->data; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have @@ -923,7 +916,7 @@ xfs_dir2_leaf_to_block( * These will show up in the leaf bests table. */ while (dp->i_d.di_size > mp->m_dirblksize) { - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == mp->m_dirblksize - (uint)sizeof(block->hdr)) { if ((error = @@ -977,14 +970,14 @@ xfs_dir2_leaf_to_block( /* * Initialize the block tail. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. */ - lep = XFS_DIR2_BLOCK_LEAF_P(btp); + lep = xfs_dir2_block_leaf_p(btp); for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) continue; @@ -996,8 +989,7 @@ xfs_dir2_leaf_to_block( * Scan the bestfree if we need it and log the data block header. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); /* @@ -1055,6 +1047,7 @@ xfs_dir2_sf_to_block( xfs_dir2_sf_t *sfp; /* shortform structure */ __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_name name; xfs_dir2_trace_args("sf_to_block", args); dp = args->dp; @@ -1071,7 +1064,7 @@ xfs_dir2_sf_to_block( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Copy the directory into the stack buffer. * Then pitch the incore inode data so we can make extents. @@ -1093,7 +1086,7 @@ xfs_dir2_sf_to_block( */ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); if (error) { - kmem_free(buf, buf_len); + kmem_free(buf); return error; } /* @@ -1101,7 +1094,7 @@ xfs_dir2_sf_to_block( */ error = xfs_dir2_data_init(args, blkno, &bp); if (error) { - kmem_free(buf, buf_len); + kmem_free(buf); return error; } block = bp->data; @@ -1123,10 +1116,10 @@ xfs_dir2_sf_to_block( /* * Fill in the tail. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */ btp->stale = 0; - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + blp = xfs_dir2_block_leaf_p(btp); endoffset = (uint)((char *)blp - (char *)block); /* * Remove the freespace, we'll manage it. @@ -1142,25 +1135,25 @@ xfs_dir2_sf_to_block( dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); xfs_dir2_data_log_entry(tp, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); /* * Create entry for .. */ dep = (xfs_dir2_data_entry_t *) ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); - dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); xfs_dir2_data_log_entry(tp, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); offset = XFS_DIR2_DATA_FIRST_OFFSET; /* @@ -1169,7 +1162,7 @@ xfs_dir2_sf_to_block( if ((i = 0) == sfp->hdr.count) sfep = NULL; else - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + sfep = xfs_dir2_sf_firstentry(sfp); /* * Need to preserve the existing offset values in the sf directory. * Insert holes (unused entries) where necessary. @@ -1181,7 +1174,7 @@ xfs_dir2_sf_to_block( if (sfep == NULL) newoffset = endoffset; else - newoffset = XFS_DIR2_SF_GET_OFFSET(sfep); + newoffset = xfs_dir2_sf_get_offset(sfep); /* * There should be a hole here, make one. */ @@ -1190,7 +1183,7 @@ xfs_dir2_sf_to_block( ((char *)block + offset); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); dup->length = cpu_to_be16(newoffset - offset); - *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16( + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( ((char *)dup - (char *)block)); xfs_dir2_data_log_unused(tp, bp, dup); (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, @@ -1202,25 +1195,27 @@ xfs_dir2_sf_to_block( * Copy a real entry. */ dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); - dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep))); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep))); dep->namelen = sfep->namelen; memcpy(dep->name, sfep->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); xfs_dir2_data_log_entry(tp, bp, dep); - blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname( - (char *)sfep->name, sfep->namelen)); - blp[2 + i].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + name.name = sfep->name; + name.len = sfep->namelen; + blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> + hashname(&name)); + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); offset = (int)((char *)(tagp + 1) - (char *)block); if (++i == sfp->hdr.count) sfep = NULL; else - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } /* Done with the temporary buffer */ - kmem_free(buf, buf_len); + kmem_free(buf); /* * Sort the leaf entries by hash value. */ diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h index 6722eff..10e6896 100644 --- a/fs/xfs/xfs_dir2_block.h +++ b/fs/xfs/xfs_dir2_block.h @@ -60,7 +60,6 @@ typedef struct xfs_dir2_block { /* * Pointer to the leaf header embedded in a data block (1-block format) */ -#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block) static inline xfs_dir2_block_tail_t * xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) { @@ -71,7 +70,6 @@ xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) /* * Pointer to the leaf entries embedded in a data block (1-block format) */ -#define XFS_DIR2_BLOCK_LEAF_P(btp) xfs_dir2_block_leaf_p(btp) static inline struct xfs_dir2_leaf_entry * xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) { @@ -82,9 +80,8 @@ xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) * Function declarations. */ extern int xfs_dir2_block_addname(struct xfs_da_args *args); -extern int xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp, - struct uio *uio, int *eofp, - struct xfs_dirent *dbp, xfs_dir2_put_t put); +extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index f7c7992..498f8d6 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -22,6 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -64,6 +65,7 @@ xfs_dir2_data_check( xfs_mount_t *mp; /* filesystem mount point */ char *p; /* current data position */ int stale; /* count of stale leaves */ + struct xfs_name name; mp = dp->i_mount; d = bp->data; @@ -72,8 +74,8 @@ xfs_dir2_data_check( bf = d->hdr.bestfree; p = (char *)d->u; if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - lep = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; } else endp = (char *)d + mp->m_dirblksize; @@ -107,7 +109,7 @@ xfs_dir2_data_check( */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup)) == + ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)d); dfp = xfs_dir2_data_freefind(d, dup); if (dfp) { @@ -131,15 +133,17 @@ xfs_dir2_data_check( dep = (xfs_dir2_data_entry_t *)p; ASSERT(dep->namelen != 0); ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep)) == + ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == (char *)dep - (char *)d); count++; lastfree = 0; if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)d)); - hash = xfs_da_hashname((char *)dep->name, dep->namelen); + name.name = dep->name; + name.len = dep->namelen; + hash = mp->m_dirnameops->hashname(&name); for (i = 0; i < be32_to_cpu(btp->count); i++) { if (be32_to_cpu(lep[i].address) == addr && be32_to_cpu(lep[i].hashval) == hash) @@ -147,7 +151,7 @@ xfs_dir2_data_check( } ASSERT(i < be32_to_cpu(btp->count)); } - p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + p += xfs_dir2_data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. @@ -324,8 +328,7 @@ void xfs_dir2_data_freescan( xfs_mount_t *mp, /* filesystem mount point */ xfs_dir2_data_t *d, /* data block pointer */ - int *loghead, /* out: log data header */ - char *aendp) /* in: caller's endp */ + int *loghead) /* out: log data header */ { xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ @@ -346,11 +349,9 @@ xfs_dir2_data_freescan( * Set up pointers. */ p = (char *)d->u; - if (aendp) - endp = aendp; - else if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endp = (char *)xfs_dir2_block_leaf_p(btp); } else endp = (char *)d + mp->m_dirblksize; /* @@ -363,7 +364,7 @@ xfs_dir2_data_freescan( */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ASSERT((char *)dup - (char *)d == - be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); xfs_dir2_data_freeinsert(d, dup, loghead); p += be16_to_cpu(dup->length); } @@ -373,8 +374,8 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)d == - be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep))); - p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); + p += xfs_dir2_data_entsize(dep->namelen); } } } @@ -405,7 +406,7 @@ xfs_dir2_data_init( /* * Get the buffer set up for the block. */ - error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp, + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, XFS_DATA_FORK); if (error) { return error; @@ -430,7 +431,7 @@ xfs_dir2_data_init( t=mp->m_dirblksize - (uint)sizeof(d->hdr); d->hdr.bestfree[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); - *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16((char *)dup - (char *)d); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)d); /* * Log it and return it. */ @@ -455,7 +456,7 @@ xfs_dir2_data_log_entry( ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), - (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) - + (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - (char *)d - 1)); } @@ -500,8 +501,8 @@ xfs_dir2_data_log_unused( * Log the end (tag) of the unused entry. */ xfs_da_log_buf(tp, bp, - (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d), - (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d + + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d), + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d + sizeof(xfs_dir2_data_off_t) - 1)); } @@ -538,8 +539,8 @@ xfs_dir2_data_make_free( xfs_dir2_block_tail_t *btp; /* block tail */ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endptr = (char *)xfs_dir2_block_leaf_p(btp); } /* * If this isn't the start of the block, then back up to @@ -589,8 +590,8 @@ xfs_dir2_data_make_free( /* * Fix up the new big freespace. */ - be16_add(&prevdup->length, len + be16_to_cpu(postdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = + be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, prevdup); if (!needscan) { @@ -623,8 +624,8 @@ xfs_dir2_data_make_free( */ else if (prevdup) { dfp = xfs_dir2_data_freefind(d, prevdup); - be16_add(&prevdup->length, len); - *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = + be16_add_cpu(&prevdup->length, len); + *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, prevdup); /* @@ -652,7 +653,7 @@ xfs_dir2_data_make_free( newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); /* @@ -679,7 +680,7 @@ xfs_dir2_data_make_free( newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); @@ -715,7 +716,7 @@ xfs_dir2_data_use_free( ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); ASSERT(offset >= (char *)dup - (char *)d); ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d); - ASSERT((char *)dup - (char *)d == be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); + ASSERT((char *)dup - (char *)d == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); /* * Look up the entry in the bestfree table. */ @@ -748,7 +749,7 @@ xfs_dir2_data_use_free( newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(oldlen - len); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); /* @@ -775,7 +776,7 @@ xfs_dir2_data_use_free( else if (matchback) { newdup = dup; newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); /* @@ -802,13 +803,13 @@ xfs_dir2_data_use_free( else { newdup = dup; newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup2) = + *xfs_dir2_data_unused_tag_p(newdup2) = cpu_to_be16((char *)newdup2 - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup2); /* diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h index a6ae2d2..b816e02 100644 --- a/fs/xfs/xfs_dir2_data.h +++ b/fs/xfs/xfs_dir2_data.h @@ -44,7 +44,7 @@ struct xfs_trans; #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) #define XFS_DIR2_DATA_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET) + xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) /* * Offsets of . and .. in data space (always block 0) @@ -52,9 +52,9 @@ struct xfs_trans; #define XFS_DIR2_DATA_DOT_OFFSET \ ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) #define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1)) + (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) #define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2)) + (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) /* * Structures. @@ -123,7 +123,6 @@ typedef struct xfs_dir2_data { /* * Size of a data entry. */ -#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n) static inline int xfs_dir2_data_entsize(int n) { return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ @@ -133,19 +132,16 @@ static inline int xfs_dir2_data_entsize(int n) /* * Pointer to an entry's tag word. */ -#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep) static inline __be16 * xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep) { return (__be16 *)((char *)dep + - XFS_DIR2_DATA_ENTSIZE(dep->namelen) - sizeof(__be16)); + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); } /* * Pointer to a freespace's tag word. */ -#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \ - xfs_dir2_data_unused_tag_p(dup) static inline __be16 * xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup) { @@ -166,7 +162,7 @@ extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d, extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d, xfs_dir2_data_unused_t *dup, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d, - int *loghead, char *aendp); + int *loghead); extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_dabuf **bpp); extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index b1cf1fb..ef805a3 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -92,7 +92,7 @@ xfs_dir2_block_to_leaf( if ((error = xfs_da_grow_inode(args, &blkno))) { return error; } - ldb = XFS_DIR2_DA_TO_DB(mp, blkno); + ldb = xfs_dir2_da_to_db(mp, blkno); ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); /* * Initialize the leaf block, get a buffer for it. @@ -104,8 +104,8 @@ xfs_dir2_block_to_leaf( leaf = lbp->data; block = dbp->data; xfs_dir2_data_check(dp, dbp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Set the counts in the leaf header. */ @@ -133,14 +133,13 @@ xfs_dir2_block_to_leaf( */ block->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); /* * Set up leaf tail and bests table. */ - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = cpu_to_be32(1); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + bestsp = xfs_dir2_leaf_bests_p(ltp); bestsp[0] = block->hdr.bestfree[0].length; /* * Log the data header and leaf bests table. @@ -210,9 +209,9 @@ xfs_dir2_leaf_addname( */ index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + length = xfs_dir2_data_entsize(args->namelen); /* * See if there are any entries with the same hash value * and space in their block for the new entry. @@ -224,7 +223,7 @@ xfs_dir2_leaf_addname( index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; - i = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); ASSERT(i < be32_to_cpu(ltp->bestcount)); ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF); if (be16_to_cpu(bestsp[i]) >= length) { @@ -264,20 +263,21 @@ xfs_dir2_leaf_addname( * If we don't have enough free bytes but we can make enough * by compacting out stale entries, we'll do that. */ - if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < needbytes && - be16_to_cpu(leaf->hdr.stale) > 1) { + if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < + needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { compact = 1; } /* * Otherwise if we don't have enough free bytes we need to * convert to node form. */ - else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < - needbytes) { + else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( + leaf->hdr.count)] < needbytes) { /* * Just checking or no space reservation, give up. */ - if (args->justcheck || args->total == 0) { + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { xfs_da_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } @@ -302,7 +302,7 @@ xfs_dir2_leaf_addname( * If just checking, then it will fit unless we needed to allocate * a new data block. */ - if (args->justcheck) { + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_da_brelse(tp, lbp); return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; } @@ -360,7 +360,7 @@ xfs_dir2_leaf_addname( bestsp--; memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); - be32_add(<p->bestcount, 1); + be32_add_cpu(<p->bestcount, 1); xfs_dir2_leaf_log_tail(tp, lbp); xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } @@ -379,7 +379,7 @@ xfs_dir2_leaf_addname( */ else { if ((error = - xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block), + xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), -1, &dbp, XFS_DATA_FORK))) { xfs_da_brelse(tp, lbp); return error; @@ -408,13 +408,13 @@ xfs_dir2_leaf_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)data); /* * Need to scan fix up the bestfree table. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(mp, data, &needlog); /* * Need to log the data block's header. */ @@ -446,7 +446,7 @@ xfs_dir2_leaf_addname( */ lfloglow = index; lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add(&leaf->hdr.count, 1); + be16_add_cpu(&leaf->hdr.count, 1); } /* * There are stale entries. @@ -524,13 +524,13 @@ xfs_dir2_leaf_addname( lfloglow = MIN(index, lfloglow); lfloghigh = MAX(highstale, lfloghigh); } - be16_add(&leaf->hdr.stale, -1); + be16_add_cpu(&leaf->hdr.stale, -1); } /* * Fill in the new leaf entry. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block, be16_to_cpu(*tagp))); /* * Log the leaf fields and give up the buffers. @@ -568,13 +568,13 @@ xfs_dir2_leaf_check( * Should factor in the size of the bests table as well. * We can deduce a value for that from di_size. */ - ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * Leaves and bests don't overlap. */ ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= - (char *)XFS_DIR2_LEAF_BESTS_P(ltp)); + (char *)xfs_dir2_leaf_bests_p(ltp)); /* * Check hash value order, count stale entries. */ @@ -627,7 +627,7 @@ xfs_dir2_leaf_compact( * Update and log the header, log the leaf entries. */ ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); - be16_add(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); + be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); leaf->hdr.stale = 0; xfs_dir2_leaf_log_header(args->trans, bp); if (loglow != -1) @@ -729,7 +729,7 @@ xfs_dir2_leaf_compact_x1( /* * Adjust the leaf header values. */ - be16_add(&leaf->hdr.count, -(from - to)); + be16_add_cpu(&leaf->hdr.count, -(from - to)); leaf->hdr.stale = cpu_to_be16(1); /* * Remember the low/high stale value only in the "right" @@ -750,12 +750,11 @@ xfs_dir2_leaf_compact_x1( */ int /* error */ xfs_dir2_leaf_getdents( - xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *dp, /* incore directory inode */ - uio_t *uio, /* I/O control & vectors */ - int *eofp, /* out: reached end of dir */ - xfs_dirent_t *dbp, /* caller's buffer */ - xfs_dir2_put_t put) /* ABI formatting routine */ + void *dirent, + size_t bufsize, + xfs_off_t *offset, + filldir_t filldir) { xfs_dabuf_t *bp; /* data block buffer */ int byteoff; /* offset in current block */ @@ -764,7 +763,6 @@ xfs_dir2_leaf_getdents( xfs_dir2_data_t *data; /* data block structure */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ - int eof; /* reached end of directory */ int error = 0; /* error return value */ int i; /* temporary loop index */ int j; /* temporary loop index */ @@ -777,51 +775,43 @@ xfs_dir2_leaf_getdents( xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_off_t newoff; /* new curoff after new blk */ int nmap; /* mappings to ask xfs_bmapi */ - xfs_dir2_put_args_t *p; /* formatting arg bundle */ char *ptr = NULL; /* pointer to current data */ int ra_current; /* number of read-ahead blks */ int ra_index; /* *map index for read-ahead */ int ra_offset; /* map entry offset for ra */ int ra_want; /* readahead count wanted */ + xfs_ino_t ino; /* * If the offset is at or past the largest allowed value, - * give up right away, return eof. + * give up right away. */ - if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) { - *eofp = 1; + if (*offset >= XFS_DIR2_MAX_DATAPTR) return 0; - } + mp = dp->i_mount; - /* - * Setup formatting arguments. - */ - p = kmem_alloc(sizeof(*p), KM_SLEEP); - p->dbp = dbp; - p->put = put; - p->uio = uio; + /* * Set up to bmap a number of blocks based on the caller's * buffer size, the directory block size, and the filesystem * block size. */ - map_size = - howmany(uio->uio_resid + mp->m_dirblksize, - mp->m_sb.sb_blocksize); + map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize); map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; bp = NULL; - eof = 1; + /* * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ - curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset); + curoff = xfs_dir2_dataptr_to_byte(mp, *offset); + /* * Force this conversion through db so we truncate the offset * down to get the start of the data block. */ - map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff)); + map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff)); /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. @@ -837,7 +827,7 @@ xfs_dir2_leaf_getdents( * take it out of the mapping. */ if (bp) { - xfs_da_brelse(tp, bp); + xfs_da_brelse(NULL, bp); bp = NULL; map_blocks -= mp->m_dirblkfsbs; /* @@ -863,23 +853,24 @@ xfs_dir2_leaf_getdents( /* * Recalculate the readahead blocks wanted. */ - ra_want = howmany(uio->uio_resid + mp->m_dirblksize, + ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; + /* * If we don't have as many as we want, and we haven't * run out of data blocks, get some more mappings. */ if (1 + ra_want > map_blocks && map_off < - XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) { + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { /* * Get more bmaps, fill in after the ones * we already have in the table. */ nmap = map_size - map_valid; - error = xfs_bmapi(tp, dp, + error = xfs_bmapi(NULL, dp, map_off, - XFS_DIR2_BYTE_TO_DA(mp, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, XFS_BMAPI_METADATA, NULL, 0, &map[map_valid], &nmap, NULL, NULL); @@ -904,7 +895,7 @@ xfs_dir2_leaf_getdents( map[map_valid + nmap - 1].br_blockcount; else map_off = - XFS_DIR2_BYTE_TO_DA(mp, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET); /* * Look for holes in the mapping, and @@ -932,15 +923,15 @@ xfs_dir2_leaf_getdents( * No valid mappings, so no more data blocks. */ if (!map_valid) { - curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off); + curoff = xfs_dir2_da_to_byte(mp, map_off); break; } /* * Read the directory block starting at the first * mapping. */ - curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff); - error = xfs_da_read_buf(tp, dp, map->br_startoff, + curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + error = xfs_da_read_buf(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, @@ -983,7 +974,7 @@ xfs_dir2_leaf_getdents( * is a very rare case. */ else if (i > ra_current) { - (void)xfs_da_reada_buf(tp, dp, + (void)xfs_da_reada_buf(NULL, dp, map[ra_index].br_startoff + ra_offset, XFS_DATA_FORK); ra_current = i; @@ -1015,7 +1006,7 @@ xfs_dir2_leaf_getdents( /* * Having done a read, we need to set a new offset. */ - newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0); + newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0); /* * Start of the current block. */ @@ -1025,7 +1016,7 @@ xfs_dir2_leaf_getdents( * Make sure we're in the right block. */ else if (curoff > newoff) - ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) == + ASSERT(xfs_dir2_byte_to_db(mp, curoff) == curdb); data = bp->data; xfs_dir2_data_check(dp, bp); @@ -1033,7 +1024,7 @@ xfs_dir2_leaf_getdents( * Find our position in the block. */ ptr = (char *)&data->u; - byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff); + byteoff = xfs_dir2_byte_to_off(mp, curoff); /* * Skip past the header. */ @@ -1055,15 +1046,15 @@ xfs_dir2_leaf_getdents( } dep = (xfs_dir2_data_entry_t *)ptr; length = - XFS_DIR2_DATA_ENTSIZE(dep->namelen); + xfs_dir2_data_entsize(dep->namelen); ptr += length; } /* * Now set our real offset. */ curoff = - XFS_DIR2_DB_OFF_TO_BYTE(mp, - XFS_DIR2_BYTE_TO_DB(mp, curoff), + xfs_dir2_db_off_to_byte(mp, + xfs_dir2_byte_to_db(mp, curoff), (char *)ptr - (char *)data); if (ptr >= (char *)data + mp->m_dirblksize) { continue; @@ -1090,46 +1081,39 @@ xfs_dir2_leaf_getdents( */ dep = (xfs_dir2_data_entry_t *)ptr; - p->namelen = dep->namelen; - - length = XFS_DIR2_DATA_ENTSIZE(p->namelen); + length = xfs_dir2_data_entsize(dep->namelen); - p->cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length); - - p->ino = be64_to_cpu(dep->inumber); + ino = be64_to_cpu(dep->inumber); #if XFS_BIG_INUMS - p->ino += mp->m_inoadd; + ino += mp->m_inoadd; #endif - p->name = (char *)dep->name; - - error = p->put(p); /* * Won't fit. Return to caller. */ - if (!p->done) { - eof = 0; + if (filldir(dirent, dep->name, dep->namelen, + xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, + ino, DT_UNKNOWN)) break; - } + /* * Advance to next entry in the block. */ ptr += length; curoff += length; + bufsize -= length; } /* * All done. Set output offset value to current offset. */ - *eofp = eof; - if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR)) - uio->uio_offset = XFS_DIR2_MAX_DATAPTR; + if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) + *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else - uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff); - kmem_free(map, map_size * sizeof(*map)); - kmem_free(p, sizeof(*p)); + *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + kmem_free(map); if (bp) - xfs_da_brelse(tp, bp); + xfs_da_brelse(NULL, bp); return error; } @@ -1160,7 +1144,7 @@ xfs_dir2_leaf_init( /* * Get the buffer for the block. */ - error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp, + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, XFS_DATA_FORK); if (error) { return error; @@ -1182,7 +1166,7 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); } @@ -1207,9 +1191,9 @@ xfs_dir2_leaf_log_bests( leaf = bp->data; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf); - firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first; - lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last; + ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); + firstb = xfs_dir2_leaf_bests_p(ltp) + first; + lastb = xfs_dir2_leaf_bests_p(ltp) + last; xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1269,7 +1253,7 @@ xfs_dir2_leaf_log_tail( mp = tp->t_mountp; leaf = bp->data; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); } @@ -1313,14 +1297,15 @@ xfs_dir2_leaf_lookup( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); /* - * Return the found inode number. + * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_da_brelse(tp, dbp); xfs_da_brelse(tp, lbp); - return XFS_ERROR(EEXIST); + return XFS_ERROR(error); } /* @@ -1336,8 +1321,8 @@ xfs_dir2_leaf_lookup_int( int *indexp, /* out: index in leaf block */ xfs_dabuf_t **dbpp) /* out: data buffer */ { - xfs_dir2_db_t curdb; /* current data block number */ - xfs_dabuf_t *dbp; /* data buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dabuf_t *dbp = NULL; /* data buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ @@ -1348,6 +1333,8 @@ xfs_dir2_leaf_lookup_int( xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t cidb = -1; /* case match data block no. */ + enum xfs_dacmp cmp; /* name compare result */ dp = args->dp; tp = args->trans; @@ -1355,11 +1342,10 @@ xfs_dir2_leaf_lookup_int( /* * Read the leaf block into the buffer. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK); + if (error) return error; - } *lbpp = lbp; leaf = lbp->data; xfs_dir2_leaf_check(dp, lbp); @@ -1371,9 +1357,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &leaf->ents[index], dbp = NULL, curdb = -1; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip over stale leaf entries. */ @@ -1382,7 +1368,7 @@ xfs_dir2_leaf_lookup_int( /* * Get the new data block number. */ - newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); /* * If it's not the same as the old data block number, * need to pitch the old one and read the new one. @@ -1390,10 +1376,10 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_da_brelse(tp, dbp); - if ((error = - xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp, XFS_DATA_FORK); + if (error) { xfs_da_brelse(tp, lbp); return error; } @@ -1403,24 +1389,50 @@ xfs_dir2_leaf_lookup_int( /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* - * If it matches then return it. + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { - *dbpp = dbp; + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; *indexp = index; - return 0; + /* case exact match: return the current buffer. */ + if (cmp == XFS_CMP_EXACT) { + *dbpp = dbp; + return 0; + } + cidb = curdb; } } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or remove). + * If a case-insensitive match was found earlier, re-read the + * appropriate data block if required and return it. + */ + if (args->cmpresult == XFS_CMP_CASE) { + ASSERT(cidb != -1); + if (cidb != curdb) { + xfs_da_brelse(tp, dbp); + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp, XFS_DATA_FORK); + if (error) { + xfs_da_brelse(tp, lbp); + return error; + } + } + *dbpp = dbp; + return 0; + } /* * No match found, return ENOENT. */ - ASSERT(args->oknoent); + ASSERT(cidb == -1); if (dbp) xfs_da_brelse(tp, dbp); xfs_da_brelse(tp, lbp); @@ -1470,24 +1482,24 @@ xfs_dir2_leaf_removename( * Point to the leaf entry, use that to point to the data entry. */ lep = &leaf->ents[index]; - db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *) - ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + ((char *)data + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); needscan = needlog = 0; oldbest = be16_to_cpu(data->hdr.bestfree[0].length); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); /* * Mark the former data entry unused. */ xfs_dir2_data_make_free(tp, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)data), - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ - be16_add(&leaf->hdr.stale, 1); + be16_add_cpu(&leaf->hdr.stale, 1); xfs_dir2_leaf_log_header(tp, lbp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); xfs_dir2_leaf_log_ents(tp, lbp, index, index); @@ -1496,7 +1508,7 @@ xfs_dir2_leaf_removename( * log the data block header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(mp, data, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); /* @@ -1548,7 +1560,7 @@ xfs_dir2_leaf_removename( */ memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); - be32_add(<p->bestcount, -(db - i)); + be32_add_cpu(<p->bestcount, -(db - i)); xfs_dir2_leaf_log_tail(tp, lbp); xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } else @@ -1603,7 +1615,7 @@ xfs_dir2_leaf_replace( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* * Put the new inode number in, log it. @@ -1699,7 +1711,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp, + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, XFS_DATA_FORK))) { return error; } @@ -1713,7 +1725,7 @@ xfs_dir2_leaf_trim_data( */ leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) == mp->m_dirblksize - (uint)sizeof(data->hdr)); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); @@ -1728,8 +1740,8 @@ xfs_dir2_leaf_trim_data( /* * Eliminate the last bests entry from the table. */ - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); - be32_add(<p->bestcount, -1); + bestsp = xfs_dir2_leaf_bests_p(ltp); + be32_add_cpu(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); xfs_dir2_leaf_log_tail(tp, lbp); xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); @@ -1839,12 +1851,12 @@ xfs_dir2_node_to_leaf( /* * Set up the leaf tail from the freespace block. */ - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = free->hdr.nvalid; /* * Set up the leaf bests table. */ - memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests, + memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0])); xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); xfs_dir2_leaf_log_tail(tp, lbp); diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h index f57ca11..6c9539f 100644 --- a/fs/xfs/xfs_dir2_leaf.h +++ b/fs/xfs/xfs_dir2_leaf.h @@ -32,7 +32,7 @@ struct xfs_trans; #define XFS_DIR2_LEAF_SPACE 1 #define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) #define XFS_DIR2_LEAF_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET) + xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) /* * Offset in data space of a data entry. @@ -82,7 +82,6 @@ typedef struct xfs_dir2_leaf { * DB blocks here are logical directory block numbers, not filesystem blocks. */ -#define XFS_DIR2_MAX_LEAF_ENTS(mp) xfs_dir2_max_leaf_ents(mp) static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) { return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / @@ -92,7 +91,6 @@ static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) /* * Get address of the bestcount field in the single-leaf block. */ -#define XFS_DIR2_LEAF_TAIL_P(mp,lp) xfs_dir2_leaf_tail_p(mp, lp) static inline xfs_dir2_leaf_tail_t * xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) { @@ -104,7 +102,6 @@ xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) /* * Get address of the bests array in the single-leaf block. */ -#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp) static inline __be16 * xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) { @@ -114,7 +111,6 @@ xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) /* * Convert dataptr to byte in file space */ -#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp) static inline xfs_dir2_off_t xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) { @@ -124,7 +120,6 @@ xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) /* * Convert byte in file space to dataptr. It had better be aligned. */ -#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by) static inline xfs_dir2_dataptr_t xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) { @@ -134,7 +129,6 @@ xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) /* * Convert byte in space to (DB) block */ -#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by) static inline xfs_dir2_db_t xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) { @@ -145,17 +139,15 @@ xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) /* * Convert dataptr to a block number */ -#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp) static inline xfs_dir2_db_t xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) { - return XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); + return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); } /* * Convert byte in space to offset in a block */ -#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by) static inline xfs_dir2_data_aoff_t xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) { @@ -166,18 +158,15 @@ xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) /* * Convert dataptr to a byte offset in a block */ -#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp) static inline xfs_dir2_data_aoff_t xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) { - return XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); + return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); } /* * Convert block and offset to byte in space */ -#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \ - xfs_dir2_db_off_to_byte(mp, db, o) static inline xfs_dir2_off_t xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, xfs_dir2_data_aoff_t o) @@ -189,7 +178,6 @@ xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, /* * Convert block (DB) to block (dablk) */ -#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db) static inline xfs_dablk_t xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) { @@ -199,29 +187,25 @@ xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) /* * Convert byte in space to (DA) block */ -#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by) static inline xfs_dablk_t xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) { - return XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by)); + return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); } /* * Convert block and offset to dataptr */ -#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \ - xfs_dir2_db_off_to_dataptr(mp, db, o) static inline xfs_dir2_dataptr_t xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, xfs_dir2_data_aoff_t o) { - return XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o)); + return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); } /* * Convert block (dablk) to block (DB) */ -#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da) static inline xfs_dir2_db_t xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) { @@ -231,11 +215,10 @@ xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) /* * Convert block (dablk) to byte offset in space */ -#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da) static inline xfs_dir2_off_t xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) { - return XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0); + return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); } /* @@ -249,9 +232,9 @@ extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); -extern int xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp, - struct uio *uio, int *eofp, - struct xfs_dirent *dbp, xfs_dir2_put_t put); +extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, + size_t bufsize, xfs_off_t *offset, + filldir_t filldir); extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, struct xfs_dabuf **bpp, int magic); extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 9ca7171..fa6c3a5 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -22,6 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -136,14 +137,14 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp, + if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, XFS_DATA_FORK))) { return error; } ASSERT(fbp != NULL); free = fbp->data; leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * Initialize the freespace block header. */ @@ -155,7 +156,7 @@ xfs_dir2_leaf_to_node( * Copy freespace entries from the leaf block to the new block. * Count active entries. */ - for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests; + for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; @@ -215,7 +216,7 @@ xfs_dir2_leafn_add( * a compact. */ - if (be16_to_cpu(leaf->hdr.count) == XFS_DIR2_MAX_LEAF_ENTS(mp)) { + if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { if (!leaf->hdr.stale) return XFS_ERROR(ENOSPC); compact = be16_to_cpu(leaf->hdr.stale) > 1; @@ -225,7 +226,7 @@ xfs_dir2_leafn_add( ASSERT(index == be16_to_cpu(leaf->hdr.count) || be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; /* @@ -253,7 +254,7 @@ xfs_dir2_leafn_add( (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep)); lfloglow = index; lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add(&leaf->hdr.count, 1); + be16_add_cpu(&leaf->hdr.count, 1); } /* * There are stale entries. We'll use one for the new entry. @@ -321,13 +322,13 @@ xfs_dir2_leafn_add( lfloglow = MIN(index, lfloglow); lfloghigh = MAX(highstale, lfloghigh); } - be16_add(&leaf->hdr.stale, -1); + be16_add_cpu(&leaf->hdr.stale, -1); } /* * Insert the new entry, log everything. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, args->blkno, args->index)); xfs_dir2_leaf_log_header(tp, bp); xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); @@ -352,7 +353,7 @@ xfs_dir2_leafn_check( leaf = bp->data; mp = dp->i_mount; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { if (i + 1 < be16_to_cpu(leaf->hdr.count)) { ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= @@ -386,28 +387,26 @@ xfs_dir2_leafn_lasthash( } /* - * Look up a leaf entry in a node-format leaf block. - * If this is an addname then the extrablk in state is a freespace block, - * otherwise it's a data block. + * Look up a leaf entry for space to add a name in a node-format leaf block. + * The extrablk in state is a freespace block. */ -int -xfs_dir2_leafn_lookup_int( +STATIC int +xfs_dir2_leafn_lookup_for_addname( xfs_dabuf_t *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp; /* current data/free buffer */ - xfs_dir2_db_t curdb; /* current data block number */ - xfs_dir2_db_t curfdb; /* current free block number */ - xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_db_t curfdb = -1; /* current free block number */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int fi; /* free entry index */ - xfs_dir2_free_t *free=NULL; /* free block structure */ + xfs_dir2_free_t *free = NULL; /* free block structure */ int index; /* leaf entry index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ - int length=0; /* length of new data entry */ + int length; /* length of new data entry */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_db_t newdb; /* new data block number */ @@ -430,33 +429,20 @@ xfs_dir2_leafn_lookup_int( /* * Do we have a buffer coming in? */ - if (state->extravalid) + if (state->extravalid) { + /* If so, it's a free block buffer, get the block number. */ curbp = state->extrablk.bp; - else - curbp = NULL; - /* - * For addname, it's a free block buffer, get the block number. - */ - if (args->addname) { - curfdb = curbp ? state->extrablk.blkno : -1; - curdb = -1; - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); - if ((free = (curbp ? curbp->data : NULL))) - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); - } - /* - * For others, it's a data block buffer, get the block number. - */ - else { - curfdb = -1; - curdb = curbp ? state->extrablk.blkno : -1; + curfdb = state->extrablk.blkno; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); } + length = xfs_dir2_data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -465,166 +451,249 @@ xfs_dir2_leafn_lookup_int( /* * Pull the data block number from the entry. */ - newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); /* * For addname, we're looking for a place to put the new entry. * We want to use a data block with an entry of equal * hash value to ours if there is one with room. + * + * If this block isn't the data block we already have + * in hand, take a look at it. */ - if (args->addname) { + if (newdb != curdb) { + curdb = newdb; /* - * If this block isn't the data block we already have - * in hand, take a look at it. + * Convert the data block to the free block + * holding its freespace information. */ - if (newdb != curdb) { - curdb = newdb; - /* - * Convert the data block to the free block - * holding its freespace information. - */ - newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb); - /* - * If it's not the one we have in hand, - * read it in. - */ - if (newfdb != curfdb) { - /* - * If we had one before, drop it. - */ - if (curbp) - xfs_da_brelse(tp, curbp); - /* - * Read the free block. - */ - if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, - newfdb), - -1, &curbp, - XFS_DATA_FORK))) { - return error; - } - free = curbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == - XFS_DIR2_FREE_MAGIC); - ASSERT((be32_to_cpu(free->hdr.firstdb) % - XFS_DIR2_MAX_FREE_BESTS(mp)) == - 0); - ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); - ASSERT(curdb < - be32_to_cpu(free->hdr.firstdb) + - be32_to_cpu(free->hdr.nvalid)); - } - /* - * Get the index for our entry. - */ - fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb); - /* - * If it has room, return it. - */ - if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { - XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", - XFS_ERRLEVEL_LOW, mp); - if (curfdb != newfdb) - xfs_da_brelse(tp, curbp); - return XFS_ERROR(EFSCORRUPTED); - } - curfdb = newfdb; - if (be16_to_cpu(free->bests[fi]) >= length) { - *indexp = index; - state->extravalid = 1; - state->extrablk.bp = curbp; - state->extrablk.blkno = curfdb; - state->extrablk.index = fi; - state->extrablk.magic = - XFS_DIR2_FREE_MAGIC; - ASSERT(args->oknoent); - return XFS_ERROR(ENOENT); - } - } - } - /* - * Not adding a new entry, so we really want to find - * the name given to us. - */ - else { + newfdb = xfs_dir2_db_to_fdb(mp, newdb); /* - * If it's a different data block, go get it. + * If it's not the one we have in hand, read it in. */ - if (newdb != curdb) { + if (newfdb != curfdb) { /* - * If we had a block before, drop it. + * If we had one before, drop it. */ if (curbp) xfs_da_brelse(tp, curbp); /* - * Read the data block. + * Read the free block. */ - if ((error = - xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, newdb), -1, - &curbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newfdb), + -1, &curbp, XFS_DATA_FORK); + if (error) return error; - } - xfs_dir2_data_check(dp, curbp); - curdb = newdb; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == + XFS_DIR2_FREE_MAGIC); + ASSERT((be32_to_cpu(free->hdr.firstdb) % + XFS_DIR2_MAX_FREE_BESTS(mp)) == 0); + ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); + ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + + be32_to_cpu(free->hdr.nvalid)); } /* - * Point to the data entry. + * Get the index for our entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)curbp->data + - XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + fi = xfs_dir2_db_to_fdindex(mp, curdb); /* - * Compare the entry, return it if it matches. + * If it has room, return it. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { - args->inumber = be64_to_cpu(dep->inumber); - *indexp = index; - state->extravalid = 1; - state->extrablk.bp = curbp; - state->extrablk.blkno = curdb; - state->extrablk.index = - (int)((char *)dep - - (char *)curbp->data); - state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - return XFS_ERROR(EEXIST); + if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { + XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", + XFS_ERRLEVEL_LOW, mp); + if (curfdb != newfdb) + xfs_da_brelse(tp, curbp); + return XFS_ERROR(EFSCORRUPTED); } + curfdb = newfdb; + if (be16_to_cpu(free->bests[fi]) >= length) + goto out; } } + /* Didn't find any space */ + fi = -1; +out: + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (curbp) { + /* Giving back a free block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = fi; + state->extrablk.blkno = curfdb; + state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + } else { + state->extravalid = 0; + } /* - * Didn't find a match. - * If we are holding a buffer, give it back in case our caller - * finds it useful. + * Return the index, that will be the insertion point. */ - if ((state->extravalid = (curbp != NULL))) { - state->extrablk.bp = curbp; - state->extrablk.index = -1; + *indexp = index; + return XFS_ERROR(ENOENT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * The extrablk in state a data block. + */ +STATIC int +xfs_dir2_leafn_lookup_for_entry( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); +#ifdef __KERNEL__ + ASSERT(be16_to_cpu(leaf->hdr.count) > 0); +#endif + xfs_dir2_leafn_check(dp, bp); + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + curbp = state->extrablk.bp; + curdb = state->extrablk.blkno; + } + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* - * For addname, giving back a free block. + * Skip stale leaf entries. */ - if (args->addname) { - state->extrablk.blkno = curfdb; - state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * Not adding a new entry, so we really want to find + * the name given to us. + * + * If it's a different data block, go get it. + */ + if (newdb != curdb) { + /* + * If we had a block before that we aren't saving + * for a CI name, drop it + */ + if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || + curdb != state->extrablk.blkno)) + xfs_da_brelse(tp, curbp); + /* + * If needing the block that is saved with a CI match, + * use it otherwise read in the new data block. + */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + newdb == state->extrablk.blkno) { + ASSERT(state->extravalid); + curbp = state->extrablk.bp; + } else { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &curbp, XFS_DATA_FORK); + if (error) + return error; + } + xfs_dir2_data_check(dp, curbp); + curdb = newdb; } /* - * For other callers, giving back a data block. + * Point to the data entry. */ - else { + dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + /* + * Compare the entry and if it's an exact match, return + * EEXIST immediately. If it's the first case-insensitive + * match, store the block & inode number and continue looking. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + /* If there is a CI match block, drop it */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + curdb != state->extrablk.blkno) + xfs_da_brelse(tp, state->extrablk.bp); + args->cmpresult = cmp; + args->inumber = be64_to_cpu(dep->inumber); + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; state->extrablk.blkno = curdb; + state->extrablk.index = (int)((char *)dep - + (char *)curbp->data); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); } } - /* - * Return the final index, that will be the insertion point. - */ + ASSERT(index == be16_to_cpu(leaf->hdr.count) || + (args->op_flags & XFS_DA_OP_OKNOENT)); + if (curbp) { + if (args->cmpresult == XFS_CMP_DIFFERENT) { + /* Giving back last used data block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = -1; + state->extrablk.blkno = curdb; + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + } else { + /* If the curbp is not the CI match block, drop it */ + if (state->extrablk.bp != curbp) + xfs_da_brelse(tp, curbp); + } + } else { + state->extravalid = 0; + } *indexp = index; - ASSERT(index == be16_to_cpu(leaf->hdr.count) || args->oknoent); return XFS_ERROR(ENOENT); } /* + * Look up a leaf entry in a node-format leaf block. + * If this is an addname then the extrablk in state is a freespace block, + * otherwise it's a data block. + */ +int +xfs_dir2_leafn_lookup_int( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + if (args->op_flags & XFS_DA_OP_ADDNAME) + return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, + state); + return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); +} + +/* * Move count leaf entries from source to destination leaf. * Log entries and headers. Stale entries are preserved. */ @@ -696,10 +765,10 @@ xfs_dir2_leafn_moveents( /* * Update the headers and log them. */ - be16_add(&leaf_s->hdr.count, -(count)); - be16_add(&leaf_s->hdr.stale, -(stale)); - be16_add(&leaf_d->hdr.count, count); - be16_add(&leaf_d->hdr.stale, stale); + be16_add_cpu(&leaf_s->hdr.count, -(count)); + be16_add_cpu(&leaf_s->hdr.stale, -(stale)); + be16_add_cpu(&leaf_d->hdr.count, count); + be16_add_cpu(&leaf_d->hdr.stale, stale); xfs_dir2_leaf_log_header(tp, bp_s); xfs_dir2_leaf_log_header(tp, bp_d); xfs_dir2_leafn_check(args->dp, bp_s); @@ -822,9 +891,10 @@ xfs_dir2_leafn_rebalance( */ if (!state->inleaf) blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - - /* - * Finally sanity check just to make sure we are not returning a negative index + + /* + * Finally sanity check just to make sure we are not returning a + * negative index */ if(blk2->index < 0) { state->inleaf = 1; @@ -876,15 +946,15 @@ xfs_dir2_leafn_remove( /* * Extract the data block and offset from the entry. */ - db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); /* * Kill the leaf entry by marking it stale. * Log the leaf block changes. */ - be16_add(&leaf->hdr.stale, 1); + be16_add_cpu(&leaf->hdr.stale, 1); xfs_dir2_leaf_log_header(tp, bp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); xfs_dir2_leaf_log_ents(tp, bp, index, index); @@ -898,13 +968,13 @@ xfs_dir2_leafn_remove( longest = be16_to_cpu(data->hdr.bestfree[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(tp, dbp, off, - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(mp, data, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); xfs_dir2_data_check(dp, dbp); @@ -924,8 +994,8 @@ xfs_dir2_leafn_remove( * Convert the data block number to a free block, * read in the free block. */ - fdb = XFS_DIR2_DB_TO_FDB(mp, db); - if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), + fdb = xfs_dir2_db_to_fdb(mp, db); + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, XFS_DATA_FORK))) { return error; } @@ -937,7 +1007,7 @@ xfs_dir2_leafn_remove( /* * Calculate which entry we need to fix. */ - findex = XFS_DIR2_DB_TO_FDINDEX(mp, db); + findex = xfs_dir2_db_to_fdindex(mp, db); longest = be16_to_cpu(data->hdr.bestfree[0].length); /* * If the data block is now empty we can get rid of it @@ -970,7 +1040,7 @@ xfs_dir2_leafn_remove( /* * One less used entry in the free table. */ - be32_add(&free->hdr.nused, -1); + be32_add_cpu(&free->hdr.nused, -1); xfs_dir2_free_log_header(tp, fbp); /* * If this was the last entry in the table, we can @@ -1073,7 +1143,7 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. */ - error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno), + error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), &newblk->bp, XFS_DIR2_LEAFN_MAGIC); if (error) { return error; @@ -1331,7 +1401,7 @@ xfs_dir2_node_addname( /* * It worked, fix the hash values up the btree. */ - if (!args->justcheck) + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) xfs_da_fixhashpath(state, &state->path); } else { /* @@ -1385,7 +1455,7 @@ xfs_dir2_node_addname_int( dp = args->dp; mp = dp->i_mount; tp = args->trans; - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + length = xfs_dir2_data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. This is the freespace @@ -1438,7 +1508,7 @@ xfs_dir2_node_addname_int( if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) return error; - lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo); + lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); fbno = ifbno; } /* @@ -1474,7 +1544,7 @@ xfs_dir2_node_addname_int( * to avoid it. */ if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, XFS_DATA_FORK))) { return error; } @@ -1514,7 +1584,8 @@ xfs_dir2_node_addname_int( /* * Not allowed to allocate, return failure. */ - if (args->justcheck || args->total == 0) { + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { /* * Drop the freespace buffer unless it came from our * caller. @@ -1550,9 +1621,9 @@ xfs_dir2_node_addname_int( * Get the freespace block corresponding to the data block * that was just allocated. */ - fbno = XFS_DIR2_DB_TO_FDB(mp, dbno); + fbno = xfs_dir2_db_to_fdb(mp, dbno); if (unlikely(error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, XFS_DATA_FORK))) { xfs_da_buf_done(dbp); return error; @@ -1567,14 +1638,14 @@ xfs_dir2_node_addname_int( return error; } - if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) { + if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { cmn_err(CE_ALERT, "xfs_dir2_node_addname_int: dir ino " "%llu needed freesp block %lld for\n" " data block %lld, got %lld\n" " ifbno %llu lastfbno %d\n", (unsigned long long)dp->i_ino, - (long long)XFS_DIR2_DB_TO_FDB(mp, dbno), + (long long)xfs_dir2_db_to_fdb(mp, dbno), (long long)dbno, (long long)fbno, (unsigned long long)ifbno, lastfbno); if (fblk) { @@ -1598,7 +1669,7 @@ xfs_dir2_node_addname_int( * Get a buffer for the new block. */ if ((error = xfs_da_get_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), + xfs_dir2_db_to_da(mp, fbno), -1, &fbp, XFS_DATA_FORK))) { return error; } @@ -1623,7 +1694,7 @@ xfs_dir2_node_addname_int( /* * Set the freespace block index from the data block number. */ - findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno); + findex = xfs_dir2_db_to_fdindex(mp, dbno); /* * If it's after the end of the current entries in the * freespace block, extend that table. @@ -1641,7 +1712,7 @@ xfs_dir2_node_addname_int( * (this should always be true) then update the header. */ if (be16_to_cpu(free->bests[findex]) == NULLDATAOFF) { - be32_add(&free->hdr.nused, 1); + be32_add_cpu(&free->hdr.nused, 1); xfs_dir2_free_log_header(tp, fbp); } /* @@ -1660,7 +1731,7 @@ xfs_dir2_node_addname_int( /* * If just checking, we succeeded. */ - if (args->justcheck) { + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) xfs_da_buf_done(fbp); return 0; @@ -1669,7 +1740,7 @@ xfs_dir2_node_addname_int( * Read the data block in. */ if (unlikely( - error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno), + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), -1, &dbp, XFS_DATA_FORK))) { if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) xfs_da_buf_done(fbp); @@ -1698,14 +1769,14 @@ xfs_dir2_node_addname_int( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)data); xfs_dir2_data_log_entry(tp, dbp, dep); /* * Rescan the block for bestfree if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(mp, data, &needlog); /* * Log the data block header if needed. */ @@ -1766,6 +1837,14 @@ xfs_dir2_node_lookup( error = xfs_da_node_lookup_int(state, &rval); if (error) rval = error; + else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return EEXIST */ + xfs_dir2_data_entry_t *dep; + + dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> + data + state->extrablk.index); + rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + } /* * Release the btree blocks and leaf block. */ @@ -1809,9 +1888,8 @@ xfs_dir2_node_removename( * Look up the entry we're deleting, set up the cursor. */ error = xfs_da_node_lookup_int(state, &rval); - if (error) { + if (error) rval = error; - } /* * Didn't find it, upper layer screwed up. */ @@ -1828,9 +1906,8 @@ xfs_dir2_node_removename( */ error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); - if (error) { + if (error) return error; - } /* * Fix the hash values up the btree. */ @@ -1904,7 +1981,7 @@ xfs_dir2_node_replace( ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); dep = (xfs_dir2_data_entry_t *) ((char *)data + - XFS_DIR2_DATAPTR_TO_OFF(state->mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. @@ -1980,7 +2057,7 @@ xfs_dir2_node_trim_free( * Blow the block away. */ if ((error = - xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo), + xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo), bp))) { /* * Can't fail with ENOSPC since that only happens with no diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h index c7c870e..dde72db 100644 --- a/fs/xfs/xfs_dir2_node.h +++ b/fs/xfs/xfs_dir2_node.h @@ -36,7 +36,7 @@ struct xfs_trans; #define XFS_DIR2_FREE_SPACE 2 #define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) #define XFS_DIR2_FREE_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET) + xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) #define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ @@ -60,7 +60,6 @@ typedef struct xfs_dir2_free { /* * Convert data space db to the corresponding free db. */ -#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db) static inline xfs_dir2_db_t xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) { @@ -70,7 +69,6 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) /* * Convert data space db to the corresponding index in a free db. */ -#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db) static inline int xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) { diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 0cd77b1..a8a8a6e 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -22,6 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -89,8 +90,8 @@ xfs_dir2_block_sfsize( mp = dp->i_mount; count = i8count = namelen = 0; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Iterate over the block's data entries by using the leaf pointers. @@ -102,7 +103,7 @@ xfs_dir2_block_sfsize( * Calculate the pointer to the entry at hand. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); /* * Detect . and .., so we can special-case them. * . is not included in sf directories. @@ -124,7 +125,7 @@ xfs_dir2_block_sfsize( /* * Calculate the new size, see if we should give up yet. */ - size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */ + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ count + /* namelen */ count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ namelen + /* name */ @@ -139,7 +140,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); + xfs_dir2_sf_put_inumber((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); return size; } @@ -199,15 +200,15 @@ xfs_dir2_block_to_sf( * Copy the header into the newly allocate local space. */ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count)); + memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); dp->i_d.di_size = size; /* * Set up to loop over the block's entries. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); ptr = (char *)block->u; - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + sfep = xfs_dir2_sf_firstentry(sfp); /* * Loop over the active and unused entries. * Stop when we reach the leaf/tail portion of the block. @@ -233,28 +234,28 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); /* * Normal entry, copy it into shortform. */ else { sfep->namelen = dep->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, + xfs_dir2_sf_put_offset(sfep, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)block)); memcpy(sfep->name, dep->name, dep->namelen); temp = be64_to_cpu(dep->inumber); - XFS_DIR2_SF_PUT_INUMBER(sfp, &temp, - XFS_DIR2_SF_INUMBERP(sfep)); - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + xfs_dir2_sf_put_inumber(sfp, &temp, + xfs_dir2_sf_inumberp(sfep)); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } - ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + ptr += xfs_dir2_data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(block, mp->m_dirblksize); + kmem_free(block); return error; } @@ -294,11 +295,11 @@ xfs_dir2_sf_addname( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Compute entry (and change in) size. */ - add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); + add_entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); incr_isize = add_entsize; objchange = 0; #if XFS_BIG_INUMS @@ -331,7 +332,7 @@ xfs_dir2_sf_addname( /* * Just checking or no space reservation, it doesn't fit. */ - if (args->justcheck || args->total == 0) + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to block form then add the name. @@ -344,7 +345,7 @@ xfs_dir2_sf_addname( /* * Just checking, it fits. */ - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; /* * Do it the easy way - just add it at the end. @@ -392,7 +393,7 @@ xfs_dir2_sf_addname_easy( /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen), + xfs_idata_realloc(dp, xfs_dir2_sf_entsize_byname(sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. @@ -403,10 +404,10 @@ xfs_dir2_sf_addname_easy( * Fill in the new entry. */ sfep->namelen = args->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); /* * Update the header and inode. */ @@ -463,14 +464,14 @@ xfs_dir2_sf_addname_hard( * If it's going to end up at the end then oldsfep will point there. */ for (offset = XFS_DIR2_DATA_FIRST_OFFSET, - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp), - add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen), + oldsfep = xfs_dir2_sf_firstentry(oldsfp), + add_datasize = xfs_dir2_data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep), + offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { - new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep); + new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) break; } @@ -495,10 +496,10 @@ xfs_dir2_sf_addname_hard( * Fill in the new entry, and update the header counts. */ sfep->namelen = args->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); sfp->hdr.count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) @@ -508,10 +509,10 @@ xfs_dir2_sf_addname_hard( * If there's more left to copy, do that. */ if (!eof) { - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } - kmem_free(buf, old_isize); + kmem_free(buf); dp->i_d.di_size = new_isize; xfs_dir2_sf_check(args); } @@ -544,9 +545,9 @@ xfs_dir2_sf_addname_pick( mp = dp->i_mount; sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - size = XFS_DIR2_DATA_ENTSIZE(args->namelen); + size = xfs_dir2_data_entsize(args->namelen); offset = XFS_DIR2_DATA_FIRST_OFFSET; - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* * Loop over sf entries. @@ -555,10 +556,10 @@ xfs_dir2_sf_addname_pick( */ for (i = 0; i < sfp->hdr.count; i++) { if (!holefit) - holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep); - offset = XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(sfep->namelen); - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); + offset = xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this @@ -617,18 +618,18 @@ xfs_dir2_sf_check( sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; offset = XFS_DIR2_DATA_FIRST_OFFSET; - ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset); - ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); + ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = - XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(sfep->namelen); + xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); } ASSERT(i8count == sfp->hdr.i8count); ASSERT(XFS_BIG_INUMS || i8count == 0); @@ -671,7 +672,7 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_flags & XFS_IFINLINE); ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; - size = XFS_DIR2_SF_HDR_SIZE(i8count); + size = xfs_dir2_sf_hdr_size(i8count); /* * Make a buffer for the data. */ @@ -684,7 +685,7 @@ xfs_dir2_sf_create( /* * Now can put in the inode number, since i8count is set. */ - XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &pino, &sfp->hdr.parent); sfp->hdr.count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); @@ -695,19 +696,18 @@ xfs_dir2_sf_create( int /* error */ xfs_dir2_sf_getdents( xfs_inode_t *dp, /* incore directory inode */ - uio_t *uio, /* caller's buffer control */ - int *eofp, /* eof reached? (out) */ - xfs_dirent_t *dbp, /* caller's buffer */ - xfs_dir2_put_t put) /* abi's formatting function */ + void *dirent, + xfs_off_t *offset, + filldir_t filldir) { - int error; /* error return value */ int i; /* shortform entry number */ xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_dataptr_t off; /* current entry's offset */ - xfs_dir2_put_args_t p; /* arg package for put rtn */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_t *sfp; /* shortform structure */ - xfs_off_t dir_offset; + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; mp = dp->i_mount; @@ -720,120 +720,86 @@ xfs_dir2_sf_getdents( return XFS_ERROR(EIO); } - dir_offset = uio->uio_offset; - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * If the block number in the offset is out of range, we're done. */ - if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) { - *eofp = 1; + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - } /* - * Set up putargs structure. + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * mp->m_dirdatablk. */ - p.dbp = dbp; - p.put = put; - p.uio = uio; + dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOT_OFFSET); + dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOTDOT_OFFSET); + /* * Put . entry unless we're starting past it. */ - if (dir_offset <= - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET)) { - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0, - XFS_DIR2_DATA_DOTDOT_OFFSET); - p.ino = dp->i_ino; + if (*offset <= dot_offset) { + ino = dp->i_ino; #if XFS_BIG_INUMS - p.ino += mp->m_inoadd; + ino += mp->m_inoadd; #endif - p.name = "."; - p.namelen = 1; - - error = p.put(&p); - - if (!p.done) { - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET); - return error; + if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) { + *offset = dot_offset & 0x7fffffff; + return 0; } } /* * Put .. entry unless we're starting past it. */ - if (dir_offset <= - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET)) { - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_FIRST_OFFSET); - p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + if (*offset <= dotdot_offset) { + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); #if XFS_BIG_INUMS - p.ino += mp->m_inoadd; + ino += mp->m_inoadd; #endif - p.name = ".."; - p.namelen = 2; - - error = p.put(&p); - - if (!p.done) { - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET); - return error; + if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { + *offset = dotdot_offset & 0x7fffffff; + return 0; } } /* * Loop while there are more entries and put'ing works. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - - off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_SF_GET_OFFSET(sfep)); + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->hdr.count; i++) { + off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + xfs_dir2_sf_get_offset(sfep)); - if (dir_offset > off) + if (*offset > off) { + sfep = xfs_dir2_sf_nextentry(sfp, sfep); continue; + } - p.namelen = sfep->namelen; - - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(p.namelen)); - - p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); + ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); #if XFS_BIG_INUMS - p.ino += mp->m_inoadd; + ino += mp->m_inoadd; #endif - p.name = (char *)sfep->name; - error = p.put(&p); - - if (!p.done) { - uio->uio_offset = off; - return error; + if (filldir(dirent, sfep->name, sfep->namelen, + off & 0x7fffffff, ino, DT_UNKNOWN)) { + *offset = off & 0x7fffffff; + return 0; } + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } - /* - * They all fit. - */ - *eofp = 1; - - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); - + *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; return 0; } @@ -847,8 +813,11 @@ xfs_dir2_sf_lookup( { xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ + int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_t *sfp; /* shortform structure */ + enum xfs_dacmp cmp; /* comparison result */ + xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ xfs_dir2_trace_args("sf_lookup", args); xfs_dir2_sf_check(args); @@ -865,12 +834,13 @@ xfs_dir2_sf_lookup( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Special case for . */ if (args->namelen == 1 && args->name[0] == '.') { args->inumber = dp->i_ino; + args->cmpresult = XFS_CMP_EXACT; return XFS_ERROR(EEXIST); } /* @@ -878,29 +848,42 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + args->cmpresult = XFS_CMP_EXACT; return XFS_ERROR(EEXIST); } /* * Loop over all the entries trying to match ours. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(args->name, sfep->name, args->namelen) == 0) { - args->inumber = - XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)); - return XFS_ERROR(EEXIST); + ci_sfep = NULL; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + /* + * Compare name and if it's an exact match, return the inode + * number. If it's the first case-insensitive match, store the + * inode number and continue looking for an exact match. + */ + cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, + sfep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + args->inumber = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); + ci_sfep = sfep; } } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); /* - * Didn't find it. + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was not found, return ENOENT. */ - ASSERT(args->oknoent); - return XFS_ERROR(ENOENT); + if (!ci_sfep) + return XFS_ERROR(ENOENT); + /* otherwise process the CI match as required by the caller */ + error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); + return XFS_ERROR(error); } /* @@ -934,34 +917,31 @@ xfs_dir2_sf_removename( ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Loop over the old directory entries. * Find the one we're deleting. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(sfep->name, args->name, args->namelen) == 0) { - ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)) == - args->inumber); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ASSERT(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)) == + args->inumber); break; } } /* * Didn't find it. */ - if (i == sfp->hdr.count) { + if (i == sfp->hdr.count) return XFS_ERROR(ENOENT); - } /* * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); + entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -1027,7 +1007,7 @@ xfs_dir2_sf_replace( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); #if XFS_BIG_INUMS /* * New inode number is large, and need to convert to 8-byte inodes. @@ -1067,28 +1047,27 @@ xfs_dir2_sf_replace( if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { #if XFS_BIG_INUMS || defined(DEBUG) - ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); ASSERT(args->inumber != ino); #endif - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, &sfp->hdr.parent); } /* * Normal entry, look for the name. */ else { - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(args->name, sfep->name, args->namelen) == 0) { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { #if XFS_BIG_INUMS || defined(DEBUG) - ino = XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)); + ino = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); ASSERT(args->inumber != ino); #endif - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); break; } } @@ -1096,7 +1075,7 @@ xfs_dir2_sf_replace( * Didn't find it. */ if (i == sfp->hdr.count) { - ASSERT(args->oknoent); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); #if XFS_BIG_INUMS if (i8elevated) xfs_dir2_sf_toino4(args); @@ -1189,27 +1168,27 @@ xfs_dir2_sf_toino4( */ sfp->hdr.count = oldsfp->hdr.count; sfp->hdr.i8count = 0; - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); /* * Copy the entries field by field. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, - XFS_DIR2_SF_INUMBERP(oldsfep)); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); } /* * Clean up the inode. */ - kmem_free(buf, oldsize); + kmem_free(buf); dp->i_d.di_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } @@ -1266,27 +1245,27 @@ xfs_dir2_sf_toino8( */ sfp->hdr.count = oldsfp->hdr.count; sfp->hdr.i8count = 1; - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); /* * Copy the entries field by field. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, - XFS_DIR2_SF_INUMBERP(oldsfep)); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); } /* * Clean up the inode. */ - kmem_free(buf, oldsize); + kmem_free(buf); dp->i_d.di_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h index 42f015b..deecc9d 100644 --- a/fs/xfs/xfs_dir2_sf.h +++ b/fs/xfs/xfs_dir2_sf.h @@ -62,7 +62,7 @@ typedef union { * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. * Only need 16 bits, this is the byte offset into the single block form. */ -typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t; +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; /* * The parent directory has a dedicated field, and the self-pointer must @@ -76,21 +76,20 @@ typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ xfs_dir2_inou_t parent; /* parent dir inode number */ -} xfs_dir2_sf_hdr_t; +} __arch_pack xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { __uint8_t namelen; /* actual name length */ xfs_dir2_sf_off_t offset; /* saved offset */ __uint8_t name[1]; /* name, variable size */ xfs_dir2_inou_t inumber; /* inode number, var. offset */ -} xfs_dir2_sf_entry_t; +} __arch_pack xfs_dir2_sf_entry_t; typedef struct xfs_dir2_sf { xfs_dir2_sf_hdr_t hdr; /* shortform header */ xfs_dir2_sf_entry_t list[1]; /* shortform entries */ } xfs_dir2_sf_t; -#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count) static inline int xfs_dir2_sf_hdr_size(int i8count) { return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ @@ -98,14 +97,11 @@ static inline int xfs_dir2_sf_hdr_size(int i8count) ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); } -#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep) static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep) { return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]; } -#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \ - xfs_dir2_sf_get_inumber(sfp, from) static inline xfs_intino_t xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) { @@ -114,8 +110,6 @@ xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)); } -#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \ - xfs_dir2_sf_put_inumber(sfp,from,to) static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, xfs_dir2_inou_t *to) { @@ -125,24 +119,18 @@ static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, XFS_PUT_DIR_INO8(*(from), (to)->i8); } -#define XFS_DIR2_SF_GET_OFFSET(sfep) \ - xfs_dir2_sf_get_offset(sfep) static inline xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) { return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i); } -#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \ - xfs_dir2_sf_put_offset(sfep,off) static inline void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) { INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off); } -#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \ - xfs_dir2_sf_entsize_byname(sfp,len) static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) { return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ @@ -150,8 +138,6 @@ static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); } -#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \ - xfs_dir2_sf_entsize_byentry(sfp,sfep) static inline int xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) { @@ -160,19 +146,17 @@ xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); } -#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp) static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp) { return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count))); + ((char *)(sfp) + xfs_dir2_sf_hdr_size(sfp->hdr.i8count))); } -#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep) static inline xfs_dir2_sf_entry_t * xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) { return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep))); + ((char *)(sfep) + xfs_dir2_sf_entsize_byentry(sfp,sfep))); } /* @@ -185,9 +169,8 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); -extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio, - int *eofp, struct xfs_dirent *dbp, - xfs_dir2_put_t put); +extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c index f3fb2ff..6cc7c0c 100644 --- a/fs/xfs/xfs_dir2_trace.c +++ b/fs/xfs/xfs_dir2_trace.c @@ -85,7 +85,8 @@ xfs_dir2_trace_args( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, NULL, NULL); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + NULL, NULL); } void @@ -100,7 +101,7 @@ xfs_dir2_trace_args_b( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), (void *)(bp ? bp->bps[0] : NULL), NULL); } @@ -117,7 +118,7 @@ xfs_dir2_trace_args_bb( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), (void *)(lbp ? lbp->bps[0] : NULL), (void *)(dbp ? dbp->bps[0] : NULL)); } @@ -157,8 +158,8 @@ xfs_dir2_trace_args_db( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)db, - (void *)dbp); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + (void *)(long)db, (void *)dbp); } void @@ -173,7 +174,7 @@ xfs_dir2_trace_args_i( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), (void *)((unsigned long)(i >> 32)), (void *)((unsigned long)(i & 0xFFFFFFFF))); } @@ -190,7 +191,8 @@ xfs_dir2_trace_args_s( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + (void *)(long)s, NULL); } void @@ -208,7 +210,7 @@ xfs_dir2_trace_args_sb( (void *)((unsigned long)(args->inumber >> 32)), (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)s, - (void *)dbp); + (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), + (void *)(long)s, (void *)dbp); } #endif /* XFS_DIR2_TRACE */ diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index 4e7865a..2813cdd 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -18,7 +18,6 @@ #ifndef __XFS_DMAPI_H__ #define __XFS_DMAPI_H__ -#include <linux/version.h> /* Values used to define the on-disk version of dm_attrname_t. All * on-disk attribute names start with the 8-byte string "SGI_DMI_". * @@ -67,17 +66,15 @@ typedef enum { #define HAVE_DM_RIGHT_T /* Defines for determining if an event message should be sent. */ -#define DM_EVENT_ENABLED(vfsp, ip, event) ( \ - unlikely ((vfsp)->vfs_flag & VFS_DMI) && \ +#ifdef HAVE_DMAPI +#define DM_EVENT_ENABLED(ip, event) ( \ + unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \ ( ((ip)->i_d.di_dmevmask & (1 << event)) || \ ((ip)->i_mount->m_dmevmask & (1 << event)) ) \ ) - -#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \ - unlikely ((vfsp)->vfs_flag & VFS_DMI) && \ - ( ((io)->io_dmevmask & (1 << event)) || \ - ((io)->io_mount->m_dmevmask & (1 << event)) ) \ - ) +#else +#define DM_EVENT_ENABLED(ip, event) (0) +#endif #define DM_XFS_VALID_FS_EVENTS ( \ (1 << DM_EVENT_PREUNMOUNT) | \ @@ -157,27 +154,9 @@ typedef enum { #define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ /* - * Based on IO_ISDIRECT, decide which i_ flag is set. + * Pull in platform specific event flags defines */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_IMUX : 0) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) -#endif - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22)) -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_IMUX) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) -#endif - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21) -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - 0 : DM_FLAGS_IMUX) -#define DM_SEM_FLAG_WR (DM_FLAGS_IMUX) -#endif - +#include "xfs_dmapi_priv.h" /* * Macros to turn caller specified delay/block flags into @@ -186,9 +165,6 @@ typedef enum { #define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ DM_FLAGS_NDELAY : 0) -#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) - - -extern struct bhv_module_vfsops xfs_dmops; +#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) #endif /* __XFS_DMAPI_H__ */ diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c index 1e4a35d..a1e55fb 100644 --- a/fs/xfs/xfs_dmops.c +++ b/fs/xfs/xfs_dmops.c @@ -19,18 +19,38 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_dmapi.h" +#include "xfs_inum.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_clnt.h" -xfs_dmops_t xfs_dmcore_stub = { + +static struct xfs_dmops xfs_dmcore_stub = { .xfs_send_data = (xfs_send_data_t)fs_nosys, .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr, .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys, .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys, - .xfs_send_unmount = (xfs_send_unmount_t)fs_noval, + .xfs_send_mount = (xfs_send_mount_t)fs_nosys, + .xfs_send_unmount = (xfs_send_unmount_t)fs_noerr, }; + +int +xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) +{ + if (args->flags & XFSMNT_DMAPI) { + cmn_err(CE_WARN, + "XFS: dmapi support not available in this kernel."); + return EINVAL; + } + + mp->m_dm_ops = &xfs_dmcore_stub; + return 0; +} + +void +xfs_dmops_put(struct xfs_mount *mp) +{ +} diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index b95681b..f227ecd 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -22,6 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -57,22 +58,11 @@ xfs_error_trap(int e) } return e; } -#endif - -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; -void -xfs_error_test_init(void) -{ - memset(xfs_etest, 0, sizeof(xfs_etest)); - memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid)); - memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname)); -} - int xfs_error_test(int error_tag, int *fsidp, char *expression, int line, char *file, unsigned long randfactor) @@ -80,7 +70,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, int i; int64_t fsid; - if (random() % randfactor) + if (random32() % randfactor) return 0; memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); @@ -132,36 +122,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) } int -xfs_errortag_clear(int error_tag, xfs_mount_t *mp) +xfs_errortag_clearall(xfs_mount_t *mp, int loud) { - int i; int64_t fsid; + int cleared = 0; + int i; memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { - xfs_etest[i] = 0; - xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i], - strlen(xfs_etest_fsname[i]) + 1); - xfs_etest_fsname[i] = NULL; - cmn_err(CE_WARN, "Cleared XFS error tag #%d", - error_tag); - return 0; - } - } - - cmn_err(CE_WARN, "XFS error tag %d not on", error_tag); - - return 1; -} - -int -xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud) -{ - int i; - int cleared = 0; for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && @@ -171,8 +139,7 @@ xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud) xfs_etest[i]); xfs_etest[i] = 0; xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i], - strlen(xfs_etest_fsname[i]) + 1); + kmem_free(xfs_etest_fsname[i]); xfs_etest_fsname[i] = NULL; } } @@ -180,21 +147,11 @@ xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud) if (loud || cleared) cmn_err(CE_WARN, "Cleared all XFS error tags for filesystem \"%s\"", - fsname); + mp->m_fsname); return 0; } - -int -xfs_errortag_clearall(xfs_mount_t *mp) -{ - int64_t fsid; - - memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); - - return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1); -} -#endif /* DEBUG || INDUCE_IO_ERROR */ +#endif /* DEBUG */ static void xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) @@ -206,7 +163,7 @@ xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) newfmt = kmem_alloc(len, KM_SLEEP); sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt); icmn_err(level, newfmt, ap); - kmem_free(newfmt, len); + kmem_free(newfmt); } else { icmn_err(level, fmt, ap); } @@ -261,37 +218,6 @@ xfs_error_report( } } -STATIC void -xfs_hex_dump(void *p, int length) -{ - __uint8_t *uip = (__uint8_t*)p; - int i; - char sbuf[128], *s; - - s = sbuf; - *s = '\0'; - for (i=0; i<length; i++, uip++) { - if ((i % 16) == 0) { - if (*s != '\0') - cmn_err(CE_ALERT, "%s\n", sbuf); - s = sbuf; - sprintf(s, "0x%x: ", i); - while( *s != '\0') - s++; - } - sprintf(s, "%02x ", *uip); - - /* - * the kernel sprintf is a void; user sprintf returns - * the sprintf'ed string's length. Find the new end- - * of-string - */ - while( *s != '\0') - s++; - } - cmn_err(CE_ALERT, "%s\n", sbuf); -} - void xfs_corruption_error( char *tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index bc43163..11543f1 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -18,14 +18,6 @@ #ifndef __XFS_ERROR_H__ #define __XFS_ERROR_H__ -#define XFS_ERECOVER 1 /* Failure to recover log */ -#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */ -#define XFS_ENOLOGSPACE 3 /* Reservation too large */ -#define XFS_ENOTSUP 4 /* Operation not supported */ -#define XFS_ENOLSN 5 /* Can't find the lsn you asked for */ -#define XFS_ENOTFOUND 6 -#define XFS_ENOTXFS 7 /* Not XFS filesystem */ - #ifdef DEBUG #define XFS_ERROR_NTRAP 10 extern int xfs_etrap[XFS_ERROR_NTRAP]; @@ -133,33 +125,22 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, #define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) +#ifdef DEBUG extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); -extern void xfs_error_test_init(void); #define XFS_NUM_INJECT_ERROR 10 - -#ifdef __ANSI_CPP__ -#define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ - xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \ - (rf))) -#else #define XFS_TEST_ERROR(expr, mp, tag, rf) \ ((expr) || \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf))) -#endif /* __ANSI_CPP__ */ extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clear(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp); -extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); +extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) -#define xfs_errortag_clearall(mp) (ENOSYS) -#endif /* (DEBUG || INDUCE_IO_ERROR) */ +#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#endif /* DEBUG */ /* * XFS panic tags -- allow a call to xfs_cmn_err() be turned into @@ -175,6 +156,7 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); #define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 +#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 struct xfs_mount; /* PRINTFLIKE4 */ @@ -183,10 +165,12 @@ extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, /* PRINTFLIKE3 */ extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); +extern void xfs_hex_dump(void *p, int length); + #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? cmn_err(CE_WARN, "XFS: " fmt, ## args) : (void)0) + ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6cf6d87..8aa28f7 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -23,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" @@ -33,9 +34,6 @@ kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *); -STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *); -STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *); - void xfs_efi_item_free(xfs_efi_log_item_t *efip) @@ -43,8 +41,7 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip) int nexts = efip->efi_format.efi_nextents; if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - kmem_free(efip, sizeof(xfs_efi_log_item_t) + - (nexts - 1) * sizeof(xfs_extent_t)); + kmem_free(efip); } else { kmem_zone_free(xfs_efi_zone, efip); } @@ -112,19 +109,18 @@ STATIC void xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) { xfs_mount_t *mp; - SPLDECL(s); mp = efip->efi_item.li_mountp; - AIL_LOCK(mp, s); + spin_lock(&mp->m_ail_lock); if (efip->efi_flags & XFS_EFI_CANCELED) { /* * xfs_trans_delete_ail() drops the AIL lock. */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip); xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); } } @@ -140,10 +136,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) { xfs_mount_t *mp; xfs_log_item_desc_t *lidp; - SPLDECL(s); mp = efip->efi_item.li_mountp; - AIL_LOCK(mp, s); + spin_lock(&mp->m_ail_lock); if (efip->efi_flags & XFS_EFI_CANCELED) { /* * free the xaction descriptor pointing to this item @@ -154,11 +149,11 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) * pull the item off the AIL. * xfs_trans_delete_ail() drops the AIL lock. */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip); xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); } } @@ -184,7 +179,7 @@ STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *efip) { if (efip->efi_item.li_flags & XFS_LI_ABORTED) - xfs_efi_item_abort(efip); + xfs_efi_item_free(efip); return; } @@ -202,18 +197,6 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) } /* - * This is called when the transaction logging the EFI is aborted. - * Free up the EFI and return. No need to clean up the slot for - * the item in the transaction. That was done by the unpin code - * which is called prior to this routine in the abort/fs-shutdown path. - */ -STATIC void -xfs_efi_item_abort(xfs_efi_log_item_t *efip) -{ - xfs_efi_item_free(efip); -} - -/* * There isn't much you can do to push on an efi item. It is simply * stuck waiting for all of its corresponding efd items to be * committed to disk. @@ -242,7 +225,7 @@ xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) /* * This is the ops vector shared by all efi log items. */ -STATIC struct xfs_item_ops xfs_efi_item_ops = { +static struct xfs_item_ops xfs_efi_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efi_item_format, @@ -255,7 +238,6 @@ STATIC struct xfs_item_ops xfs_efi_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efi_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efi_item_committing @@ -365,13 +347,12 @@ xfs_efi_release(xfs_efi_log_item_t *efip, { xfs_mount_t *mp; int extents_left; - SPLDECL(s); mp = efip->efi_item.li_mountp; ASSERT(efip->efi_next_extent > 0); ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); - AIL_LOCK(mp, s); + spin_lock(&mp->m_ail_lock); ASSERT(efip->efi_next_extent >= nextents); efip->efi_next_extent -= nextents; extents_left = efip->efi_next_extent; @@ -379,37 +360,10 @@ xfs_efi_release(xfs_efi_log_item_t *efip, /* * xfs_trans_delete_ail() drops the AIL lock. */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip); xfs_efi_item_free(efip); } else { - AIL_UNLOCK(mp, s); - } -} - -/* - * This is called when the transaction that should be committing the - * EFD corresponding to the given EFI is aborted. The committed and - * canceled flags are used to coordinate the freeing of the EFI and - * the references by the transaction that committed it. - */ -STATIC void -xfs_efi_cancel( - xfs_efi_log_item_t *efip) -{ - xfs_mount_t *mp; - SPLDECL(s); - - mp = efip->efi_item.li_mountp; - AIL_LOCK(mp, s); - if (efip->efi_flags & XFS_EFI_COMMITTED) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_CANCELED; - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); } } @@ -419,8 +373,7 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp) int nexts = efdp->efd_format.efd_nextents; if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { - kmem_free(efdp, sizeof(xfs_efd_log_item_t) + - (nexts - 1) * sizeof(xfs_extent_t)); + kmem_free(efdp); } else { kmem_zone_free(xfs_efd_zone, efdp); } @@ -514,7 +467,7 @@ STATIC void xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) { if (efdp->efd_item.li_flags & XFS_LI_ABORTED) - xfs_efd_item_abort(efdp); + xfs_efd_item_free(efdp); return; } @@ -541,27 +494,6 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) } /* - * The transaction of which this EFD is a part has been aborted. - * Inform its companion EFI of this fact and then clean up after - * ourselves. No need to clean up the slot for the item in the - * transaction. That was done by the unpin code which is called - * prior to this routine in the abort/fs-shutdown path. - */ -STATIC void -xfs_efd_item_abort(xfs_efd_log_item_t *efdp) -{ - /* - * If we got a log I/O error, it's always the case that the LR with the - * EFI got unpinned and freed before the EFD got aborted. So don't - * reference the EFI at all in that case. - */ - if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) - xfs_efi_cancel(efdp->efd_efip); - - xfs_efd_item_free(efdp); -} - -/* * There isn't much you can do to push on an efd item. It is simply * stuck waiting for the log to be flushed to disk. */ @@ -589,7 +521,7 @@ xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn) /* * This is the ops vector shared by all efd log items. */ -STATIC struct xfs_item_ops xfs_efd_item_ops = { +static struct xfs_item_ops xfs_efd_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efd_item_format, @@ -602,7 +534,6 @@ STATIC struct xfs_item_ops xfs_efd_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efd_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efd_item_committing diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0ea45ed..2f049f6 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -33,14 +33,16 @@ typedef struct xfs_extent { * conversion routine. */ +#ifndef HAVE_FORMAT32 typedef struct xfs_extent_32 { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; + __uint64_t ext_start; + __uint32_t ext_len; } __attribute__((packed)) xfs_extent_32_t; +#endif typedef struct xfs_extent_64 { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; + __uint64_t ext_start; + __uint32_t ext_len; __uint32_t ext_pad; } xfs_extent_64_t; @@ -50,25 +52,27 @@ typedef struct xfs_extent_64 { * size is given by efi_nextents. */ typedef struct xfs_efi_log_format { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ __uint64_t efi_id; /* efi identifier */ xfs_extent_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_t; +#ifndef HAVE_FORMAT32 typedef struct xfs_efi_log_format_32 { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ __uint64_t efi_id; /* efi identifier */ xfs_extent_32_t efi_extents[1]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; +#endif typedef struct xfs_efi_log_format_64 { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ __uint64_t efi_id; /* efi identifier */ xfs_extent_64_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_64_t; @@ -79,25 +83,27 @@ typedef struct xfs_efi_log_format_64 { * size is given by efd_nextents; */ typedef struct xfs_efd_log_format { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_t; +#ifndef HAVE_FORMAT32 typedef struct xfs_efd_log_format_32 { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_32_t efd_extents[1]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; +#endif typedef struct xfs_efd_log_format_64 { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_64_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_64_t; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c new file mode 100644 index 0000000..f3bb75d --- /dev/null +++ b/fs/xfs/xfs_filestream.c @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bmap_btree.h" +#include "xfs_inum.h" +#include "xfs_dir2.h" +#include "xfs_dir2_sf.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_utils.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" + +#ifdef XFS_FILESTREAMS_TRACE + +ktrace_t *xfs_filestreams_trace_buf; + +STATIC void +xfs_filestreams_trace( + xfs_mount_t *mp, /* mount point */ + int type, /* type of trace */ + const char *func, /* source function */ + int line, /* source line number */ + __psunsigned_t arg0, + __psunsigned_t arg1, + __psunsigned_t arg2, + __psunsigned_t arg3, + __psunsigned_t arg4, + __psunsigned_t arg5) +{ + ktrace_enter(xfs_filestreams_trace_buf, + (void *)(__psint_t)(type | (line << 16)), + (void *)func, + (void *)(__psunsigned_t)current_pid(), + (void *)mp, + (void *)(__psunsigned_t)arg0, + (void *)(__psunsigned_t)arg1, + (void *)(__psunsigned_t)arg2, + (void *)(__psunsigned_t)arg3, + (void *)(__psunsigned_t)arg4, + (void *)(__psunsigned_t)arg5, + NULL, NULL, NULL, NULL, NULL, NULL); +} + +#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) +#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) +#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) +#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) +#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) +#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) +#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ + xfs_filestreams_trace(mp, t, __func__, __LINE__, \ + (__psunsigned_t)a0, (__psunsigned_t)a1, \ + (__psunsigned_t)a2, (__psunsigned_t)a3, \ + (__psunsigned_t)a4, (__psunsigned_t)a5) + +#define TRACE_AG_SCAN(mp, ag, ag2) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); +#define TRACE_AG_PICK1(mp, max_ag, maxfree) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ + TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ + cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ + TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ + TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) \ + TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); + + +#else +#define TRACE_AG_SCAN(mp, ag, ag2) +#define TRACE_AG_PICK1(mp, max_ag, maxfree) +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) +#endif + +static kmem_zone_t *item_zone; + +/* + * Structure for associating a file or a directory with an allocation group. + * The parent directory pointer is only needed for files, but since there will + * generally be vastly more files than directories in the cache, using the same + * data structure simplifies the code with very little memory overhead. + */ +typedef struct fstrm_item +{ + xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ + xfs_inode_t *ip; /* inode self-pointer. */ + xfs_inode_t *pip; /* Parent directory inode pointer. */ +} fstrm_item_t; + + +/* + * Scan the AGs starting at startag looking for an AG that isn't in use and has + * at least minlen blocks free. + */ +static int +_xfs_filestream_pick_ag( + xfs_mount_t *mp, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) +{ + int err, trylock, nscan; + xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + struct xfs_perag *pag; + + /* 2% of an AG's blocks must be free for it to be chosen. */ + minfree = mp->m_sb.sb_agblocks / 50; + + ag = startag; + *agp = NULLAGNUMBER; + + /* For the first pass, don't sleep trying to init the per-AG. */ + trylock = XFS_ALLOC_FLAG_TRYLOCK; + + for (nscan = 0; 1; nscan++) { + + TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); + + pag = mp->m_perag + ag; + + if (!pag->pagf_init) { + err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + if (err && !trylock) + return err; + } + + /* Might fail sometimes during the 1st pass with trylock set. */ + if (!pag->pagf_init) + goto next_ag; + + /* Keep track of the AG with the most free blocks. */ + if (pag->pagf_freeblks > maxfree) { + maxfree = pag->pagf_freeblks; + max_ag = ag; + } + + /* + * The AG reference count does two things: it enforces mutual + * exclusion when examining the suitability of an AG in this + * loop, and it guards against two filestreams being established + * in the same AG as each other. + */ + if (xfs_filestream_get_ag(mp, ag) > 1) { + xfs_filestream_put_ag(mp, ag); + goto next_ag; + } + + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + + if (((minlen && longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + *agp = ag; + break; + } + + /* Drop the reference on this AG, it's not usable. */ + xfs_filestream_put_ag(mp, ag); +next_ag: + /* Move to the next AG, wrapping to AG 0 if necessary. */ + if (++ag >= mp->m_sb.sb_agcount) + ag = 0; + + /* If a full pass of the AGs hasn't been done yet, continue. */ + if (ag != startag) + continue; + + /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + if (trylock != 0) { + trylock = 0; + continue; + } + + /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + continue; + } + + /* + * Take the AG with the most free space, regardless of whether + * it's already in use by another filestream. + */ + if (max_ag != NULLAGNUMBER) { + xfs_filestream_get_ag(mp, max_ag); + TRACE_AG_PICK1(mp, max_ag, maxfree); + free = maxfree; + *agp = max_ag; + break; + } + + /* take AG 0 if none matched */ + TRACE_AG_PICK1(mp, max_ag, maxfree); + *agp = 0; + return 0; + } + + TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), + free, nscan, flags); + + return 0; +} + +/* + * Set the allocation group number for a file or a directory, updating inode + * references and per-AG references as appropriate. Must be called with the + * m_peraglock held in read mode. + */ +static int +_xfs_filestream_update_ag( + xfs_inode_t *ip, + xfs_inode_t *pip, + xfs_agnumber_t ag) +{ + int err = 0; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t old_ag; + xfs_inode_t *old_pip; + + /* + * Either ip is a regular file and pip is a directory, or ip is a + * directory and pip is NULL. + */ + ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && + (pip->i_d.di_mode & S_IFDIR)) || + ((ip->i_d.di_mode & S_IFDIR) && !pip))); + + mp = ip->i_mount; + cache = mp->m_filestream; + + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (item) { + ASSERT(item->ip == ip); + old_ag = item->ag; + item->ag = ag; + old_pip = item->pip; + item->pip = pip; + xfs_mru_cache_done(cache); + + /* + * If the AG has changed, drop the old ref and take a new one, + * effectively transferring the reference from old to new AG. + */ + if (ag != old_ag) { + xfs_filestream_put_ag(mp, old_ag); + xfs_filestream_get_ag(mp, ag); + } + + /* + * If ip is a file and its pip has changed, drop the old ref and + * take a new one. + */ + if (pip && pip != old_pip) { + IRELE(old_pip); + IHOLD(pip); + } + + TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), + ag, xfs_filestream_peek_ag(mp, ag)); + return 0; + } + + item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); + if (!item) + return ENOMEM; + + item->ag = ag; + item->ip = ip; + item->pip = pip; + + err = xfs_mru_cache_insert(cache, ip->i_ino, item); + if (err) { + kmem_zone_free(item_zone, item); + return err; + } + + /* Take a reference on the AG. */ + xfs_filestream_get_ag(mp, ag); + + /* + * Take a reference on the inode itself regardless of whether it's a + * regular file or a directory. + */ + IHOLD(ip); + + /* + * In the case of a regular file, take a reference on the parent inode + * as well to ensure it remains in-core. + */ + if (pip) + IHOLD(pip); + + TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), + ag, xfs_filestream_peek_ag(mp, ag)); + + return 0; +} + +/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ +STATIC void +xfs_fstrm_free_func( + unsigned long ino, + void *data) +{ + fstrm_item_t *item = (fstrm_item_t *)data; + xfs_inode_t *ip = item->ip; + int ref; + + ASSERT(ip->i_ino == ino); + + xfs_iflags_clear(ip, XFS_IFILESTREAM); + + /* Drop the reference taken on the AG when the item was added. */ + ref = xfs_filestream_put_ag(ip->i_mount, item->ag); + + ASSERT(ref >= 0); + TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, + xfs_filestream_peek_ag(ip->i_mount, item->ag)); + + /* + * _xfs_filestream_update_ag() always takes a reference on the inode + * itself, whether it's a file or a directory. Release it here. + * This can result in the inode being freed and so we must + * not hold any inode locks when freeing filesstreams objects + * otherwise we can deadlock here. + */ + IRELE(ip); + + /* + * In the case of a regular file, _xfs_filestream_update_ag() also + * takes a ref on the parent inode to keep it in-core. Release that + * too. + */ + if (item->pip) + IRELE(item->pip); + + /* Finally, free the memory allocated for the item. */ + kmem_zone_free(item_zone, item); +} + +/* + * xfs_filestream_init() is called at xfs initialisation time to set up the + * memory zone that will be used for filestream data structure allocation. + */ +int +xfs_filestream_init(void) +{ + item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); + if (!item_zone) + return -ENOMEM; +#ifdef XFS_FILESTREAMS_TRACE + xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS); +#endif + return 0; +} + +/* + * xfs_filestream_uninit() is called at xfs termination time to destroy the + * memory zone that was used for filestream data structure allocation. + */ +void +xfs_filestream_uninit(void) +{ +#ifdef XFS_FILESTREAMS_TRACE + ktrace_free(xfs_filestreams_trace_buf); +#endif + kmem_zone_destroy(item_zone); +} + +/* + * xfs_filestream_mount() is called when a file system is mounted with the + * filestream option. It is responsible for allocating the data structures + * needed to track the new file system's file streams. + */ +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + int err; + unsigned int lifetime, grp_count; + + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + lifetime = xfs_fstrm_centisecs * 10; + grp_count = 10; + + err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, + xfs_fstrm_free_func); + + return err; +} + +/* + * xfs_filestream_unmount() is called when a file system that was mounted with + * the filestream option is unmounted. It drains the data structures created + * to track the file system's file streams and frees all the memory that was + * allocated. + */ +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); +} + +/* + * If the mount point's m_perag array is going to be reallocated, all + * outstanding cache entries must be flushed to avoid accessing reference count + * addresses that have been freed. The call to xfs_filestream_flush() must be + * made inside the block that holds the m_peraglock in write mode to do the + * reallocation. + */ +void +xfs_filestream_flush( + xfs_mount_t *mp) +{ + xfs_mru_cache_flush(mp->m_filestream); +} + +/* + * Return the AG of the filestream the file or directory belongs to, or + * NULLAGNUMBER otherwise. + */ +xfs_agnumber_t +xfs_filestream_lookup_ag( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag; + int ref; + + if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + ASSERT(0); + return NULLAGNUMBER; + } + + cache = ip->i_mount->m_filestream; + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (!item) { + TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); + return NULLAGNUMBER; + } + + ASSERT(ip == item->ip); + ag = item->ag; + ref = xfs_filestream_peek_ag(ip->i_mount, ag); + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); + return ag; +} + +/* + * xfs_filestream_associate() should only be called to associate a regular file + * with its parent directory. Calling it with a child directory isn't + * appropriate because filestreams don't apply to entire directory hierarchies. + * Creating a file in a child directory of an existing filestream directory + * starts a new filestream with its own allocation group association. + * + * Returns < 0 on error, 0 if successful association occurred, > 0 if + * we failed to get an association because of locking issues. + */ +int +xfs_filestream_associate( + xfs_inode_t *pip, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag, rotorstep, startag; + int err = 0; + + ASSERT(pip->i_d.di_mode & S_IFDIR); + ASSERT(ip->i_d.di_mode & S_IFREG); + if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + return -EINVAL; + + mp = pip->i_mount; + cache = mp->m_filestream; + down_read(&mp->m_peraglock); + + /* + * We have a problem, Houston. + * + * Taking the iolock here violates inode locking order - we already + * hold the ilock. Hence if we block getting this lock we may never + * wake. Unfortunately, that means if we can't get the lock, we're + * screwed in terms of getting a stream association - we can't spin + * waiting for the lock because someone else is waiting on the lock we + * hold and we cannot drop that as we are in a transaction here. + * + * Lucky for us, this inversion is rarely a problem because it's a + * directory inode that we are trying to lock here and that means the + * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is + * used. i.e. freeze, remount-ro, quotasync or unmount. + * + * So, if we can't get the iolock without sleeping then just give up + */ + if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { + up_read(&mp->m_peraglock); + return 1; + } + + /* If the parent directory is already in the cache, use its AG. */ + item = xfs_mru_cache_lookup(cache, pip->i_ino); + if (item) { + ASSERT(item->ip == pip); + ag = item->ag; + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + err = _xfs_filestream_update_ag(ip, pip, ag); + + goto exit; + } + + /* + * Set the starting AG using the rotor for inode32, otherwise + * use the directory inode's AG. + */ + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + rotorstep = xfs_rotorstep; + startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } else + startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + + /* Pick a new AG for the parent inode starting at startag. */ + err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); + if (err || ag == NULLAGNUMBER) + goto exit_did_pick; + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, ag); + if (err) + goto exit_did_pick; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, ag); + if (err) + goto exit_did_pick; + + TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + +exit_did_pick: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. + */ + if (ag != NULLAGNUMBER) + xfs_filestream_put_ag(mp, ag); + +exit: + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + up_read(&mp->m_peraglock); + return -err; +} + +/* + * Pick a new allocation group for the current file and its file stream. This + * function is called by xfs_bmap_filestreams() with the mount point's per-ag + * lock held. + */ +int +xfs_filestream_new_ag( + xfs_bmalloca_t *ap, + xfs_agnumber_t *agp) +{ + int flags, err; + xfs_inode_t *ip, *pip = NULL; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + xfs_extlen_t minlen; + fstrm_item_t *dir, *file; + xfs_agnumber_t ag = NULLAGNUMBER; + + ip = ap->ip; + mp = ip->i_mount; + cache = mp->m_filestream; + minlen = ap->alen; + *agp = NULLAGNUMBER; + + /* + * Look for the file in the cache, removing it if it's found. Doing + * this allows it to be held across the dir lookup that follows. + */ + file = xfs_mru_cache_remove(cache, ip->i_ino); + if (file) { + ASSERT(ip == file->ip); + + /* Save the file's parent inode and old AG number for later. */ + pip = file->pip; + ag = file->ag; + + /* Look for the file's directory in the cache. */ + dir = xfs_mru_cache_lookup(cache, pip->i_ino); + if (dir) { + ASSERT(pip == dir->ip); + + /* + * If the directory has already moved on to a new AG, + * use that AG as the new AG for the file. Don't + * forget to twiddle the AG refcounts to match the + * movement. + */ + if (dir->ag != file->ag) { + xfs_filestream_put_ag(mp, file->ag); + xfs_filestream_get_ag(mp, dir->ag); + *agp = file->ag = dir->ag; + } + + xfs_mru_cache_done(cache); + } + + /* + * Put the file back in the cache. If this fails, the free + * function needs to be called to tidy up in the same way as if + * the item had simply expired from the cache. + */ + err = xfs_mru_cache_insert(cache, ip->i_ino, file); + if (err) { + xfs_fstrm_free_func(ip->i_ino, file); + return err; + } + + /* + * If the file's AG was moved to the directory's new AG, there's + * nothing more to be done. + */ + if (*agp != NULLAGNUMBER) { + TRACE_MOVEAG(mp, ip, pip, + ag, xfs_filestream_peek_ag(mp, ag), + *agp, xfs_filestream_peek_ag(mp, *agp)); + return 0; + } + } + + /* + * If the file's parent directory is known, take its iolock in exclusive + * mode to prevent two sibling files from racing each other to migrate + * themselves and their parent to different AGs. + */ + if (pip) + xfs_ilock(pip, XFS_IOLOCK_EXCL); + + /* + * A new AG needs to be found for the file. If the file's parent + * directory is also known, it will be moved to the new AG as well to + * ensure that files created inside it in future use the new AG. + */ + ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; + flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | + (ap->low ? XFS_PICK_LOWSPACE : 0); + + err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); + if (err || *agp == NULLAGNUMBER) + goto exit; + + /* + * If the file wasn't found in the file cache, then its parent directory + * inode isn't known. For this to have happened, the file must either + * be pre-existing, or it was created long enough ago that its cache + * entry has expired. This isn't the sort of usage that the filestreams + * allocator is trying to optimise, so there's no point trying to track + * its new AG somehow in the filestream data structures. + */ + if (!pip) { + TRACE_ORPHAN(mp, ip, *agp); + goto exit; + } + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, *agp); + if (err) + goto exit; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, *agp); + if (err) + goto exit; + + TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, + *agp, xfs_filestream_peek_ag(mp, *agp)); + +exit: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. + */ + if (*agp != NULLAGNUMBER) + xfs_filestream_put_ag(mp, *agp); + else + *agp = 0; + + if (pip) + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + + return err; +} + +/* + * Remove an association between an inode and a filestream object. + * Typically this is done on last close of an unlinked file. + */ +void +xfs_filestream_deassociate( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache = ip->i_mount->m_filestream; + + xfs_mru_cache_delete(cache, ip->i_ino); +} diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h new file mode 100644 index 0000000..f655f7d --- /dev/null +++ b/fs/xfs/xfs_filestream.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FILESTREAM_H__ +#define __XFS_FILESTREAM_H__ + +#ifdef __KERNEL__ + +struct xfs_mount; +struct xfs_inode; +struct xfs_perag; +struct xfs_bmalloca; + +#ifdef XFS_FILESTREAMS_TRACE +#define XFS_FSTRM_KTRACE_INFO 1 +#define XFS_FSTRM_KTRACE_AGSCAN 2 +#define XFS_FSTRM_KTRACE_AGPICK1 3 +#define XFS_FSTRM_KTRACE_AGPICK2 4 +#define XFS_FSTRM_KTRACE_UPDATE 5 +#define XFS_FSTRM_KTRACE_FREE 6 +#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 +#define XFS_FSTRM_KTRACE_ASSOCIATE 8 +#define XFS_FSTRM_KTRACE_MOVEAG 9 +#define XFS_FSTRM_KTRACE_ORPHAN 10 + +#define XFS_FSTRM_KTRACE_SIZE 16384 +extern ktrace_t *xfs_filestreams_trace_buf; + +#endif + +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. + * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +STATIC_INLINE int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + return atomic_read(&mp->m_perag[agno].pagf_fstrms); +} + +STATIC_INLINE int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); +} + +STATIC_INLINE int +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); +} + +/* allocation selection flags */ +typedef enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +} xfs_fstrm_alloc_t; + +/* prototypes for filestream.c */ +int xfs_filestream_init(void); +void xfs_filestream_uninit(void); +int xfs_filestream_mount(struct xfs_mount *mp); +void xfs_filestream_unmount(struct xfs_mount *mp); +void xfs_filestream_flush(struct xfs_mount *mp); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); +int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); +void xfs_filestream_deassociate(struct xfs_inode *ip); +int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); + + +/* filestreams for the inode? */ +STATIC_INLINE int +xfs_inode_is_filestream( + struct xfs_inode *ip) +{ + return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + xfs_iflags_test(ip, XFS_IFILESTREAM) || + (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); +} + +#endif /* __KERNEL__ */ + +#endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0f0ad15..01c0cc8 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -22,8 +22,6 @@ * SGI's XFS filesystem's major stuff (constants, structures) */ -#define XFS_NAME "xfs" - /* * Direct I/O attribute record used with XFS_IOC_DIOINFO * d_miniosz is the min xfer size, xfer size multiple and file seek offset @@ -68,6 +66,7 @@ struct fsxattr { #define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* @@ -240,6 +239,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ /* @@ -371,6 +372,9 @@ typedef struct xfs_fsop_attrlist_handlereq { typedef struct xfs_attr_multiop { __u32 am_opcode; +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ __s32 am_error; void __user *am_attrname; void __user *am_attrvalue; @@ -389,30 +393,13 @@ typedef struct xfs_fsop_attrmulti_handlereq { */ typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ - -#ifndef HAVE_FID -#define MAXFIDSZ 46 - -typedef struct fid { - __u16 fid_len; /* length of data in bytes */ - unsigned char fid_data[MAXFIDSZ]; /* data (fid_len worth) */ -} fid_t; -#endif - typedef struct xfs_fid { - __u16 xfs_fid_len; /* length of remainder */ - __u16 xfs_fid_pad; - __u32 xfs_fid_gen; /* generation number */ - __u64 xfs_fid_ino; /* 64 bits inode number */ + __u16 fid_len; /* length of remainder */ + __u16 fid_pad; + __u32 fid_gen; /* generation number */ + __u64 fid_ino; /* 64 bits inode number */ } xfs_fid_t; -typedef struct xfs_fid2 { - __u16 fid_len; /* length of remainder */ - __u16 fid_pad; /* padding, must be zero */ - __u32 fid_gen; /* generation number */ - __u64 fid_ino; /* inode number */ -} xfs_fid2_t; - typedef struct xfs_handle { union { __s64 align; /* force alignment of ha_fid */ @@ -422,15 +409,11 @@ typedef struct xfs_handle { } xfs_handle_t; #define ha_fsid ha_u._ha_fsid -#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.xfs_fid_pad \ +#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ - (char *) &(handle)) \ - + (handle).ha_fid.xfs_fid_len) - -#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t)) + + (handle).ha_fid.fid_len) -#define FSHSIZE sizeof(fsid_t) - -/* +/* * Flags for going down operation */ #define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -440,9 +423,13 @@ typedef struct xfs_handle { /* * ioctl commands that are used by Linux filesystems */ -#define XFS_IOC_GETXFLAGS _IOR('f', 1, long) -#define XFS_IOC_SETXFLAGS _IOW('f', 2, long) -#define XFS_IOC_GETVERSION _IOR('v', 1, long) +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS +#define XFS_IOC_GETVERSION FS_IOC_GETVERSION +/* 32-bit compat counterparts */ +#define XFS_IOC32_GETXFLAGS FS_IOC32_GETFLAGS +#define XFS_IOC32_SETXFLAGS FS_IOC32_SETFLAGS +#define XFS_IOC32_GETVERSION FS_IOC32_GETVERSION /* * ioctl commands that replace IRIX fcntl()'s diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c064e72..84583cf 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -44,6 +44,7 @@ #include "xfs_trans_space.h" #include "xfs_rtalloc.h" #include "xfs_rw.h" +#include "xfs_filestream.h" /* * File system operations @@ -76,34 +77,38 @@ xfs_fs_geometry( if (new_version >= 3) { geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = - (XFS_SB_VERSION_HASATTR(&mp->m_sb) ? + (xfs_sb_version_hasattr(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR : 0) | - (XFS_SB_VERSION_HASNLINK(&mp->m_sb) ? + (xfs_sb_version_hasnlink(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_NLINK : 0) | - (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ? + (xfs_sb_version_hasquota(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | - (XFS_SB_VERSION_HASALIGN(&mp->m_sb) ? + (xfs_sb_version_hasalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | - (XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ? + (xfs_sb_version_hasdalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | - (XFS_SB_VERSION_HASSHARED(&mp->m_sb) ? + (xfs_sb_version_hasshared(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SHARED : 0) | - (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ? + (xfs_sb_version_hasextflgbit(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | - (XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ? + (xfs_sb_version_hasdirv2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | - (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? + (xfs_sb_version_hassector(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | - (XFS_SB_VERSION_HASATTR2(&mp->m_sb) ? + (xfs_sb_version_hasasciici(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | + (xfs_sb_version_haslazysbcount(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | + (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); - geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? + geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; geo->dirblocksize = mp->m_dirblksize; } if (new_version >= 4) { geo->flags |= - (XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ? + (xfs_sb_version_haslogv2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); geo->logsunit = mp->m_sb.sb_logsunit; } @@ -133,13 +138,14 @@ xfs_growfs_data_private( xfs_rfsblock_t nfree; xfs_agnumber_t oagcount; int pct; - xfs_sb_t *sbp; xfs_trans_t *tp; nb = in->newblocks; pct = in->imaxpct; if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + return error; dpct = pct - mp->m_sb.sb_imax_pct; error = xfs_read_buf(mp, mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), @@ -161,6 +167,7 @@ xfs_growfs_data_private( new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; if (nagcount > oagcount) { + xfs_filestream_flush(mp); down_write(&mp->m_peraglock); mp->m_perag = kmem_realloc(mp->m_perag, sizeof(xfs_perag_t) * nagcount, @@ -169,10 +176,11 @@ xfs_growfs_data_private( memset(&mp->m_perag[oagcount], 0, (nagcount - oagcount) * sizeof(xfs_perag_t)); mp->m_flags |= XFS_MOUNT_32BITINODES; - nagimax = xfs_initialize_perag(XFS_MTOVFS(mp), mp, nagcount); + nagimax = xfs_initialize_perag(mp, nagcount); up_write(&mp->m_peraglock); } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); + tp->t_flags |= XFS_TRANS_RESERVE; if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { xfs_trans_cancel(tp, 0); @@ -250,8 +258,7 @@ xfs_growfs_data_private( block->bb_numrecs = cpu_to_be16(1); block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, - block, 1, mp->m_alloc_mxr[0]); + arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -272,8 +279,7 @@ xfs_growfs_data_private( block->bb_numrecs = cpu_to_be16(1); block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, - block, 1, mp->m_alloc_mxr[0]); + arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -314,7 +320,7 @@ xfs_growfs_data_private( } ASSERT(bp); agi = XFS_BUF_TO_AGI(bp); - be32_add(&agi->agi_length, new); + be32_add_cpu(&agi->agi_length, new); ASSERT(nagcount == oagcount || be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); @@ -327,9 +333,10 @@ xfs_growfs_data_private( } ASSERT(bp); agf = XFS_BUF_TO_AGF(bp); - be32_add(&agf->agf_length, new); + be32_add_cpu(&agf->agf_length, new); ASSERT(be32_to_cpu(agf->agf_length) == be32_to_cpu(agi->agi_length)); + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); /* * Free the new space. */ @@ -348,7 +355,7 @@ xfs_growfs_data_private( xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); if (error) { return error; } @@ -371,8 +378,7 @@ xfs_growfs_data_private( error, agno); break; } - sbp = XFS_BUF_TO_SBP(bp); - xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); /* * If we get an error writing out the alternate superblocks, * just issue a warning and continue. The real work is @@ -429,10 +435,10 @@ xfs_growfs_data( xfs_growfs_data_t *in) { int error; - if (!cpsema(&mp->m_growlock)) + if (!mutex_trylock(&mp->m_growlock)) return XFS_ERROR(EWOULDBLOCK); error = xfs_growfs_data_private(mp, in); - vsema(&mp->m_growlock); + mutex_unlock(&mp->m_growlock); return error; } @@ -442,10 +448,10 @@ xfs_growfs_log( xfs_growfs_log_t *in) { int error; - if (!cpsema(&mp->m_growlock)) + if (!mutex_trylock(&mp->m_growlock)) return XFS_ERROR(EWOULDBLOCK); error = xfs_growfs_log_private(mp, in); - vsema(&mp->m_growlock); + mutex_unlock(&mp->m_growlock); return error; } @@ -458,15 +464,13 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - unsigned long s; - - xfs_icsb_sync_counters_lazy(mp); - s = XFS_SB_LOCK(mp); + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + spin_lock(&mp->m_sb_lock); cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; cnt->freeino = mp->m_sb.sb_ifree; cnt->allocino = mp->m_sb.sb_icount; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); return 0; } @@ -491,30 +495,48 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta; + __int64_t lcounter, delta, fdblks_delta; __uint64_t request; - unsigned long s; /* If inval is null, report current values and return */ - if (inval == (__uint64_t *)NULL) { + if (!outval) + return EINVAL; outval->resblks = mp->m_resblks; outval->resblks_avail = mp->m_resblks_avail; return 0; } request = *inval; - s = XFS_SB_LOCK(mp); + + /* + * With per-cpu counters, this becomes an interesting + * problem. we needto work out if we are freeing or allocation + * blocks first, then we can do the modification as necessary. + * + * We do this under the m_sb_lock so that if we are near + * ENOSPC, we will hold out any changes while we work out + * what to do. This means that the amount of free space can + * change while we do this, so we need to retry if we end up + * trying to reserve more space than is available. + * + * We also use the xfs_mod_incore_sb() interface so that we + * don't have to care about whether per cpu counter are + * enabled, disabled or even compiled in.... + */ +retry: + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, * then move any unused blocks back to the free pool. */ - + fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ - mp->m_sb.sb_fdblocks += lcounter; + fdblks_delta = lcounter; mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; @@ -522,24 +544,48 @@ xfs_reserve_blocks( __int64_t free; free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + if (!free) + goto out; /* ENOSPC and fdblks_delta = 0 */ + delta = request - mp->m_resblks; lcounter = free - delta; if (lcounter < 0) { /* We can't satisfy the request, just get what we can */ mp->m_resblks += free; mp->m_resblks_avail += free; - mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp); + fdblks_delta = -free; } else { - mp->m_sb.sb_fdblocks = - lcounter + XFS_ALLOC_SET_ASIDE(mp); + fdblks_delta = -delta; mp->m_resblks = request; mp->m_resblks_avail += delta; } } +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + spin_unlock(&mp->m_sb_lock); - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - XFS_SB_UNLOCK(mp, s); + if (fdblks_delta) { + /* + * If we are putting blocks back here, m_resblks_avail is + * already at it's max so this will put it in the free pool. + * + * If we need space, we'll either succeed in getting it + * from the free block count or we'll get an enospc. If + * we get a ENOSPC, it means things changed while we were + * calculating fdblks_delta and so we should try again to + * see if there is anything left to reserve. + * + * Don't set the reserved flag here - we don't want to reserve + * the extra reserve blocks from the reserve..... + */ + int error; + error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); + if (error == ENOSPC) + goto retry; + } return 0; } @@ -563,7 +609,7 @@ xfs_fs_log_dummy( xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0, NULL); + xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -575,14 +621,13 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); - struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev); + struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); if (sb && !IS_ERR(sb)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); thaw_bdev(sb->s_bdev, sb); } - + break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 33164a8..aad8c5d 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -107,6 +107,16 @@ xfs_ialloc_log_di( /* * Allocation group level functions. */ +static inline int +xfs_ialloc_cluster_alignment( + xfs_alloc_arg_t *args) +{ + if (xfs_sb_version_hasalign(&args->mp->m_sb) && + args->mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + return args->mp->m_sb.sb_inoalignmt; + return 1; +} /* * Allocate new inodes in the allocation group specified by agbp. @@ -123,6 +133,7 @@ xfs_ialloc_ag_alloc( int blks_per_cluster; /* fs blocks per inode cluster */ xfs_btree_cur_t *cur; /* inode btree cursor */ xfs_daddr_t d; /* disk addr of buffer */ + xfs_agnumber_t agno; int error; xfs_buf_t *fbuf; /* new free inodes' buffer */ xfs_dinode_t *free; /* new free inode structure */ @@ -136,6 +147,7 @@ xfs_ialloc_ag_alloc( int version; /* inode version number to use */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + unsigned int gen; args.tp = tp; args.mp = tp->t_mountp; @@ -166,10 +178,24 @@ xfs_ialloc_ag_alloc( args.mod = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; args.prod = 1; - args.alignment = 1; + /* - * Allow space for the inode btree to split. + * We need to take into account alignment here to ensure that + * we don't modify the free list if we fail to have an exact + * block. If we don't have an exact match, and every oher + * attempt allocation attempt fails, we'll end up cancelling + * a dirty transaction and shutting down. + * + * For an exact allocation, alignment must be 1, + * however we need to take cluster alignment into account when + * fixing up the freelist. Use the minalignslop field to + * indicate that extra blocks might be required for alignment, + * but not to use them in the actual exact allocation. */ + args.alignment = 1; + args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; + + /* Allow space for the inode btree to split. */ args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; if ((error = xfs_alloc_vextent(&args))) return error; @@ -190,13 +216,8 @@ xfs_ialloc_ag_alloc( ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); args.alignment = args.mp->m_dalign; isaligned = 1; - } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args.mp, - XFS_INODE_CLUSTER_SIZE(args.mp))) - args.alignment = args.mp->m_sb.sb_inoalignmt; - else - args.alignment = 1; + } else + args.alignment = xfs_ialloc_cluster_alignment(&args); /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. @@ -229,12 +250,7 @@ xfs_ialloc_ag_alloc( args.agbno = be32_to_cpu(agi->agi_root); args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); - if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) - args.alignment = args.mp->m_sb.sb_inoalignmt; - else - args.alignment = 1; + args.alignment = xfs_ialloc_cluster_alignment(&args); if ((error = xfs_alloc_vextent(&args))) return error; } @@ -270,11 +286,19 @@ xfs_ialloc_ag_alloc( * use the old version so that old kernels will continue to be * able to use the file system. */ - if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb)) + if (xfs_sb_version_hasnlink(&args.mp->m_sb)) version = XFS_DINODE_VERSION_2; else version = XFS_DINODE_VERSION_1; + /* + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. + */ + gen = random32(); for (j = 0; j < nbufs; j++) { /* * Get the block. @@ -292,25 +316,26 @@ xfs_ialloc_ag_alloc( xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { free = XFS_MAKE_IPTR(args.mp, fbuf, i); - INT_SET(free->di_core.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC); - INT_SET(free->di_core.di_version, ARCH_CONVERT, version); - INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO); + free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_core.di_version = version; + free->di_core.di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); xfs_ialloc_log_di(tp, fbuf, i, XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED); } xfs_trans_inode_alloc_buf(tp, fbuf); } - be32_add(&agi->agi_count, newlen); - be32_add(&agi->agi_freecount, newlen); + be32_add_cpu(&agi->agi_count, newlen); + be32_add_cpu(&agi->agi_freecount, newlen); + agno = be32_to_cpu(agi->agi_seqno); down_read(&args.mp->m_peraglock); - args.mp->m_perag[be32_to_cpu(agi->agi_seqno)].pagi_freecount += newlen; + args.mp->m_perag[agno].pagi_freecount += newlen; up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); /* * Insert records describing the new inode chunk into the btree. */ - cur = xfs_btree_init_cursor(args.mp, tp, agbp, - be32_to_cpu(agi->agi_seqno), + cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno, XFS_BTNUM_INO, (xfs_inode_t *)0, 0); for (thisino = newino; thisino < newino + newlen; @@ -342,7 +367,7 @@ xfs_ialloc_ag_alloc( return 0; } -STATIC __inline xfs_agnumber_t +STATIC_INLINE xfs_agnumber_t xfs_ialloc_next_ag( xfs_mount_t *mp) { @@ -458,7 +483,7 @@ nextag: */ if (XFS_FORCED_SHUTDOWN(mp)) { up_read(&mp->m_peraglock); - return (xfs_buf_t *)0; + return NULL; } agno++; if (agno >= agcount) @@ -466,7 +491,7 @@ nextag: if (agno == pagno) { if (flags == 0) { up_read(&mp->m_peraglock); - return (xfs_buf_t *)0; + return NULL; } flags = 0; } @@ -529,10 +554,10 @@ xfs_dialloc( int offset; /* index of inode in chunk */ xfs_agino_t pagino; /* parent's a.g. relative inode # */ xfs_agnumber_t pagno; /* parent's allocation group number */ - xfs_inobt_rec_t rec; /* inode allocation record */ + xfs_inobt_rec_incore_t rec; /* inode allocation record */ xfs_agnumber_t tagno; /* testing allocation group number */ xfs_btree_cur_t *tcur; /* temp cursor */ - xfs_inobt_rec_t trec; /* temp inode allocation record */ + xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ if (*IO_agbp == NULL) { @@ -884,7 +909,7 @@ nextag: if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) goto error0; - be32_add(&agi->agi_freecount, -1); + be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); down_read(&mp->m_peraglock); mp->m_perag[tagno].pagi_freecount--; @@ -945,7 +970,7 @@ xfs_difree( int ilen; /* inodes in an inode cluster */ xfs_mount_t *mp; /* mount structure for filesystem */ int off; /* offset of inode in inode chunk */ - xfs_inobt_rec_t rec; /* btree record */ + xfs_inobt_rec_incore_t rec; /* btree record */ mp = tp->t_mountp; @@ -1052,7 +1077,7 @@ xfs_difree( /* * When an inode cluster is free, it becomes eligible for removal */ - if ((mp->m_flags & XFS_MOUNT_IDELETE) && + if (!(mp->m_flags & XFS_MOUNT_IKEEP) && (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { *delete = 1; @@ -1064,8 +1089,8 @@ xfs_difree( * to be freed when the transaction is committed. */ ilen = XFS_IALLOC_INODES(mp); - be32_add(&agi->agi_count, -ilen); - be32_add(&agi->agi_freecount, -(ilen - 1)); + be32_add_cpu(&agi->agi_count, -ilen); + be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); down_read(&mp->m_peraglock); mp->m_perag[agno].pagi_freecount -= ilen - 1; @@ -1094,7 +1119,7 @@ xfs_difree( /* * Change the inode free counts and log the ag/sb changes. */ - be32_add(&agi->agi_freecount, 1); + be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); down_read(&mp->m_peraglock); mp->m_perag[agno].pagi_freecount++; @@ -1195,6 +1220,7 @@ xfs_dilocate( "(0x%llx)", ino, XFS_AGINO_TO_INO(mp, agno, agino)); } + xfs_stack_trace(); #endif /* DEBUG */ return XFS_ERROR(EINVAL); } @@ -1386,6 +1412,7 @@ xfs_ialloc_read_agi( pag = &mp->m_perag[agno]; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + pag->pagi_count = be32_to_cpu(agi->agi_count); pag->pagi_init = 1; } else { /* @@ -1409,3 +1436,23 @@ xfs_ialloc_read_agi( *bpp = bp; return 0; } + +/* + * Read in the agi to initialise the per-ag data in the mount structure + */ +int +xfs_ialloc_pagi_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno) /* allocation group number */ +{ + xfs_buf_t *bp = NULL; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 7f5debe..4e30ec1 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -30,14 +30,9 @@ struct xfs_trans; #define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks /* - * For small block file systems, move inodes in clusters of this size. - * When we don't have a lot of memory, however, we go a bit smaller - * to reduce the number of AGI and ialloc btree blocks we need to keep - * around for xfs_dilocate(). We choose which one to use in - * xfs_mount_int(). + * Move inodes in clusters of this size. */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 -#define XFS_INODE_SMALL_CLUSTER_SIZE 4096 #define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size /* @@ -149,6 +144,16 @@ xfs_ialloc_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp); /* allocation group hdr buf */ +/* + * Read in the allocation group header to initialise the per-ag data + * in the mount structure + */ +int +xfs_ialloc_pagi_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno); /* allocation group number */ + #endif /* __KERNEL__ */ #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 616eeeb..83502f3 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -181,7 +181,7 @@ xfs_inobt_delrec( * then we can get rid of this level. */ if (numrecs == 1 && level > 0) { - agbp = cur->bc_private.i.agbp; + agbp = cur->bc_private.a.agbp; agi = XFS_BUF_TO_AGI(agbp); /* * pp is still set to the first pointer in the block. @@ -189,12 +189,12 @@ xfs_inobt_delrec( */ bno = be32_to_cpu(agi->agi_root); agi->agi_root = *pp; - be32_add(&agi->agi_level, -1); + be32_add_cpu(&agi->agi_level, -1); /* * Free the block. */ if ((error = xfs_free_extent(cur->bc_tp, - XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) + XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1))) return error; xfs_trans_binval(cur->bc_tp, bp); xfs_ialloc_log_agi(cur->bc_tp, agbp, @@ -379,7 +379,7 @@ xfs_inobt_delrec( rrecs = be16_to_cpu(right->bb_numrecs); rbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, lbno, 0, &lbp, + cur->bc_private.a.agno, lbno, 0, &lbp, XFS_INO_BTREE_REF))) return error; left = XFS_BUF_TO_INOBT_BLOCK(lbp); @@ -401,7 +401,7 @@ xfs_inobt_delrec( lrecs = be16_to_cpu(left->bb_numrecs); lbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, rbno, 0, &rbp, + cur->bc_private.a.agno, rbno, 0, &rbp, XFS_INO_BTREE_REF))) return error; right = XFS_BUF_TO_INOBT_BLOCK(rbp); @@ -484,7 +484,7 @@ xfs_inobt_delrec( xfs_buf_t *rrbp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0, + cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, &rrbp, XFS_INO_BTREE_REF))) return error; rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); @@ -497,7 +497,7 @@ xfs_inobt_delrec( * Free the deleting block. */ if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, - cur->bc_private.i.agno, rbno), 1))) + cur->bc_private.a.agno, rbno), 1))) return error; xfs_trans_binval(cur->bc_tp, rbp); /* @@ -568,7 +568,7 @@ xfs_inobt_insrec( /* * Make a key out of the record data to be inserted, and save it. */ - key.ir_startino = recp->ir_startino; /* INT_: direct copy */ + key.ir_startino = recp->ir_startino; optr = ptr = cur->bc_ptrs[level]; /* * If we're off the left edge, return failure. @@ -600,7 +600,7 @@ xfs_inobt_insrec( } #endif nbno = NULLAGBLOCK; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; /* * If the block is full, we can't insert the new entry until we * make the block un-full. @@ -641,7 +641,7 @@ xfs_inobt_insrec( return error; #endif ptr = cur->bc_ptrs[level]; - nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */ + nrec.ir_startino = nkey.ir_startino; } else { /* * Otherwise the insert fails. @@ -681,7 +681,7 @@ xfs_inobt_insrec( if ((error = xfs_btree_check_sptr(cur, *bnop, level))) return error; #endif - kp[ptr - 1] = key; /* INT_: struct copy */ + kp[ptr - 1] = key; pp[ptr - 1] = cpu_to_be32(*bnop); numrecs++; block->bb_numrecs = cpu_to_be16(numrecs); @@ -698,7 +698,7 @@ xfs_inobt_insrec( * Now stuff the new record in, bump numrecs * and log the new data. */ - rp[ptr - 1] = *recp; /* INT_: struct copy */ + rp[ptr - 1] = *recp; numrecs++; block->bb_numrecs = cpu_to_be16(numrecs); xfs_inobt_log_recs(cur, bp, ptr, numrecs); @@ -731,7 +731,7 @@ xfs_inobt_insrec( */ *bnop = nbno; if (nbno != NULLAGBLOCK) { - *recp = nrec; /* INT_: struct copy */ + *recp = nrec; *curp = ncur; } *stat = 1; @@ -854,7 +854,7 @@ xfs_inobt_lookup( { xfs_agi_t *agi; /* a.g. inode header */ - agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); + agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); agno = be32_to_cpu(agi->agi_seqno); agbno = be32_to_cpu(agi->agi_root); } @@ -878,7 +878,7 @@ xfs_inobt_lookup( */ bp = cur->bc_bufs[level]; if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; + bp = NULL; if (!bp) { /* * Need to get a new buffer. Read it, then @@ -950,12 +950,12 @@ xfs_inobt_lookup( xfs_inobt_key_t *kkp; kkp = kkbase + keyno - 1; - startino = INT_GET(kkp->ir_startino, ARCH_CONVERT); + startino = be32_to_cpu(kkp->ir_startino); } else { xfs_inobt_rec_t *krp; krp = krbase + keyno - 1; - startino = INT_GET(krp->ir_startino, ARCH_CONVERT); + startino = be32_to_cpu(krp->ir_startino); } /* * Compute difference to get next direction. @@ -1089,7 +1089,7 @@ xfs_inobt_lshift( * Set up the left neighbor as "left". */ if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib), + cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib), 0, &lbp, XFS_INO_BTREE_REF))) return error; left = XFS_BUF_TO_INOBT_BLOCK(lbp); @@ -1117,7 +1117,7 @@ xfs_inobt_lshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) return error; #endif - *lpp = *rpp; /* INT_: no-change copy */ + *lpp = *rpp; xfs_inobt_log_ptrs(cur, lbp, nrec, nrec); } /* @@ -1132,7 +1132,7 @@ xfs_inobt_lshift( /* * Bump and log left's numrecs, decrement and log right's numrecs. */ - be16_add(&left->bb_numrecs, 1); + be16_add_cpu(&left->bb_numrecs, 1); xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); #ifdef DEBUG if (level > 0) @@ -1140,7 +1140,7 @@ xfs_inobt_lshift( else xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); #endif - be16_add(&right->bb_numrecs, -1); + be16_add_cpu(&right->bb_numrecs, -1); xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); /* * Slide the contents of right down one entry. @@ -1160,7 +1160,7 @@ xfs_inobt_lshift( } else { memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ + key.ir_startino = rrp->ir_startino; rkp = &key; } /* @@ -1207,10 +1207,10 @@ xfs_inobt_newroot( /* * Get a block & a buffer. */ - agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); + agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, be32_to_cpu(agi->agi_root)); args.mod = args.minleft = args.alignment = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; @@ -1232,8 +1232,8 @@ xfs_inobt_newroot( * Set the root data in the a.g. inode structure. */ agi->agi_root = cpu_to_be32(args.agbno); - be32_add(&agi->agi_level, 1); - xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp, + be32_add_cpu(&agi->agi_level, 1); + xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); /* * At the previous root level there are now two blocks: the old @@ -1297,13 +1297,13 @@ xfs_inobt_newroot( */ kp = XFS_INOBT_KEY_ADDR(new, 1, cur); if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */ - kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */ + kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); + kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); } else { rp = XFS_INOBT_REC_ADDR(left, 1, cur); - INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT); + kp[0].ir_startino = rp->ir_startino; rp = XFS_INOBT_REC_ADDR(right, 1, cur); - INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT); + kp[1].ir_startino = rp->ir_startino; } xfs_inobt_log_keys(cur, nbp, 1, 2); /* @@ -1376,7 +1376,7 @@ xfs_inobt_rshift( * Set up the right neighbor as "right". */ if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), + cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, &rbp, XFS_INO_BTREE_REF))) return error; right = XFS_BUF_TO_INOBT_BLOCK(rbp); @@ -1410,8 +1410,8 @@ xfs_inobt_rshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) return error; #endif - *rkp = *lkp; /* INT_: no change copy */ - *rpp = *lpp; /* INT_: no change copy */ + *rkp = *lkp; + *rpp = *lpp; xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); } else { @@ -1420,15 +1420,15 @@ xfs_inobt_rshift( memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); *rrp = *lrp; xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ + key.ir_startino = rrp->ir_startino; rkp = &key; } /* * Decrement and log left's numrecs, bump and log right's numrecs. */ - be16_add(&left->bb_numrecs, -1); + be16_add_cpu(&left->bb_numrecs, -1); xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, 1); + be16_add_cpu(&right->bb_numrecs, 1); #ifdef DEBUG if (level > 0) xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); @@ -1492,7 +1492,7 @@ xfs_inobt_split( * Allocate the new block. * If we can't do it, we're toast. Give up. */ - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno); args.mod = args.minleft = args.alignment = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; @@ -1529,7 +1529,7 @@ xfs_inobt_split( */ if ((be16_to_cpu(left->bb_numrecs) & 1) && cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add(&right->bb_numrecs, 1); + be16_add_cpu(&right->bb_numrecs, 1); i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; /* * For non-leaf blocks, copy keys and addresses over to the new block. @@ -1559,13 +1559,13 @@ xfs_inobt_split( rrp = XFS_INOBT_REC_ADDR(right, 1, cur); memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */ + keyp->ir_startino = rrp->ir_startino; } /* * Find the left block number by looking in the buffer. * Adjust numrecs, sibling pointers. */ - be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); + be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); right->bb_rightsib = left->bb_rightsib; left->bb_rightsib = cpu_to_be32(args.agbno); right->bb_leftsib = cpu_to_be32(lbno); @@ -1725,7 +1725,7 @@ xfs_inobt_decrement( agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, agbno, 0, &bp, + cur->bc_private.a.agno, agbno, 0, &bp, XFS_INO_BTREE_REF))) return error; lev--; @@ -1813,9 +1813,9 @@ xfs_inobt_get_rec( * Point to the record and extract its data. */ rec = XFS_INOBT_REC_ADDR(block, ptr, cur); - *ino = INT_GET(rec->ir_startino, ARCH_CONVERT); - *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT); - *free = INT_GET(rec->ir_free, ARCH_CONVERT); + *ino = be32_to_cpu(rec->ir_startino); + *fcnt = be32_to_cpu(rec->ir_freecount); + *free = be64_to_cpu(rec->ir_free); *stat = 1; return 0; } @@ -1897,7 +1897,7 @@ xfs_inobt_increment( agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, agbno, 0, &bp, + cur->bc_private.a.agno, agbno, 0, &bp, XFS_INO_BTREE_REF))) return error; lev--; @@ -1930,10 +1930,10 @@ xfs_inobt_insert( level = 0; nbno = NULLAGBLOCK; - INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino); - INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount); - INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free); - ncur = (xfs_btree_cur_t *)0; + nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); + ncur = NULL; pcur = cur; /* * Loop going up the tree, starting at the leaf level. @@ -1965,7 +1965,7 @@ xfs_inobt_insert( */ if (ncur) { pcur = ncur; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; } } while (nbno != NULLAGBLOCK); *stat = i; @@ -2060,9 +2060,9 @@ xfs_inobt_update( /* * Fill in the new contents and log them. */ - INT_SET(rp->ir_startino, ARCH_CONVERT, ino); - INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt); - INT_SET(rp->ir_free, ARCH_CONVERT, free); + rp->ir_startino = cpu_to_be32(ino); + rp->ir_freecount = cpu_to_be32(fcnt); + rp->ir_free = cpu_to_be64(free); xfs_inobt_log_recs(cur, bp, ptr, ptr); /* * Updating first record in leaf. Pass new key value up to our parent. @@ -2070,7 +2070,7 @@ xfs_inobt_update( if (ptr == 1) { xfs_inobt_key_t key; /* key containing [ino] */ - INT_SET(key.ir_startino, ARCH_CONVERT, ino); + key.ir_startino = cpu_to_be32(ino); if ((error = xfs_inobt_updkey(cur, &key, 1))) return error; } diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index ae3904c..8efc4a5 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -47,19 +47,24 @@ static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) /* * Data record structure */ -typedef struct xfs_inobt_rec -{ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ __int32_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ -} xfs_inobt_rec_t; +} xfs_inobt_rec_incore_t; + /* * Key structure */ -typedef struct xfs_inobt_key -{ - xfs_agino_t ir_startino; /* starting inode number */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ } xfs_inobt_key_t; /* btree pointer type */ @@ -76,15 +81,12 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t; #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) #define XFS_INOBT_IS_FREE(rp,i) \ (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) -#define XFS_INOBT_IS_FREE_DISK(rp,i) \ - ((INT_GET((rp)->ir_free,ARCH_CONVERT) & XFS_INOBT_MASK(i)) != 0) #define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i)) #define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) /* * Real block structures have a size equal to the disk block size. */ -#define XFS_INOBT_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) #define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0]) #define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0]) #define XFS_INOBT_IS_LAST_REC(cur) \ @@ -105,14 +107,13 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t; * Record, key, and pointer address macros for btree blocks. */ #define XFS_INOBT_REC_ADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(0, cur))) + (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i)) + #define XFS_INOBT_KEY_ADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(1, cur))) + (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i)) #define XFS_INOBT_PTR_ADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \ + (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \ i, XFS_INOBT_BLOCK_MAXRECS(1, cur))) /* diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0724df7..e229e9e 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -40,132 +40,13 @@ #include "xfs_utils.h" /* - * Initialize the inode hash table for the newly mounted file system. - * Choose an initial table size based on user specified value, else - * use a simple algorithm using the maximum number of inodes as an - * indicator for table size, and clamp it between one and some large - * number of pages. - */ -void -xfs_ihash_init(xfs_mount_t *mp) -{ - __uint64_t icount; - uint i, flags = KM_SLEEP | KM_MAYFAIL; - - if (!mp->m_ihsize) { - icount = mp->m_maxicount ? mp->m_maxicount : - (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog); - mp->m_ihsize = 1 << max_t(uint, 8, - (xfs_highbit64(icount) + 1) / 2); - mp->m_ihsize = min_t(uint, mp->m_ihsize, - (64 * NBPP) / sizeof(xfs_ihash_t)); - } - - while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize * - sizeof(xfs_ihash_t), flags))) { - if ((mp->m_ihsize >>= 1) <= NBPP) - flags = KM_SLEEP; - } - for (i = 0; i < mp->m_ihsize; i++) { - rwlock_init(&(mp->m_ihash[i].ih_lock)); - } -} - -/* - * Free up structures allocated by xfs_ihash_init, at unmount time. - */ -void -xfs_ihash_free(xfs_mount_t *mp) -{ - kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t)); - mp->m_ihash = NULL; -} - -/* - * Initialize the inode cluster hash table for the newly mounted file system. - * Its size is derived from the ihash table size. - */ -void -xfs_chash_init(xfs_mount_t *mp) -{ - uint i; - - mp->m_chsize = max_t(uint, 1, mp->m_ihsize / - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)); - mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize); - mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize - * sizeof(xfs_chash_t), - KM_SLEEP); - for (i = 0; i < mp->m_chsize; i++) { - spinlock_init(&mp->m_chash[i].ch_lock,"xfshash"); - } -} - -/* - * Free up structures allocated by xfs_chash_init, at unmount time. - */ -void -xfs_chash_free(xfs_mount_t *mp) -{ - int i; - - for (i = 0; i < mp->m_chsize; i++) { - spinlock_destroy(&mp->m_chash[i].ch_lock); - } - - kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t)); - mp->m_chash = NULL; -} - -/* - * Try to move an inode to the front of its hash list if possible - * (and if its not there already). Called right after obtaining - * the list version number and then dropping the read_lock on the - * hash list in question (which is done right after looking up the - * inode in question...). - */ -STATIC void -xfs_ihash_promote( - xfs_ihash_t *ih, - xfs_inode_t *ip, - ulong version) -{ - xfs_inode_t *iq; - - if ((ip->i_prevp != &ih->ih_next) && write_trylock(&ih->ih_lock)) { - if (likely(version == ih->ih_version)) { - /* remove from list */ - if ((iq = ip->i_next)) { - iq->i_prevp = ip->i_prevp; - } - *ip->i_prevp = iq; - - /* insert at list head */ - iq = ih->ih_next; - iq->i_prevp = &ip->i_next; - ip->i_next = iq; - ip->i_prevp = &ih->ih_next; - ih->ih_next = ip; - } - write_unlock(&ih->ih_lock); - } -} - -/* * Look up an inode by number in the given file system. - * The inode is looked up in the hash table for the file system - * represented by the mount point parameter mp. Each bucket of - * the hash table is guarded by an individual semaphore. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, attach it to the provided + * vnode. * - * If the inode is found in the hash table, its corresponding vnode - * is obtained with a call to vn_get(). This call takes care of - * coordination with the reclamation of the inode and vnode. Note - * that the vmap structure is filled in while holding the hash lock. - * This gives us the state of the inode/vnode when we found it and - * is used for coordination in vn_get(). - * - * If it is not in core, read it in from the file system's device and - * add the inode into the hash table. + * If it is not in core, read it in from the file system's device, + * add it to the cache and attach the provided vnode. * * The inode is locked according to the value of the lock_flags parameter. * This flag parameter indicates how and if the inode's IO lock and inode lock @@ -184,7 +65,7 @@ xfs_ihash_promote( */ STATIC int xfs_iget_core( - bhv_vnode_t *vp, + struct inode *inode, xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, @@ -193,247 +74,199 @@ xfs_iget_core( xfs_inode_t **ipp, xfs_daddr_t bno) { - xfs_ihash_t *ih; + struct inode *old_inode; xfs_inode_t *ip; xfs_inode_t *iq; - bhv_vnode_t *inode_vp; - ulong version; int error; - /* REFERENCED */ - xfs_chash_t *ch; - xfs_chashlist_t *chl, *chlnew; - SPLDECL(s); + unsigned long first_index, mask; + xfs_perag_t *pag; + xfs_agino_t agino; + /* the radix tree exists only in inode capable AGs */ + if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) + return EINVAL; - ih = XFS_IHASH(mp, ino); + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_get_perag(mp, ino); + if (!pag->pagi_inodeok) + return EINVAL; + ASSERT(pag->pag_ici_init); + agino = XFS_INO_TO_AGINO(mp, ino); again: - read_lock(&ih->ih_lock); + read_lock(&pag->pag_ici_lock); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip != NULL) { + /* + * If INEW is set this inode is being set up + * we need to pause and try again. + */ + if (xfs_iflags_test(ip, XFS_INEW)) { + read_unlock(&pag->pag_ici_lock); + delay(1); + XFS_STATS_INC(xs_ig_frecycle); + + goto again; + } - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == ino) { + old_inode = ip->i_vnode; + if (old_inode == NULL) { /* - * If INEW is set this inode is being set up + * If IRECLAIM is set this inode is + * on its way out of the system, * we need to pause and try again. */ - if (ip->i_flags & XFS_INEW) { - read_unlock(&ih->ih_lock); + if (xfs_iflags_test(ip, XFS_IRECLAIM)) { + read_unlock(&pag->pag_ici_lock); delay(1); XFS_STATS_INC(xs_ig_frecycle); goto again; } - - inode_vp = XFS_ITOV_NULL(ip); - if (inode_vp == NULL) { - /* - * If IRECLAIM is set this inode is - * on its way out of the system, - * we need to pause and try again. - */ - if (ip->i_flags & XFS_IRECLAIM) { - read_unlock(&ih->ih_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } - - vn_trace_exit(vp, "xfs_iget.alloc", - (inst_t *)__return_address); - - XFS_STATS_INC(xs_ig_found); - - ip->i_flags &= ~XFS_IRECLAIMABLE; - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); - - XFS_MOUNT_ILOCK(mp); - list_del_init(&ip->i_reclaim); - XFS_MOUNT_IUNLOCK(mp); - - goto finish_inode; - - } else if (vp != inode_vp) { - struct inode *inode = vn_to_inode(inode_vp); - - /* The inode is being torn down, pause and - * try again. - */ - if (inode->i_state & (I_FREEING | I_CLEAR)) { - read_unlock(&ih->ih_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } -/* Chances are the other vnode (the one in the inode) is being torn - * down right now, and we landed on top of it. Question is, what do - * we do? Unhook the old inode and hook up the new one? - */ - cmn_err(CE_PANIC, - "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p", - inode_vp, vp); - } + ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE)); /* - * Inode cache hit: if ip is not at the front of - * its hash chain, move it there now. - * Do this with the lock held for update, but - * do statistics after releasing the lock. + * If lookup is racing with unlink, then we + * should return an error immediately so we + * don't remove it from the reclaim list and + * potentially leak the inode. */ - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); + if ((ip->i_d.di_mode == 0) && + !(flags & XFS_IGET_CREATE)) { + read_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + return ENOENT; + } + + xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); + XFS_STATS_INC(xs_ig_found); + xfs_iflags_clear(ip, XFS_IRECLAIMABLE); + read_unlock(&pag->pag_ici_lock); -finish_inode: - if (ip->i_d.di_mode == 0) { - if (!(flags & IGET_CREATE)) - return ENOENT; - xfs_iocore_inode_reinit(ip); + XFS_MOUNT_ILOCK(mp); + list_del_init(&ip->i_reclaim); + XFS_MOUNT_IUNLOCK(mp); + + goto finish_inode; + + } else if (inode != old_inode) { + /* The inode is being torn down, pause and + * try again. + */ + if (old_inode->i_state & (I_FREEING | I_CLEAR)) { + read_unlock(&pag->pag_ici_lock); + delay(1); + XFS_STATS_INC(xs_ig_frecycle); + + goto again; } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); +/* Chances are the other vnode (the one in the inode) is being torn +* down right now, and we landed on top of it. Question is, what do +* we do? Unhook the old inode and hook up the new one? +*/ + cmn_err(CE_PANIC, + "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p", + old_inode, inode); + } - ip->i_flags &= ~XFS_ISTALE; + /* + * Inode cache hit + */ + read_unlock(&pag->pag_ici_lock); + XFS_STATS_INC(xs_ig_found); - vn_trace_exit(vp, "xfs_iget.found", - (inst_t *)__return_address); - goto return_ip; +finish_inode: + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + xfs_put_perag(mp, pag); + return ENOENT; } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE); + xfs_itrace_exit_tag(ip, "xfs_iget.found"); + goto return_ip; } /* - * Inode cache miss: save the hash chain version stamp and unlock - * the chain, so we don't deadlock in vn_alloc. + * Inode cache miss */ + read_unlock(&pag->pag_ici_lock); XFS_STATS_INC(xs_ig_missed); - version = ih->ih_version; - - read_unlock(&ih->ih_lock); - /* * Read the disk inode attributes into a new inode structure and get * a new vnode for it. This should also initialize i_ino and i_mount. */ - error = xfs_iread(mp, tp, ino, &ip, bno); + error = xfs_iread(mp, tp, ino, &ip, bno, + (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0); if (error) { + xfs_put_perag(mp, pag); return error; } - vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); + xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); - xfs_inode_lock_init(ip, vp); - xfs_iocore_inode_init(ip); - if (lock_flags != 0) { - xfs_ilock(ip, lock_flags); - } - - if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) { - xfs_idestroy(ip); - return ENOENT; - } + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + init_waitqueue_head(&ip->i_ipin_wait); + atomic_set(&ip->i_pincount, 0); /* - * Put ip on its hash chain, unless someone else hashed a duplicate - * after we released the hash lock. + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. */ - write_lock(&ih->ih_lock); + init_completion(&ip->i_flush); + complete(&ip->i_flush); - if (ih->ih_version != version) { - for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) { - if (iq->i_ino == ino) { - write_unlock(&ih->ih_lock); - xfs_idestroy(ip); + if (lock_flags) + xfs_ilock(ip, lock_flags); - XFS_STATS_INC(xs_ig_dup); - goto again; - } - } + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + xfs_idestroy(ip); + xfs_put_perag(mp, pag); + return ENOENT; } /* - * These values _must_ be set before releasing ihlock! + * Preload the radix tree so we can insert safely under the + * write spinlock. */ - ip->i_hash = ih; - if ((iq = ih->ih_next)) { - iq->i_prevp = &ip->i_next; + if (radix_tree_preload(GFP_KERNEL)) { + xfs_idestroy(ip); + delay(1); + goto again; } - ip->i_next = iq; - ip->i_prevp = &ih->ih_next; - ih->ih_next = ip; - ip->i_udquot = ip->i_gdquot = NULL; - ih->ih_version++; - ip->i_flags |= XFS_INEW; - - write_unlock(&ih->ih_lock); - + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + first_index = agino & mask; + write_lock(&pag->pag_ici_lock); /* - * put ip on its cluster's hash chain + * insert the new inode */ - ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL && - ip->i_cnext == NULL); - - chlnew = NULL; - ch = XFS_CHASH(mp, ip->i_blkno); - chlredo: - s = mutex_spinlock(&ch->ch_lock); - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - - /* insert this inode into the doubly-linked list - * where chl points */ - if ((iq = chl->chl_ip)) { - ip->i_cprev = iq->i_cprev; - iq->i_cprev->i_cnext = ip; - iq->i_cprev = ip; - ip->i_cnext = iq; - } else { - ip->i_cnext = ip; - ip->i_cprev = ip; - } - chl->chl_ip = ip; - ip->i_chash = chl; - break; - } - } - - /* no hash list found for this block; add a new hash list */ - if (chl == NULL) { - if (chlnew == NULL) { - mutex_spinunlock(&ch->ch_lock, s); - ASSERT(xfs_chashlist_zone != NULL); - chlnew = (xfs_chashlist_t *) - kmem_zone_alloc(xfs_chashlist_zone, - KM_SLEEP); - ASSERT(chlnew != NULL); - goto chlredo; - } else { - ip->i_cnext = ip; - ip->i_cprev = ip; - ip->i_chash = chlnew; - chlnew->chl_ip = ip; - chlnew->chl_blkno = ip->i_blkno; - if (ch->ch_list) - ch->ch_list->chl_prev = chlnew; - chlnew->chl_next = ch->ch_list; - chlnew->chl_prev = NULL; - ch->ch_list = chlnew; - chlnew = NULL; - } - } else { - if (chlnew != NULL) { - kmem_zone_free(xfs_chashlist_zone, chlnew); - } + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + BUG_ON(error != -EEXIST); + write_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + xfs_idestroy(ip); + XFS_STATS_INC(xs_ig_dup); + goto again; } - mutex_spinunlock(&ch->ch_lock, s); + /* + * These values _must_ be set before releasing the radix tree lock! + */ + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, XFS_INEW); + write_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); /* * Link ip to its mount and thread it on the mount's inode list. @@ -452,22 +285,27 @@ finish_inode: mp->m_inodes = ip; XFS_MOUNT_IUNLOCK(mp); + xfs_put_perag(mp, pag); return_ip: ASSERT(ip->i_df.if_ext_max == XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); - ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == - ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); - + xfs_iflags_set(ip, XFS_IMODIFIED); *ipp = ip; /* + * Set up the Linux with the Linux inode. + */ + ip->i_vnode = inode; + inode->i_private = ip; + + /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. */ - bhv_vfs_init_vnode(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1); - + if (ip->i_d.di_mode != 0) + xfs_setup_inode(ip); return 0; } @@ -487,68 +325,58 @@ xfs_iget( xfs_daddr_t bno) { struct inode *inode; - bhv_vnode_t *vp = NULL; + xfs_inode_t *ip; int error; XFS_STATS_INC(xs_ig_attempts); retry: - if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) { - xfs_inode_t *ip; - - vp = vn_from_inode(inode); - if (inode->i_state & I_NEW) { - vn_initialize(inode); - error = xfs_iget_core(vp, mp, tp, ino, flags, - lock_flags, ipp, bno); - if (error) { - vn_mark_bad(vp); - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - iput(inode); - } - } else { - /* - * If the inode is not fully constructed due to - * filehandle mismatches wait for the inode to go - * away and try again. - * - * iget_locked will call __wait_on_freeing_inode - * to wait for the inode to go away. - */ - if (is_bad_inode(inode) || - ((ip = xfs_vtoi(vp)) == NULL)) { - iput(inode); - delay(1); - goto retry; - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - XFS_STATS_INC(xs_ig_found); - *ipp = ip; - error = 0; + inode = iget_locked(mp->m_super, ino); + if (!inode) + /* If we got no inode we are out of memory */ + return ENOMEM; + + if (inode->i_state & I_NEW) { + XFS_STATS_INC(vn_active); + XFS_STATS_INC(vn_alloc); + + error = xfs_iget_core(inode, mp, tp, ino, flags, + lock_flags, ipp, bno); + if (error) { + make_bad_inode(inode); + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + iput(inode); } - } else - error = ENOMEM; /* If we got no inode we are out of memory */ + return error; + } - return error; -} + /* + * If the inode is not fully constructed due to + * filehandle mismatches wait for the inode to go + * away and try again. + * + * iget_locked will call __wait_on_freeing_inode + * to wait for the inode to go away. + */ + if (is_bad_inode(inode)) { + iput(inode); + delay(1); + goto retry; + } -/* - * Do the setup for the various locks within the incore inode. - */ -void -xfs_inode_lock_init( - xfs_inode_t *ip, - bhv_vnode_t *vp) -{ - mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", (long)vp->v_number); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number); - init_waitqueue_head(&ip->i_ipin_wait); - atomic_set(&ip->i_pincount, 0); - init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number); + ip = XFS_I(inode); + if (!ip) { + iput(inode); + delay(1); + goto retry; + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + XFS_STATS_INC(xs_ig_found); + *ipp = ip; + return 0; } /* @@ -561,32 +389,19 @@ xfs_inode_incore(xfs_mount_t *mp, xfs_ino_t ino, xfs_trans_t *tp) { - xfs_ihash_t *ih; xfs_inode_t *ip; - ulong version; - - ih = XFS_IHASH(mp, ino); - read_lock(&ih->ih_lock); - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == ino) { - /* - * If we find it and tp matches, return it. - * Also move it to the front of the hash list - * if we find it and it is not already there. - * Otherwise break from the loop and return - * NULL. - */ - if (ip->i_transp == tp) { - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); - return (ip); - } - break; - } - } - read_unlock(&ih->ih_lock); - return (NULL); + xfs_perag_t *pag; + + pag = xfs_get_perag(mp, ino); + read_lock(&pag->pag_ici_lock); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino)); + read_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + + /* the returned inode must match the transaction */ + if (ip && (ip->i_transp != tp)) + return NULL; + return ip; } /* @@ -601,34 +416,32 @@ void xfs_iput(xfs_inode_t *ip, uint lock_flags) { - bhv_vnode_t *vp = XFS_ITOV(ip); - - vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address); + xfs_itrace_entry(ip); xfs_iunlock(ip, lock_flags); - VN_RELE(vp); + IRELE(ip); } /* * Special iput for brand-new inodes that are still locked */ void -xfs_iput_new(xfs_inode_t *ip, - uint lock_flags) +xfs_iput_new( + xfs_inode_t *ip, + uint lock_flags) { - bhv_vnode_t *vp = XFS_ITOV(ip); - struct inode *inode = vn_to_inode(vp); + struct inode *inode = VFS_I(ip); - vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address); + xfs_itrace_entry(ip); if ((ip->i_d.di_mode == 0)) { - ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE)); - vn_mark_bad(vp); + ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + make_bad_inode(inode); } if (inode->i_state & I_NEW) unlock_new_inode(inode); if (lock_flags) xfs_iunlock(ip, lock_flags); - VN_RELE(vp); + IRELE(ip); } @@ -641,8 +454,6 @@ xfs_iput_new(xfs_inode_t *ip, void xfs_ireclaim(xfs_inode_t *ip) { - bhv_vnode_t *vp; - /* * Remove from old hash list and mount list. */ @@ -671,14 +482,15 @@ xfs_ireclaim(xfs_inode_t *ip) /* * Pull our behavior descriptor from the vnode chain. */ - vp = XFS_ITOV_NULL(ip); - if (vp) { - vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); + if (ip->i_vnode) { + ip->i_vnode->i_private = NULL; + ip->i_vnode = NULL; } /* * Free all memory associated with the inode. */ + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_idestroy(ip); } @@ -691,58 +503,14 @@ void xfs_iextract( xfs_inode_t *ip) { - xfs_ihash_t *ih; + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); xfs_inode_t *iq; - xfs_mount_t *mp; - xfs_chash_t *ch; - xfs_chashlist_t *chl, *chm; - SPLDECL(s); - - ih = ip->i_hash; - write_lock(&ih->ih_lock); - if ((iq = ip->i_next)) { - iq->i_prevp = ip->i_prevp; - } - *ip->i_prevp = iq; - ih->ih_version++; - write_unlock(&ih->ih_lock); - /* - * Remove from cluster hash list - * 1) delete the chashlist if this is the last inode on the chashlist - * 2) unchain from list of inodes - * 3) point chashlist->chl_ip to 'chl_next' if to this inode. - */ - mp = ip->i_mount; - ch = XFS_CHASH(mp, ip->i_blkno); - s = mutex_spinlock(&ch->ch_lock); - - if (ip->i_cnext == ip) { - /* Last inode on chashlist */ - ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); - ASSERT(ip->i_chash != NULL); - chm=NULL; - chl = ip->i_chash; - if (chl->chl_prev) - chl->chl_prev->chl_next = chl->chl_next; - else - ch->ch_list = chl->chl_next; - if (chl->chl_next) - chl->chl_next->chl_prev = chl->chl_prev; - kmem_zone_free(xfs_chashlist_zone, chl); - } else { - /* delete one inode from a non-empty list */ - iq = ip->i_cnext; - iq->i_cprev = ip->i_cprev; - ip->i_cprev->i_cnext = iq; - if (ip->i_chash->chl_ip == ip) { - ip->i_chash->chl_ip = iq; - } - ip->i_chash = __return_address; - ip->i_cprev = __return_address; - ip->i_cnext = __return_address; - } - mutex_spinunlock(&ch->ch_lock, s); + write_lock(&pag->pag_ici_lock); + radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); + write_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); /* * Remove from mount's inode list. @@ -840,8 +608,9 @@ xfs_iunlock_map_shared( * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL */ void -xfs_ilock(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) { /* * You can't set both SHARED and EXCL for the same lock, @@ -852,18 +621,18 @@ xfs_ilock(xfs_inode_t *ip, (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - if (lock_flags & XFS_IOLOCK_EXCL) { - mrupdate(&ip->i_iolock); - } else if (lock_flags & XFS_IOLOCK_SHARED) { - mraccess(&ip->i_iolock); - } - if (lock_flags & XFS_ILOCK_EXCL) { - mrupdate(&ip->i_lock); - } else if (lock_flags & XFS_ILOCK_SHARED) { - mraccess(&ip->i_lock); - } xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); } @@ -878,15 +647,12 @@ xfs_ilock(xfs_inode_t *ip, * lock_flags -- this parameter indicates the inode's locks to be * to be locked. See the comment for xfs_ilock() for a list * of valid values. - * */ int -xfs_ilock_nowait(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) { - int iolocked; - int ilocked; - /* * You can't set both SHARED and EXCL for the same lock, * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, @@ -896,39 +662,32 @@ xfs_ilock_nowait(xfs_inode_t *ip, (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - iolocked = 0; if (lock_flags & XFS_IOLOCK_EXCL) { - iolocked = mrtryupdate(&ip->i_iolock); - if (!iolocked) { - return 0; - } + if (!mrtryupdate(&ip->i_iolock)) + goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - iolocked = mrtryaccess(&ip->i_iolock); - if (!iolocked) { - return 0; - } + if (!mrtryaccess(&ip->i_iolock)) + goto out; } if (lock_flags & XFS_ILOCK_EXCL) { - ilocked = mrtryupdate(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; } else if (lock_flags & XFS_ILOCK_SHARED) { - ilocked = mrtryaccess(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; } xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; } /* @@ -944,8 +703,9 @@ xfs_ilock_nowait(xfs_inode_t *ip, * */ void -xfs_iunlock(xfs_inode_t *ip, - uint lock_flags) +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) { /* * You can't set both SHARED and EXCL for the same lock, @@ -956,34 +716,29 @@ xfs_iunlock(xfs_inode_t *ip, (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY | + XFS_LOCK_DEP_MASK)) == 0); ASSERT(lock_flags != 0); - if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) || - (ismrlocked(&ip->i_iolock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) || - (ismrlocked(&ip->i_iolock, MR_UPDATE))); - mrunlock(&ip->i_iolock); - } + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); - if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_ILOCK_SHARED) || - (ismrlocked(&ip->i_lock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_ILOCK_EXCL) || - (ismrlocked(&ip->i_lock, MR_UPDATE))); - mrunlock(&ip->i_lock); + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) && + !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) { /* * Let the AIL know that this item has been unlocked in case * it is in the AIL and anyone is waiting on it. Don't do * this if the caller has asked us not to. */ - if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) && - ip->i_itemp != NULL) { - xfs_trans_unlocked_item(ip->i_mount, - (xfs_log_item_t*)(ip->i_itemp)); - } + xfs_trans_unlocked_item(ip->i_mount, + (xfs_log_item_t*)(ip->i_itemp)); } xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); } @@ -993,42 +748,45 @@ xfs_iunlock(xfs_inode_t *ip, * if it is being demoted. */ void -xfs_ilock_demote(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) { ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - if (lock_flags & XFS_ILOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); - } - if (lock_flags & XFS_IOLOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); - } } +#ifdef DEBUG /* - * The following three routines simply manage the i_flock - * semaphore embedded in the inode. This semaphore synchronizes - * processes attempting to flush the in-core inode back to disk. + * Debug-only routine, without additional rw_semaphore APIs, we can + * now only answer requests regarding whether we hold the lock for write + * (reader state is outside our visibility, we only track writer state). + * + * Note: this means !xfs_isilocked would give false positives, so don't do that. */ -void -xfs_iflock(xfs_inode_t *ip) -{ - psema(&(ip->i_flock), PINOD|PLTWAIT); -} - int -xfs_iflock_nowait(xfs_inode_t *ip) +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) { - return (cpsema(&(ip->i_flock))); -} + if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == + XFS_ILOCK_EXCL) { + if (!ip->i_lock.mr_writer) + return 0; + } -void -xfs_ifunlock(xfs_inode_t *ip) -{ - ASSERT(issemalocked(&(ip->i_flock))); - vsema(&(ip->i_flock)); + if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == + XFS_IOLOCK_EXCL) { + if (!ip->i_iolock.mr_writer) + return 0; + } + + return 1; } +#endif + diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f8ecff..a391b95 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -15,6 +15,8 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/log2.h> + #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -47,13 +49,12 @@ #include "xfs_utils.h" #include "xfs_dir2_trace.h" #include "xfs_quota.h" -#include "xfs_mac.h" #include "xfs_acl.h" - +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; -kmem_zone_t *xfs_chashlist_zone; /* * Used in xfs_itruncate(). This is the maximum number of extents @@ -66,7 +67,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); - #ifdef DEBUG /* * Make sure that the extents in the given memory buffer @@ -76,28 +76,23 @@ STATIC void xfs_validate_extents( xfs_ifork_t *ifp, int nrecs, - int disk, xfs_exntfmt_t fmt) { - xfs_bmbt_rec_t *ep; xfs_bmbt_irec_t irec; - xfs_bmbt_rec_t rec; + xfs_bmbt_rec_host_t rec; int i; for (i = 0; i < nrecs; i++) { - ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned((__uint64_t*)&ep->l0); - rec.l1 = get_unaligned((__uint64_t*)&ep->l1); - if (disk) - xfs_bmbt_disk_get_all(&rec, &irec); - else - xfs_bmbt_get_all(&rec, &irec); + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); if (fmt == XFS_EXTFMT_NOSTATE) ASSERT(irec.br_state == XFS_EXT_NORM); } } #else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, disk, fmt) +#define xfs_validate_extents(ifp, nrecs, fmt) #endif /* DEBUG */ /* @@ -130,6 +125,90 @@ xfs_inobp_check( #endif /* + * Find the buffer associated with the given inode map + * We do basic validation checks on the buffer once it has been + * retrieved from disk. + */ +STATIC int +xfs_imap_to_bp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_imap_t *imap, + xfs_buf_t **bpp, + uint buf_flags, + uint imap_flags) +{ + int error; + int i; + int ni; + xfs_buf_t *bp; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp); + if (error) { + if (error != EAGAIN) { + cmn_err(CE_WARN, + "xfs_imap_to_bp: xfs_trans_read_buf()returned " + "an error %d on %s. Returning error.", + error, mp->m_fsname); + } else { + ASSERT(buf_flags & XFS_BUF_TRYLOCK); + } + return error; + } + + /* + * Validate the magic number and version of every inode in the buffer + * (if DEBUG kernel) or the first inode in the buffer, otherwise. + */ +#ifdef DEBUG + ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; +#else /* usual case */ + ni = 1; +#endif + + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (imap_flags & XFS_IMAP_BULKSTAT) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } + XFS_CORRUPTION_ERROR("xfs_imap_to_bp", + XFS_ERRLEVEL_HIGH, mp, dip); +#ifdef DEBUG + cmn_err(CE_PANIC, + "Device %s - bad inode magic/vsn " + "daddr %lld #%d (magic=%x)", + XFS_BUFTARG_NAME(mp->m_ddev_targp), + (unsigned long long)imap->im_blkno, i, + be16_to_cpu(dip->di_core.di_magic)); +#endif + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + } + + xfs_inobp_check(mp, bp); + + /* + * Mark the buffer as an inode buffer now that it looks good + */ + XFS_BUF_SET_VTYPE(bp, B_FS_INO); + + *bpp = bp; + return 0; +} + +/* * This routine is called to map an inode number within a file * system to the buffer containing the on-disk version of the * inode. It returns a pointer to the buffer containing the @@ -151,72 +230,19 @@ xfs_inotobp( xfs_buf_t **bpp, int *offset) { - int di_ok; xfs_imap_t imap; xfs_buf_t *bp; int error; - xfs_dinode_t *dip; - /* - * Call the space management code to find the location of the - * inode on disk. - */ imap.im_blkno = 0; error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); - if (error != 0) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_imap() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); + if (error) return error; - } - - /* - * If the inode number maps to a block outside the bounds of the - * file system then return NULL rather than calling read_buf - * and panicing when we get an error from the driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { - cmn_err(CE_WARN, - "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds " - "of the file system %s. Returning EINVAL.", - (unsigned long long)imap.im_blkno, - imap.im_len, mp->m_fsname); - return XFS_ERROR(EINVAL); - } - /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); - - if (error) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_trans_read_buf() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0); + if (error) return error; - } - dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); - di_ok = - INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); - xfs_trans_brelse(tp, bp); - cmn_err(CE_WARN, - "xfs_inotobp: XFS_TEST_ERROR() returned an " - "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_inobp_check(mp, bp); - - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; *offset = imap.im_boffset; @@ -252,46 +278,21 @@ xfs_itobp( xfs_dinode_t **dipp, xfs_buf_t **bpp, xfs_daddr_t bno, - uint imap_flags) + uint imap_flags, + uint buf_flags) { xfs_imap_t imap; xfs_buf_t *bp; int error; - int i; - int ni; if (ip->i_blkno == (xfs_daddr_t)0) { - /* - * Call the space management code to find the location of the - * inode on disk. - */ imap.im_blkno = bno; - if ((error = xfs_imap(mp, tp, ip->i_ino, &imap, - XFS_IMAP_LOOKUP | imap_flags))) + error = xfs_imap(mp, tp, ip->i_ino, &imap, + XFS_IMAP_LOOKUP | imap_flags); + if (error) return error; /* - * If the inode number maps to a block outside the bounds - * of the file system then return NULL rather than calling - * read_buf and panicing when we get an error from the - * driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "(imap.im_blkno (0x%llx) " - "+ imap.im_len (0x%llx)) > " - " XFS_FSB_TO_BB(mp, " - "mp->m_sb.sb_dblocks) (0x%llx)", - (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len, - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); -#endif /* DEBUG */ - return XFS_ERROR(EINVAL); - } - - /* * Fill in the fields in the inode that will be used to * map the inode to its buffer from now on. */ @@ -309,76 +310,17 @@ xfs_itobp( } ASSERT(bno == 0 || bno == imap.im_blkno); - /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); - if (error) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "xfs_trans_read_buf() returned error %d, " - "imap.im_blkno 0x%llx, imap.im_len 0x%llx", - error, (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len); -#endif /* DEBUG */ + error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags); + if (error) return error; - } - - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. - * No validation is done here in userspace (xfs_repair). - */ -#if !defined(__KERNEL__) - ni = 0; -#elif defined(DEBUG) - ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (imap_flags & XFS_IMAP_BULKSTAT) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } -#ifdef DEBUG - cmn_err(CE_ALERT, - "Device %s - bad inode magic/vsn " - "daddr %lld #%d (magic=%x)", - XFS_BUFTARG_NAME(mp->m_ddev_targp), - (unsigned long long)imap.im_blkno, i, - INT_GET(dip->di_core.di_magic, ARCH_CONVERT)); -#endif - XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, - mp, dip); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + if (!bp) { + ASSERT(buf_flags & XFS_BUF_TRYLOCK); + ASSERT(tp == NULL); + *bpp = NULL; + return EAGAIN; } - xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; return 0; @@ -406,27 +348,26 @@ xfs_iformat( XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); error = 0; - if (unlikely( - INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + - INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > - INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { + if (unlikely(be32_to_cpu(dip->di_core.di_nextents) + + be16_to_cpu(dip->di_core.di_anextents) > + be64_to_cpu(dip->di_core.di_nblocks))) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", (unsigned long long)ip->i_ino, - (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) - + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), + (int)(be32_to_cpu(dip->di_core.di_nextents) + + be16_to_cpu(dip->di_core.di_anextents)), (unsigned long long) - INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT)); + be64_to_cpu(dip->di_core.di_nblocks)); XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); return XFS_ERROR(EFSCORRUPTED); } - if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { + if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", (unsigned long long)ip->i_ino, - (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); + dip->di_core.di_forkoff); XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); return XFS_ERROR(EFSCORRUPTED); @@ -437,24 +378,25 @@ xfs_iformat( case S_IFCHR: case S_IFBLK: case S_IFSOCK: - if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) { + if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) { XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; - ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); + ip->i_size = 0; + ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev); break; case S_IFREG: case S_IFLNK: case S_IFDIR: - switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) { + switch (dip->di_core.di_format) { case XFS_DINODE_FMT_LOCAL: /* * no local regular files yet */ - if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { + if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt inode %Lu " "(local format for regular file).", @@ -465,7 +407,7 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } - di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); + di_size = be64_to_cpu(dip->di_core.di_size); if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt inode %Lu " @@ -507,7 +449,7 @@ xfs_iformat( ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) { + switch (dip->di_core.di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); size = be16_to_cpu(atp->hdr.totsize); @@ -600,7 +542,7 @@ xfs_iformat_extents( xfs_dinode_t *dip, int whichfork) { - xfs_bmbt_rec_t *ep, *dp; + xfs_bmbt_rec_t *dp; xfs_ifork_t *ifp; int nex; int size; @@ -635,16 +577,13 @@ xfs_iformat_extents( ifp->if_bytes = size; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, 1, XFS_EXTFMT_INODE(ip)); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); for (i = 0; i < nex; i++, dp++) { - ep = xfs_iext_get_ext(ifp, i); - ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0), - ARCH_CONVERT); - ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1), - ARCH_CONVERT); + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); } - xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex, - whichfork); + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); if (whichfork != XFS_DATA_FORK || XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) if (unlikely(xfs_check_nostate_extents( @@ -718,70 +657,74 @@ xfs_iformat_btree( return 0; } -/* - * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk - * and native format - * - * buf = on-disk representation - * dip = native representation - * dir = direction - +ve -> disk to native - * -ve -> native to disk - */ void -xfs_xlate_dinode_core( - xfs_caddr_t buf, - xfs_dinode_core_t *dip, - int dir) +xfs_dinode_from_disk( + xfs_icdinode_t *to, + xfs_dinode_core_t *from) { - xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf; - xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip; - xfs_arch_t arch = ARCH_CONVERT; - - ASSERT(dir); - - INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch); - INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch); - INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch); - INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch); - INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch); - INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch); - INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch); - INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch); - INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch); - - if (dir > 0) { - memcpy(mem_core->di_pad, buf_core->di_pad, - sizeof(buf_core->di_pad)); - } else { - memcpy(buf_core->di_pad, mem_core->di_pad, - sizeof(buf_core->di_pad)); - } + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); + to->di_uid = be32_to_cpu(from->di_uid); + to->di_gid = be32_to_cpu(from->di_gid); + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid = be16_to_cpu(from->di_projid); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = be16_to_cpu(from->di_flushiter); + to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); + to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); + to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); + to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); + to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); + to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); + to->di_nblocks = be64_to_cpu(from->di_nblocks); + to->di_extsize = be32_to_cpu(from->di_extsize); + to->di_nextents = be32_to_cpu(from->di_nextents); + to->di_anextents = be16_to_cpu(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); + to->di_gen = be32_to_cpu(from->di_gen); +} - INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch); - - INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec, - dir, arch); - INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec, - dir, arch); - INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec, - dir, arch); - INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec, - dir, arch); - INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec, - dir, arch); - INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec, - dir, arch); - INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch); - INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch); - INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch); - INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch); - INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch); - INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch); - INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch); - INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch); - INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch); - INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch); - INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch); +void +xfs_dinode_to_disk( + xfs_dinode_core_t *to, + xfs_icdinode_t *from) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid = cpu_to_be16(from->di_projid); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = cpu_to_be16(from->di_flushiter); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); + to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); + to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); + to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); + to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); + to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); } STATIC uint @@ -817,6 +760,8 @@ _xfs_dic2xflags( flags |= XFS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) flags |= XFS_XFLAG_NODEFRAG; + if (di_flags & XFS_DIFLAG_FILESTREAM) + flags |= XFS_XFLAG_FILESTREAM; } return flags; @@ -826,18 +771,20 @@ uint xfs_ip2xflags( xfs_inode_t *ip) { - xfs_dinode_core_t *dic = &ip->i_d; + xfs_icdinode_t *dic = &ip->i_d; return _xfs_dic2xflags(dic->di_flags) | - (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0); + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); } uint xfs_dic2xflags( - xfs_dinode_core_t *dic) + xfs_dinode_t *dip) { - return _xfs_dic2xflags(INT_GET(dic->di_flags, ARCH_CONVERT)) | - (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0); + xfs_dinode_core_t *dic = &dip->di_core; + + return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } /* @@ -854,7 +801,8 @@ xfs_iread( xfs_trans_t *tp, xfs_ino_t ino, xfs_inode_t **ipp, - xfs_daddr_t bno) + xfs_daddr_t bno, + uint imap_flags) { xfs_buf_t *bp; xfs_dinode_t *dip; @@ -866,6 +814,8 @@ xfs_iread( ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); ip->i_ino = ino; ip->i_mount = mp; + atomic_set(&ip->i_iocount, 0); + spin_lock_init(&ip->i_flags_lock); /* * Get pointer's to the on-disk inode and the buffer containing it. @@ -874,7 +824,7 @@ xfs_iread( * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will * know that this is a new incore inode. */ - error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, 0); + error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); if (error) { kmem_zone_free(xfs_inode_zone, ip); return error; @@ -884,34 +834,37 @@ xfs_iread( * Initialize inode's trace buffers. * Do this before xfs_iformat in case it adds entries. */ +#ifdef XFS_INODE_TRACE + ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); +#endif #ifdef XFS_BMAP_TRACE - ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); + ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_BMBT_TRACE - ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); + ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_RW_TRACE - ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); + ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_ILOCK_TRACE - ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); + ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS); #endif #ifdef XFS_DIR2_TRACE - ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); + ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif /* * If we got something that isn't an inode it means someone * (nfs or dmi) has a stale handle. */ - if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) { + if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) { kmem_zone_free(xfs_inode_zone, ip); xfs_trans_brelse(tp, bp); #ifdef DEBUG xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " "dip->di_core.di_magic (0x%x) != " "XFS_DINODE_MAGIC (0x%x)", - INT_GET(dip->di_core.di_magic, ARCH_CONVERT), + be16_to_cpu(dip->di_core.di_magic), XFS_DINODE_MAGIC); #endif /* DEBUG */ return XFS_ERROR(EINVAL); @@ -925,8 +878,7 @@ xfs_iread( * Otherwise, just get the truly permanent information. */ if (dip->di_core.di_mode) { - xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, - &(ip->i_d), 1); + xfs_dinode_from_disk(&ip->i_d, &dip->di_core); error = xfs_iformat(ip, dip); if (error) { kmem_zone_free(xfs_inode_zone, ip); @@ -939,10 +891,10 @@ xfs_iread( return error; } } else { - ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT); - ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT); - ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT); - ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT); + ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic); + ip->i_d.di_version = dip->di_core.di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter); /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -979,6 +931,7 @@ xfs_iread( } ip->i_delayed_blks = 0; + ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -1042,7 +995,7 @@ xfs_iread_extents( ifp->if_flags &= ~XFS_IFEXTENTS; return error; } - xfs_validate_extents(ifp, nextents, 0, XFS_EXTFMT_INODE(ip)); + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); return 0; } @@ -1071,6 +1024,11 @@ xfs_iread_extents( * also returns the [locked] bp pointing to the head of the freelist * as ialloc_context. The caller should hold this buffer across * the commit and pass it back into this routine on the second call. + * + * If we are allocating quota inodes, we do not have a parent inode + * to attach to or associate with (i.e. pip == NULL) because they + * are not linked into the directory structure - they are attached + * directly to the superblock - and so have no parent. */ int xfs_ialloc( @@ -1088,15 +1046,15 @@ xfs_ialloc( { xfs_ino_t ino; xfs_inode_t *ip; - bhv_vnode_t *vp; uint flags; int error; + timespec_t tv; /* * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip->i_ino, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, ialloc_context, call_again, &ino); if (error != 0) { return error; @@ -1113,19 +1071,18 @@ xfs_ialloc( * to prevent others from looking at until we're done. */ error = xfs_trans_iget(tp->t_mountp, tp, ino, - IGET_CREATE, XFS_ILOCK_EXCL, &ip); + XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); if (error != 0) { return error; } ASSERT(ip != NULL); - vp = XFS_ITOV(ip); ip->i_d.di_mode = (__uint16_t)mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); - ip->i_d.di_uid = current_fsuid(cr); - ip->i_d.di_gid = current_fsgid(cr); + ip->i_d.di_uid = current_fsuid(); + ip->i_d.di_gid = current_fsgid(); ip->i_d.di_projid = prid; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); @@ -1135,7 +1092,7 @@ xfs_ialloc( * the inode version number now. This way we only do the conversion * here rather than here and in the flush/logging code. */ - if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) && + if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && ip->i_d.di_version == XFS_DINODE_VERSION_1) { ip->i_d.di_version = XFS_DINODE_VERSION_2; /* @@ -1147,10 +1104,10 @@ xfs_ialloc( /* * Project ids won't be stored on disk if we are using a version 1 inode. */ - if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) + if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) xfs_bump_ino_vers2(tp, ip); - if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { + if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { ip->i_d.di_mode |= S_ISGID; @@ -1169,9 +1126,16 @@ xfs_ialloc( } ip->i_d.di_size = 0; + ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); - xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); + + nanotime(&tv); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_atime = ip->i_d.di_mtime; + ip->i_d.di_ctime = ip->i_d.di_mtime; + /* * di_gen will have been taken care of in xfs_iread. */ @@ -1191,8 +1155,16 @@ xfs_ialloc( flags |= XFS_ILOG_DEV; break; case S_IFREG: + if (pip && xfs_inode_is_filestream(pip)) { + error = xfs_filestream_associate(pip, ip); + if (error < 0) + return -error; + if (!error) + xfs_iflags_set(ip, XFS_IFILESTREAM); + } + /* fall through */ case S_IFDIR: - if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; if ((mode & S_IFMT) == S_IFDIR) { @@ -1203,10 +1175,8 @@ xfs_ialloc( ip->i_d.di_extsize = pip->i_d.di_extsize; } } else if ((mode & S_IFMT) == S_IFREG) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; - ip->i_iocore.io_flags |= XFS_IOCORE_RT; - } if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSIZE; ip->i_d.di_extsize = pip->i_d.di_extsize; @@ -1229,6 +1199,8 @@ xfs_ialloc( if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; ip->i_d.di_flags |= di_flags; } /* FALLTHROUGH */ @@ -1253,7 +1225,7 @@ xfs_ialloc( xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ - bhv_vfs_init_vnode(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1); + xfs_setup_inode(ip); *ipp = ip; return 0; @@ -1279,7 +1251,10 @@ xfs_isize_check( if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) return; - if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE)) + if (XFS_IS_REALTIME_INODE(ip)) + return; + + if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) return; nimaps = 2; @@ -1321,7 +1296,7 @@ xfs_file_last_byte( xfs_fileoff_t size_last_block; int error; - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); mp = ip->i_mount; /* @@ -1339,7 +1314,7 @@ xfs_file_last_byte( } else { last_block = 0; } - size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size); + size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); last_block = XFS_FILEOFF_MAX(last_block, size_last_block); last_byte = XFS_FSB_TO_B(mp, last_block); @@ -1420,7 +1395,7 @@ xfs_itrunc_trace( * must be called again with all the same restrictions as the initial * call. */ -void +int xfs_itruncate_start( xfs_inode_t *ip, uint flags, @@ -1429,18 +1404,19 @@ xfs_itruncate_start( xfs_fsize_t last_byte; xfs_off_t toss_start; xfs_mount_t *mp; - bhv_vnode_t *vp; + int error = 0; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT((flags == XFS_ITRUNC_DEFINITE) || (flags == XFS_ITRUNC_MAYBE)); mp = ip->i_mount; - vp = XFS_ITOV(ip); - vn_iowait(vp); /* wait for the completion of any pending DIOs */ - + /* wait for the completion of any pending DIOs */ + if (new_size == 0 || new_size < ip->i_size) + vn_iowait(ip); + /* * Call toss_pages or flushinval_pages to get rid of pages * overlapping the region being removed. We have to use @@ -1467,72 +1443,74 @@ xfs_itruncate_start( * file size, so there is no way that the data extended * out there. */ - return; + return 0; } last_byte = xfs_file_last_byte(ip); xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, last_byte); if (last_byte > toss_start) { if (flags & XFS_ITRUNC_DEFINITE) { - bhv_vop_toss_pages(vp, toss_start, -1, FI_REMAPF_LOCKED); + xfs_tosspages(ip, toss_start, + -1, FI_REMAPF_LOCKED); } else { - bhv_vop_flushinval_pages(vp, toss_start, -1, FI_REMAPF_LOCKED); + error = xfs_flushinval_pages(ip, toss_start, + -1, FI_REMAPF_LOCKED); } } #ifdef DEBUG if (new_size == 0) { - ASSERT(VN_CACHED(vp) == 0); + ASSERT(VN_CACHED(VFS_I(ip)) == 0); } #endif + return error; } /* - * Shrink the file to the given new_size. The new - * size must be smaller than the current size. - * This will free up the underlying blocks - * in the removed range after a call to xfs_itruncate_start() - * or xfs_atruncate_start(). + * Shrink the file to the given new_size. The new size must be smaller than + * the current size. This will free up the underlying blocks in the removed + * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). + * + * The transaction passed to this routine must have made a permanent log + * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the + * given transaction and start new ones, so make sure everything involved in + * the transaction is tidy before calling here. Some transaction will be + * returned to the caller to be committed. The incoming transaction must + * already include the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On return the inode + * will be "held" within the returned transaction. This routine does NOT + * require any disk space to be reserved for it within the transaction. * - * The transaction passed to this routine must have made - * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. - * This routine may commit the given transaction and - * start new ones, so make sure everything involved in - * the transaction is tidy before calling here. - * Some transaction will be returned to the caller to be - * committed. The incoming transaction must already include - * the inode, and both inode locks must be held exclusively. - * The inode must also be "held" within the transaction. On - * return the inode will be "held" within the returned transaction. - * This routine does NOT require any disk space to be reserved - * for it within the transaction. + * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it + * indicates the fork which is to be truncated. For the attribute fork we only + * support truncation to size 0. * - * The fork parameter must be either xfs_attr_fork or xfs_data_fork, - * and it indicates the fork which is to be truncated. For the - * attribute fork we only support truncation to size 0. + * We use the sync parameter to indicate whether or not the first transaction + * we perform might have to be synchronous. For the attr fork, it needs to be + * so if the unlink of the inode is not yet known to be permanent in the log. + * This keeps us from freeing and reusing the blocks of the attribute fork + * before the unlink of the inode becomes permanent. * - * We use the sync parameter to indicate whether or not the first - * transaction we perform might have to be synchronous. For the attr fork, - * it needs to be so if the unlink of the inode is not yet known to be - * permanent in the log. This keeps us from freeing and reusing the - * blocks of the attribute fork before the unlink of the inode becomes - * permanent. + * For the data fork, we normally have to run synchronously if we're being + * called out of the inactive path or we're being called out of the create path + * where we're truncating an existing file. Either way, the truncate needs to + * be sync so blocks don't reappear in the file with altered data in case of a + * crash. wsync filesystems can run the first case async because anything that + * shrinks the inode has to run sync so by the time we're called here from + * inactive, the inode size is permanently set to 0. * - * For the data fork, we normally have to run synchronously if we're - * being called out of the inactive path or we're being called - * out of the create path where we're truncating an existing file. - * Either way, the truncate needs to be sync so blocks don't reappear - * in the file with altered data in case of a crash. wsync filesystems - * can run the first case async because anything that shrinks the inode - * has to run sync so by the time we're called here from inactive, the - * inode size is permanently set to 0. + * Calls from the truncate path always need to be sync unless we're in a wsync + * filesystem and the file has already been unlinked. * - * Calls from the truncate path always need to be sync unless we're - * in a wsync filesystem and the file has already been unlinked. + * The caller is responsible for correctly setting the sync parameter. It gets + * too hard for us to guess here which path we're being called out of just + * based on inode state. * - * The caller is responsible for correctly setting the sync parameter. - * It gets too hard for us to guess here which path we're being called - * out of just based on inode state. + * If we get an error, we must return with the inode locked and linked into the + * current transaction. This keeps things simple for the higher level code, + * because it always knows that the inode is locked and held in the transaction + * that returns to it whether errors occur or not. We don't mark the inode + * dirty on error so that transactions can be easily aborted if possible. */ int xfs_itruncate_finish( @@ -1553,9 +1531,8 @@ xfs_itruncate_finish( xfs_bmap_free_t free_list; int error; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT(*tp != NULL); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_transp == *tp); @@ -1629,8 +1606,20 @@ xfs_itruncate_finish( */ if (fork == XFS_DATA_FORK) { if (ip->i_d.di_nextents > 0) { - ip->i_d.di_size = new_size; - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } } } else if (sync) { ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); @@ -1674,7 +1663,7 @@ xfs_itruncate_finish( * runs. */ XFS_BMAP_INIT(&free_list, &first_block); - error = XFS_BUNMAPI(mp, ntp, &ip->i_iocore, + error = xfs_bunmapi(ntp, ip, first_unmap_block, unmap_len, XFS_BMAPI_AFLAG(fork) | (sync ? 0 : XFS_BMAPI_ASYNC), @@ -1697,68 +1686,53 @@ xfs_itruncate_finish( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(tp, &free_list, first_block, - &committed); + error = xfs_bmap_finish(tp, &free_list, &committed); ntp = *tp; + if (committed) { + /* link the inode into the next xact in the chain */ + xfs_trans_ijoin(ntp, ip, + XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(ntp, ip); + } + if (error) { /* - * If the bmap finish call encounters an error, - * return to the caller where the transaction - * can be properly aborted. We just need to - * make sure we're not holding any resources - * that we were not when we came in. + * If the bmap finish call encounters an error, return + * to the caller where the transaction can be properly + * aborted. We just need to make sure we're not + * holding any resources that we were not when we came + * in. * - * Aborting from this point might lose some - * blocks in the file system, but oh well. + * Aborting from this point might lose some blocks in + * the file system, but oh well. */ xfs_bmap_cancel(&free_list); - if (committed) { - /* - * If the passed in transaction committed - * in xfs_bmap_finish(), then we want to - * add the inode to this one before returning. - * This keeps things simple for the higher - * level code, because it always knows that - * the inode is locked and held in the - * transaction that returns to it whether - * errors occur or not. We don't mark the - * inode dirty so that this transaction can - * be easily aborted if possible. - */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); - } return error; } if (committed) { /* - * The first xact was committed, - * so add the inode to the new one. - * Mark it dirty so it will be logged - * and moved forward in the log as - * part of every commit. + * Mark the inode dirty so it will be logged and + * moved forward in the log as part of every commit. */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); } + ntp = xfs_trans_dup(ntp); - (void) xfs_trans_commit(*tp, 0, NULL); + error = xfs_trans_commit(*tp, 0); *tp = ntp; - error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - /* - * Add the inode being truncated to the next chained - * transaction. - */ + + /* link the inode into the next transaction in the chain */ xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_ihold(ntp, ip); + + if (!error) + error = xfs_trans_reserve(ntp, 0, + XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); if (error) - return (error); + return error; } /* * Only update the size in the case of the data fork, but @@ -1767,7 +1741,19 @@ xfs_itruncate_finish( */ if (fork == XFS_DATA_FORK) { xfs_isize_check(mp, ip, new_size); - ip->i_d.di_size = new_size; + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + } } xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); ASSERT((new_size != 0) || @@ -1780,72 +1766,6 @@ xfs_itruncate_finish( return 0; } - -/* - * xfs_igrow_start - * - * Do the first part of growing a file: zero any data in the last - * block that is beyond the old EOF. We need to do this before - * the inode is joined to the transaction to modify the i_size. - * That way we can drop the inode lock and call into the buffer - * cache to get the buffer mapping the EOF. - */ -int -xfs_igrow_start( - xfs_inode_t *ip, - xfs_fsize_t new_size, - cred_t *credp) -{ - int error; - - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); - ASSERT(new_size > ip->i_d.di_size); - - /* - * Zero any pages that may have been created by - * xfs_write_file() beyond the end of the file - * and any blocks between the old and new file sizes. - */ - error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, - ip->i_d.di_size, new_size); - return error; -} - -/* - * xfs_igrow_finish - * - * This routine is called to extend the size of a file. - * The inode must have both the iolock and the ilock locked - * for update and it must be a part of the current transaction. - * The xfs_igrow_start() function must have been called previously. - * If the change_flag is not zero, the inode change timestamp will - * be updated. - */ -void -xfs_igrow_finish( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_fsize_t new_size, - int change_flag) -{ - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); - ASSERT(ip->i_transp == tp); - ASSERT(new_size > ip->i_d.di_size); - - /* - * Update the file size. Update the inode change timestamp - * if change_flag set. - */ - ip->i_d.di_size = new_size; - if (change_flag) - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - -} - - /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1884,9 +1804,9 @@ xfs_iunlink( */ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, XFS_FSS_TO_BB(mp, 1), 0, &agibp); - if (error) { + if (error) return error; - } + /* * Validate the magic number of the agi block. */ @@ -1917,12 +1837,11 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); - if (error) { + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); + if (error) return error; - } - ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO); - ASSERT(dip->di_next_unlinked); + + ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); /* both on-disk, don't endian flip twice */ dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_boffset + @@ -2026,17 +1945,17 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", error, mp->m_fsname); return error; } - next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); + next_agino = be32_to_cpu(dip->di_next_unlinked); ASSERT(next_agino != 0); if (next_agino != NULLAGINO) { - INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_boffset + offsetof(xfs_dinode_t, di_next_unlinked); xfs_trans_inode_buf(tp, ibp); @@ -2080,7 +1999,7 @@ xfs_iunlink_remove( error, mp->m_fsname); return error; } - next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT); + next_agino = be32_to_cpu(last_dip->di_next_unlinked); ASSERT(next_agino != NULLAGINO); ASSERT(next_agino != 0); } @@ -2088,18 +2007,18 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", error, mp->m_fsname); return error; } - next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); + next_agino = be32_to_cpu(dip->di_next_unlinked); ASSERT(next_agino != 0); ASSERT(next_agino != agino); if (next_agino != NULLAGINO) { - INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_boffset + offsetof(xfs_dinode_t, di_next_unlinked); xfs_trans_inode_buf(tp, ibp); @@ -2112,7 +2031,7 @@ xfs_iunlink_remove( /* * Point the previous inode on the list to the next inode. */ - INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino); + last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); xfs_trans_inode_buf(tp, last_ibp); @@ -2123,13 +2042,6 @@ xfs_iunlink_remove( return 0; } -static __inline__ int xfs_inode_clean(xfs_inode_t *ip) -{ - return (((ip->i_itemp == NULL) || - !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && - (ip->i_update_core == 0)); -} - STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -2143,11 +2055,10 @@ xfs_ifree_cluster( int i, j, found, pre_flushed; xfs_daddr_t blkno; xfs_buf_t *bp; - xfs_ihash_t *ih; xfs_inode_t *ip, **ip_found; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; - SPLDECL(s); + xfs_perag_t *pag = xfs_get_perag(mp, inum); if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { blks_per_cluster = 1; @@ -2181,23 +2092,20 @@ xfs_ifree_cluster( */ found = 0; for (i = 0; i < ninodes; i++) { - ih = XFS_IHASH(mp, inum + i); - read_lock(&ih->ih_lock); - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == inum + i) - break; - } + read_lock(&pag->pag_ici_lock); + ip = radix_tree_lookup(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, (inum + i))); /* Inode not in memory or we found it already, * nothing to do */ - if (!ip || (ip->i_flags & XFS_ISTALE)) { - read_unlock(&ih->ih_lock); + if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { + read_unlock(&pag->pag_ici_lock); continue; } if (xfs_inode_clean(ip)) { - read_unlock(&ih->ih_lock); + read_unlock(&pag->pag_ici_lock); continue; } @@ -2213,21 +2121,20 @@ xfs_ifree_cluster( if (ip == free_ip) { if (xfs_iflock_nowait(ip)) { - ip->i_flags |= XFS_ISTALE; - + xfs_iflags_set(ip, XFS_ISTALE); if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); } else { ip_found[found++] = ip; } } - read_unlock(&ih->ih_lock); + read_unlock(&pag->pag_ici_lock); continue; } if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { if (xfs_iflock_nowait(ip)) { - ip->i_flags |= XFS_ISTALE; + xfs_iflags_set(ip, XFS_ISTALE); if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); @@ -2239,8 +2146,7 @@ xfs_ifree_cluster( xfs_iunlock(ip, XFS_ILOCK_EXCL); } } - - read_unlock(&ih->ih_lock); + read_unlock(&pag->pag_ici_lock); } bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, @@ -2254,10 +2160,10 @@ xfs_ifree_cluster( iip = (xfs_inode_log_item_t *)lip; ASSERT(iip->ili_logged == 1); lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); iip->ili_flush_lsn = iip->ili_item.li_lsn; - AIL_UNLOCK(mp, s); - iip->ili_inode->i_flags |= XFS_ISTALE; + spin_unlock(&mp->m_ail_lock); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); pre_flushed++; } lip = lip->li_bio_list; @@ -2277,9 +2183,9 @@ xfs_ifree_cluster( iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; iip->ili_logged = 1; - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); iip->ili_flush_lsn = iip->ili_item.li_lsn; - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) @@ -2294,7 +2200,8 @@ xfs_ifree_cluster( xfs_trans_binval(tp, bp); } - kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); + kmem_free(ip_found); + xfs_put_perag(mp, pag); } /* @@ -2316,13 +2223,15 @@ xfs_ifree( int error; int delete; xfs_ino_t first_ino; + xfs_dinode_t *dip; + xfs_buf_t *ibp; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_transp == tp); ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT((ip->i_d.di_size == 0) || + ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); ASSERT(ip->i_d.di_nblocks == 0); @@ -2351,8 +2260,27 @@ xfs_ifree( * by reincarnations of this inode. */ ip->i_d.di_gen++; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); + if (error) + return error; + + /* + * Clear the on-disk di_mode. This is to prevent xfs_bulkstat + * from picking up this inode when it is reclaimed (its incore state + * initialzed but not flushed to disk yet). The in-core di_mode is + * already cleared and a corresponding transaction logged. + * The hack here just synchronizes the in-core to on-disk + * di_mode value in advance before the actual inode sync to disk. + * This is OK because the inode is already unlinked and would never + * change its di_mode again for this inode generation. + * This is a temporary hack that would require a proper fix + * in the future. + */ + dip->di_core.di_mode = 0; + if (delete) { xfs_ifree_cluster(ip, tp, first_ino); } @@ -2484,7 +2412,7 @@ xfs_iroot_realloc( (int)new_size); memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); } - kmem_free(ifp->if_broot, ifp->if_broot_bytes); + kmem_free(ifp->if_broot); ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= @@ -2528,7 +2456,7 @@ xfs_idata_realloc( if (new_size == 0) { if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_data); } ifp->if_u1.if_data = NULL; real_size = 0; @@ -2543,7 +2471,7 @@ xfs_idata_realloc( ASSERT(ifp->if_real_bytes != 0); memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, new_size); - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = ifp->if_u2.if_inline_data; } real_size = 0; @@ -2613,14 +2541,31 @@ xfs_imap( fsbno = imap->im_blkno ? XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); - if (error != 0) { + if (error) return error; - } + imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); imap->im_len = XFS_FSB_TO_BB(mp, len); imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); imap->im_ioffset = (ushort)off; imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " + "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " + " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", + (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return EINVAL; + } return 0; } @@ -2633,7 +2578,7 @@ xfs_idestroy_fork( ifp = XFS_IFORK_PTR(ip, whichfork); if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot, ifp->if_broot_bytes); + kmem_free(ifp->if_broot); ifp->if_broot = NULL; } @@ -2647,7 +2592,7 @@ xfs_idestroy_fork( if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && (ifp->if_u1.if_data != NULL)) { ASSERT(ifp->if_real_bytes != 0); - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = NULL; ifp->if_real_bytes = 0; } @@ -2677,7 +2622,6 @@ void xfs_idestroy( xfs_inode_t *ip) { - switch (ip->i_d.di_mode & S_IFMT) { case S_IFREG: case S_IFDIR: @@ -2689,7 +2633,10 @@ xfs_idestroy( xfs_idestroy_fork(ip, XFS_ATTR_FORK); mrfree(&ip->i_lock); mrfree(&ip->i_iolock); - freesema(&ip->i_flock); + +#ifdef XFS_INODE_TRACE + ktrace_free(ip->i_trace); +#endif #ifdef XFS_BMAP_TRACE ktrace_free(ip->i_xtrace); #endif @@ -2706,10 +2653,23 @@ xfs_idestroy( ktrace_free(ip->i_dir_trace); #endif if (ip->i_itemp) { - /* XXXdpd should be able to assert this but shutdown - * is leaving the AIL behind. */ - ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_mount_t *mp = ip->i_mount; + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&mp->m_ail_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_delete_ail(mp, lip); + else + spin_unlock(&mp->m_ail_lock); + } xfs_inode_item_destroy(ip); } kmem_zone_free(xfs_inode_zone, ip); @@ -2724,7 +2684,7 @@ void xfs_ipin( xfs_inode_t *ip) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); atomic_inc(&ip->i_pincount); } @@ -2740,68 +2700,46 @@ xfs_iunpin( { ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) { - /* - * If the inode is currently being reclaimed, the - * linux inode _and_ the xfs vnode may have been - * freed so we cannot reference either of them safely. - * Hence we should not try to do anything to them - * if the xfs inode is currently in the reclaim - * path. - * - * However, we still need to issue the unpin wakeup - * call as the inode reclaim may be blocked waiting for - * the inode to become unpinned. - */ - if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { - bhv_vnode_t *vp = XFS_ITOV_NULL(ip); - - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); - - if (!(inode->i_state & - (I_NEW|I_FREEING|I_CLEAR))) - mark_inode_dirty_sync(inode); - } - } + if (atomic_dec_and_test(&ip->i_pincount)) wake_up(&ip->i_ipin_wait); - } } /* - * This is called to wait for the given inode to be unpinned. - * It will sleep until this happens. The caller must have the - * inode locked in at least shared mode so that the buffer cannot - * be subsequently pinned once someone is waiting for it to be - * unpinned. + * This is called to unpin an inode. It can be directed to wait or to return + * immediately without waiting for the inode to be unpinned. The caller must + * have the inode locked in at least shared mode so that the buffer cannot be + * subsequently pinned once someone is waiting for it to be unpinned. */ STATIC void -xfs_iunpin_wait( - xfs_inode_t *ip) +__xfs_iunpin_wait( + xfs_inode_t *ip, + int wait) { - xfs_inode_log_item_t *iip; - xfs_lsn_t lsn; + xfs_inode_log_item_t *iip = ip->i_itemp; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); - - if (atomic_read(&ip->i_pincount) == 0) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + if (atomic_read(&ip->i_pincount) == 0) return; - } - iip = ip->i_itemp; - if (iip && iip->ili_last_lsn) { - lsn = iip->ili_last_lsn; - } else { - lsn = (xfs_lsn_t)0; - } + /* Give the log a push to start the unpinning I/O */ + xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? + iip->ili_last_lsn : 0, XFS_LOG_FORCE); + if (wait) + wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); +} - /* - * Give the log a push so we don't wait here too long. - */ - xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); +static inline void +xfs_iunpin_wait( + xfs_inode_t *ip) +{ + __xfs_iunpin_wait(ip, 1); +} - wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); +static inline void +xfs_iunpin_nowait( + xfs_inode_t *ip) +{ + __xfs_iunpin_wait(ip, 0); } @@ -2819,26 +2757,21 @@ xfs_iunpin_wait( int xfs_iextents_copy( xfs_inode_t *ip, - xfs_bmbt_rec_t *buffer, + xfs_bmbt_rec_t *dp, int whichfork) { int copied; - xfs_bmbt_rec_t *dest_ep; - xfs_bmbt_rec_t *ep; -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_iextents_copy"; -#endif int i; xfs_ifork_t *ifp; int nrecs; xfs_fsblock_t start_block; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); ASSERT(nrecs > 0); /* @@ -2847,10 +2780,9 @@ xfs_iextents_copy( * the delayed ones. There must be at least one * non-delayed extent. */ - dest_ep = buffer; copied = 0; for (i = 0; i < nrecs; i++) { - ep = xfs_iext_get_ext(ifp, i); + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); start_block = xfs_bmbt_get_startblock(ep); if (ISNULLSTARTBLOCK(start_block)) { /* @@ -2860,15 +2792,13 @@ xfs_iextents_copy( } /* Translate to on disk format */ - put_unaligned(INT_GET(ep->l0, ARCH_CONVERT), - (__uint64_t*)&dest_ep->l0); - put_unaligned(INT_GET(ep->l1, ARCH_CONVERT), - (__uint64_t*)&dest_ep->l1); - dest_ep++; + put_unaligned(cpu_to_be64(ep->l0), &dp->l0); + put_unaligned(cpu_to_be64(ep->l1), &dp->l1); + dp++; copied++; } ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, 1, XFS_EXTFMT_INODE(ip)); + xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); return (copied * (uint)sizeof(xfs_bmbt_rec_t)); } @@ -2884,7 +2814,7 @@ xfs_iextents_copy( * format indicates the current state of the fork. */ /*ARGSUSED*/ -STATIC int +STATIC void xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -2905,16 +2835,16 @@ xfs_iflush_fork( static const short extflag[2] = { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; - if (iip == NULL) - return 0; + if (!iip) + return; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, * for the attribute fork. */ - if (ifp == NULL) { + if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return 0; + return; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; @@ -2959,7 +2889,7 @@ xfs_iflush_fork( case XFS_DINODE_FMT_DEV: if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); - INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev); + dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev); } break; @@ -2975,17 +2905,155 @@ xfs_iflush_fork( ASSERT(0); break; } +} + +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + unsigned long first_index, mask; + unsigned long inodes_per_cluster; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pag_ici_init); + + inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); + if (!ilist) + return 0; + + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + read_lock(&pag->pag_ici_lock); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + /* if the inode lies outside this cluster, we're done. */ + if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) + break; + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + continue; + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(iq)) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } + +out_free: + read_unlock(&pag->pag_ici_lock); + kmem_free(ilist); return 0; + + +cluster_corrupt_out: + /* + * Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + read_unlock(&pag->pag_ici_lock); + /* + * Clean up the buffer. If it was B_DELWRI, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. + */ + bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + if (bufwasdelwri) + xfs_buf_relse(bp); + + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (XFS_BUF_IODONE_FUNC(bp)) { + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_SHUT(bp); + XFS_BUF_ERROR(bp,EIO); + xfs_biodone(bp); + } else { + XFS_BUF_STALE(bp); + xfs_buf_relse(bp); + } + } + + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(iq); + kmem_free(ilist); + return XFS_ERROR(EFSCORRUPTED); } /* * xfs_iflush() will write a modified inode's changes out to the * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush semaphore must be - * held as well. The inode lock will still be held upon return from + * in at least shared mode and the inode flush completion must be + * active as well. The inode lock will still be held upon return from * the call and the caller is free to unlock it. - * The inode flush lock will be unlocked when the inode reaches the disk. + * The inode flush will be completed when the inode reaches the disk. * The flags indicate how the inode's buffer should be written out. */ int @@ -2998,18 +3066,13 @@ xfs_iflush( xfs_dinode_t *dip; xfs_mount_t *mp; int error; - /* REFERENCED */ - xfs_chash_t *ch; - xfs_inode_t *iq; - int clcount; /* count of inodes clustered */ - int bufwasdelwri; + int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; - SPLDECL(s); XFS_STATS_INC(xs_iflush_count); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); - ASSERT(issemalocked(&(ip->i_flock))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); @@ -3020,20 +3083,27 @@ xfs_iflush( * If the inode isn't dirty, then just release the inode * flush lock and do nothing. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { - ASSERT((iip != NULL) ? - !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); + if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); return 0; } /* - * We can't flush the inode until it is unpinned, so - * wait for it. We know noone new can pin it, because - * we are holding the inode lock shared and you need - * to hold it exclusively to pin the inode. + * We can't flush the inode until it is unpinned, so wait for it if we + * are allowed to block. We know noone new can pin it, because we are + * holding the inode lock shared and you need to hold it exclusively to + * pin the inode. + * + * If we are not allowed to block, force the log out asynchronously so + * that when we come back the inode will be unpinned. If other inodes + * in the same cluster are dirty, they will probably write the inode + * out for us if they occur after the log force completes. */ + if (noblock && xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + xfs_ifunlock(ip); + return EAGAIN; + } xfs_iunpin_wait(ip); /* @@ -3050,15 +3120,6 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. - */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0); - if (error) { - xfs_ifunlock(ip); - return error; - } - - /* * Decide how buffer will be flushed out. This is done before * the call to xfs_iflush_int because this field is zeroed by it. */ @@ -3074,6 +3135,7 @@ xfs_iflush( case XFS_IFLUSH_DELWRI_ELSE_SYNC: flags = 0; break; + case XFS_IFLUSH_ASYNC_NOBLOCK: case XFS_IFLUSH_ASYNC: case XFS_IFLUSH_DELWRI_ELSE_ASYNC: flags = INT_ASYNC; @@ -3093,6 +3155,7 @@ xfs_iflush( case XFS_IFLUSH_DELWRI: flags = INT_DELWRI; break; + case XFS_IFLUSH_ASYNC_NOBLOCK: case XFS_IFLUSH_ASYNC: flags = INT_ASYNC; break; @@ -3107,94 +3170,41 @@ xfs_iflush( } /* - * First flush out the inode that xfs_iflush was called with. + * Get the buffer containing the on-disk inode. */ - error = xfs_iflush_int(ip, bp); - if (error) { - goto corrupt_out; + error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, + noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); + if (error || !bp) { + xfs_ifunlock(ip); + return error; } /* - * inode clustering: - * see if other inodes can be gathered into this write + * First flush out the inode that xfs_iflush was called with. */ - - ip->i_chash->chl_buf = bp; - - ch = XFS_CHASH(mp, ip->i_blkno); - s = mutex_spinlock(&ch->ch_lock); - - clcount = 0; - for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) { - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - iip = iq->i_itemp; - if ((iq->i_update_core == 0) && - ((iip == NULL) || - !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && - xfs_ipincount(iq) == 0) { - continue; - } - - /* - * Try to get locks. If any are unavailable, - * then this inode cannot be flushed and is skipped. - */ - - /* get inode locks (just i_lock) */ - if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { - /* get inode flush lock */ - if (xfs_iflock_nowait(iq)) { - /* check if pinned */ - if (xfs_ipincount(iq) == 0) { - /* arriving here means that - * this inode can be flushed. - * first re-check that it's - * dirty - */ - iip = iq->i_itemp; - if ((iq->i_update_core != 0)|| - ((iip != NULL) && - (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { - clcount++; - error = xfs_iflush_int(iq, bp); - if (error) { - xfs_iunlock(iq, - XFS_ILOCK_SHARED); - goto cluster_corrupt_out; - } - } else { - xfs_ifunlock(iq); - } - } else { - xfs_ifunlock(iq); - } - } - xfs_iunlock(iq, XFS_ILOCK_SHARED); - } - } - mutex_spinunlock(&ch->ch_lock, s); - - if (clcount) { - XFS_STATS_INC(xs_icluster_flushcnt); - XFS_STATS_ADD(xs_icluster_flushinode, clcount); - } + error = xfs_iflush_int(ip, bp); + if (error) + goto corrupt_out; /* - * If the buffer is pinned then push on the log so we won't + * If the buffer is pinned then push on the log now so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)){ + if (XFS_BUF_ISPINNED(bp)) xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - } + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; if (flags & INT_DELWRI) { xfs_bdwrite(mp, bp); } else if (flags & INT_ASYNC) { - xfs_bawrite(mp, bp); + error = xfs_bawrite(mp, bp); } else { error = xfs_bwrite(mp, bp); } @@ -3203,52 +3213,11 @@ xfs_iflush( corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - xfs_iflush_abort(ip); - /* - * Unlocks the flush lock - */ - return XFS_ERROR(EFSCORRUPTED); - cluster_corrupt_out: - /* Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - mutex_spinunlock(&ch->ch_lock, s); - - /* - * Clean up the buffer. If it was B_DELWRI, just release it -- - * brelse can handle it with no problems. If not, shut down the - * filesystem before releasing the buffer. - */ - if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { - xfs_buf_relse(bp); - } - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - if(!bufwasdelwri) { - /* - * Just like incore_relse: if we have b_iodone functions, - * mark the buffer as an error and call them. Otherwise - * mark it as stale and brelse. - */ - if (XFS_BUF_IODONE_FUNC(bp)) { - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_SHUT(bp); - XFS_BUF_ERROR(bp,EIO); - xfs_biodone(bp); - } else { - XFS_BUF_STALE(bp); - xfs_buf_relse(bp); - } - } - - xfs_iflush_abort(iq); /* * Unlocks the flush lock */ + xfs_iflush_abort(ip); return XFS_ERROR(EFSCORRUPTED); } @@ -3264,10 +3233,9 @@ xfs_iflush_int( #ifdef XFS_TRANS_DEBUG int first; #endif - SPLDECL(s); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); - ASSERT(issemalocked(&(ip->i_flock))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); @@ -3279,8 +3247,7 @@ xfs_iflush_int( * If the inode isn't dirty, then just release the inode * flush lock and do nothing. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); return 0; } @@ -3308,11 +3275,11 @@ xfs_iflush_int( */ xfs_synchronize_atime(ip); - if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, + if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", - ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip); + ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip); goto corrupt_out; } if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, @@ -3375,7 +3342,7 @@ xfs_iflush_int( * because if the inode is dirty at all the core must * be. */ - xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1); + xfs_dinode_to_disk(&dip->di_core, &ip->i_d); /* Wrap, we never let the log put out DI_MAX_FLUSH */ if (ip->i_d.di_flushiter == DI_MAX_FLUSH) @@ -3388,14 +3355,14 @@ xfs_iflush_int( * has been updated, then make the conversion permanent. */ ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || - XFS_SB_VERSION_HASNLINK(&mp->m_sb)); + xfs_sb_version_hasnlink(&mp->m_sb)); if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { /* * Convert it back. */ ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink); + dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink); } else { /* * The superblock version has already been bumped, @@ -3403,7 +3370,7 @@ xfs_iflush_int( * format permanent. */ ip->i_d.di_version = XFS_DINODE_VERSION_2; - INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2); + dip->di_core.di_version = XFS_DINODE_VERSION_2; ip->i_d.di_onlink = 0; dip->di_core.di_onlink = 0; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); @@ -3413,16 +3380,9 @@ xfs_iflush_int( } } - if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { - goto corrupt_out; - } - - if (XFS_IFORK_Q(ip)) { - /* - * The only error from xfs_iflush_fork is on the data fork. - */ - (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); - } + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); xfs_inobp_check(mp, bp); /* @@ -3459,9 +3419,9 @@ xfs_iflush_int( iip->ili_logged = 1; ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); iip->ili_flush_lsn = iip->ili_item.li_lsn; - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); /* * Attach the function xfs_iflush_done to the inode's @@ -3507,7 +3467,6 @@ xfs_iflush_all( xfs_mount_t *mp) { xfs_inode_t *ip; - bhv_vnode_t *vp; again: XFS_MOUNT_ILOCK(mp); @@ -3522,14 +3481,13 @@ xfs_iflush_all( continue; } - vp = XFS_ITOV_NULL(ip); - if (!vp) { + if (!VFS_I(ip)) { XFS_MOUNT_IUNLOCK(mp); xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); goto again; } - ASSERT(vn_count(vp) == 0); + ASSERT(vn_count(VFS_I(ip)) == 0); ip = ip->i_mnext; } while (ip != mp->m_inodes); @@ -3537,95 +3495,6 @@ xfs_iflush_all( XFS_MOUNT_IUNLOCK(mp); } -/* - * xfs_iaccess: check accessibility of inode for mode. - */ -int -xfs_iaccess( - xfs_inode_t *ip, - mode_t mode, - cred_t *cr) -{ - int error; - mode_t orgmode = mode; - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); - - if (mode & S_IWUSR) { - umode_t imode = inode->i_mode; - - if (IS_RDONLY(inode) && - (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode))) - return XFS_ERROR(EROFS); - - if (IS_IMMUTABLE(inode)) - return XFS_ERROR(EACCES); - } - - /* - * If there's an Access Control List it's used instead of - * the mode bits. - */ - if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1) - return error ? XFS_ERROR(error) : 0; - - if (current_fsuid(cr) != ip->i_d.di_uid) { - mode >>= 3; - if (!in_group_p((gid_t)ip->i_d.di_gid)) - mode >>= 3; - } - - /* - * If the DACs are ok we don't need any capability check. - */ - if ((ip->i_d.di_mode & mode) == mode) - return 0; - /* - * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. - */ - if (!(orgmode & S_IXUSR) || - (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) - if (capable_cred(cr, CAP_DAC_OVERRIDE)) - return 0; - - if ((orgmode == S_IRUSR) || - (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) { - if (capable_cred(cr, CAP_DAC_READ_SEARCH)) - return 0; -#ifdef NOISE - cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode); -#endif /* NOISE */ - return XFS_ERROR(EACCES); - } - return XFS_ERROR(EACCES); -} - -/* - * xfs_iroundup: round up argument to next power of two - */ -uint -xfs_iroundup( - uint v) -{ - int i; - uint m; - - if ((v & (v - 1)) == 0) - return v; - ASSERT((v & 0x80000000) == 0); - if ((v & (v + 1)) == 0) - return v + 1; - for (i = 0, m = 1; i < 31; i++, m <<= 1) { - if (v & m) - continue; - v |= m; - if ((v & (v + 1)) == 0) - return v + 1; - } - ASSERT(0); - return( 0 ); -} - #ifdef XFS_ILOCK_TRACE ktrace_t *xfs_ilock_trace_buf; @@ -3646,7 +3515,7 @@ xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) /* * Return a pointer to the extent record at file index idx. */ -xfs_bmbt_rec_t * +xfs_bmbt_rec_host_t * xfs_iext_get_ext( xfs_ifork_t *ifp, /* inode fork pointer */ xfs_extnum_t idx) /* index of target extent */ @@ -3679,15 +3548,12 @@ xfs_iext_insert( xfs_extnum_t count, /* number of inserted items */ xfs_bmbt_irec_t *new) /* items to insert */ { - xfs_bmbt_rec_t *ep; /* extent record pointer */ xfs_extnum_t i; /* extent record index */ ASSERT(ifp->if_flags & XFS_IFEXTENTS); xfs_iext_add(ifp, idx, count); - for (i = idx; i < idx + count; i++, new++) { - ep = xfs_iext_get_ext(ifp, i); - xfs_bmbt_set_all(ep, new); - } + for (i = idx; i < idx + count; i++, new++) + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); } /* @@ -3841,7 +3707,7 @@ xfs_iext_add_indirect_multi( * (all extents past */ if (nex2) { byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); erp->er_extcount -= nex2; xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); @@ -3907,7 +3773,7 @@ xfs_iext_add_indirect_multi( erp = xfs_iext_irec_new(ifp, erp_idx); } memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep, byte_diff); + kmem_free(nex2_ep); erp->er_extcount += nex2; xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); } @@ -4134,15 +4000,14 @@ xfs_iext_realloc_direct( ifp->if_bytes = new_size; return; } - if ((new_size & (new_size - 1)) != 0) { - rnew_size = xfs_iroundup(new_size); + if (!is_power_of_2(new_size)){ + rnew_size = roundup_pow_of_two(new_size); } if (rnew_size != ifp->if_real_bytes) { - ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) + ifp->if_u1.if_extents = kmem_realloc(ifp->if_u1.if_extents, rnew_size, - ifp->if_real_bytes, - KM_SLEEP); + ifp->if_real_bytes, KM_NOFS); } if (rnew_size > ifp->if_real_bytes) { memset(&ifp->if_u1.if_extents[ifp->if_bytes / @@ -4157,8 +4022,8 @@ xfs_iext_realloc_direct( */ else { new_size += ifp->if_bytes; - if ((new_size & (new_size - 1)) != 0) { - rnew_size = xfs_iroundup(new_size); + if (!is_power_of_2(new_size)) { + rnew_size = roundup_pow_of_two(new_size); } xfs_iext_inline_to_direct(ifp, rnew_size); } @@ -4183,7 +4048,7 @@ xfs_iext_direct_to_inline( */ memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_extents); ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; ifp->if_real_bytes = 0; } @@ -4201,8 +4066,7 @@ xfs_iext_inline_to_direct( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* number of extents in file */ { - ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) - kmem_alloc(new_size, KM_SLEEP); + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); memset(ifp->if_u1.if_extents, 0, new_size); if (ifp->if_bytes) { memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, @@ -4234,7 +4098,7 @@ xfs_iext_realloc_indirect( } else { ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_SLEEP); + new_size, size, KM_NOFS); } } @@ -4245,7 +4109,7 @@ void xfs_iext_indirect_to_direct( xfs_ifork_t *ifp) /* inode fork pointer */ { - xfs_bmbt_rec_t *ep; /* extent record pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ xfs_extnum_t nextents; /* number of extents in file */ int size; /* size of file extents */ @@ -4254,11 +4118,11 @@ xfs_iext_indirect_to_direct( ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact_full(ifp); + xfs_iext_irec_compact_pages(ifp); ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t)); + kmem_free(ifp->if_u1.if_ext_irec); ifp->if_flags &= ~XFS_IFEXTIREC; ifp->if_u1.if_extents = ep; ifp->if_bytes = size; @@ -4284,7 +4148,7 @@ xfs_iext_destroy( } ifp->if_flags &= ~XFS_IFEXTIREC; } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); + kmem_free(ifp->if_u1.if_extents); } else if (ifp->if_bytes) { memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)); @@ -4297,15 +4161,15 @@ xfs_iext_destroy( /* * Return a pointer to the extent record for file system block bno. */ -xfs_bmbt_rec_t * /* pointer to found extent record */ +xfs_bmbt_rec_host_t * /* pointer to found extent record */ xfs_iext_bno_to_ext( xfs_ifork_t *ifp, /* inode fork pointer */ xfs_fileoff_t bno, /* block number to search for */ xfs_extnum_t *idxp) /* index of target extent */ { - xfs_bmbt_rec_t *base; /* pointer to first extent */ + xfs_bmbt_rec_host_t *base; /* pointer to first extent */ xfs_filblks_t blockcount = 0; /* number of blocks in extent */ - xfs_bmbt_rec_t *ep = NULL; /* pointer to target extent */ + xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ int high; /* upper boundary in search */ xfs_extnum_t idx = 0; /* index of target extent */ @@ -4476,12 +4340,10 @@ xfs_iext_irec_init( nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); ASSERT(nextents <= XFS_LINEAR_EXTS); - erp = (xfs_ext_irec_t *) - kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP); + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); if (nextents == 0) { - ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) - kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); } else if (!ifp->if_real_bytes) { xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { @@ -4529,8 +4391,7 @@ xfs_iext_irec_new( /* Initialize new extent record */ erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = (xfs_bmbt_rec_t *) - kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); erp[erp_idx].er_extcount = 0; @@ -4557,7 +4418,7 @@ xfs_iext_irec_remove( if (erp->er_extbuf) { xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -erp->er_extcount); - kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ); + kmem_free(erp->er_extbuf); } /* Compact extent records */ erp = ifp->if_u1.if_ext_irec; @@ -4575,8 +4436,7 @@ xfs_iext_irec_remove( xfs_iext_realloc_indirect(ifp, nlists * sizeof(xfs_ext_irec_t)); } else { - kmem_free(ifp->if_u1.if_ext_irec, - sizeof(xfs_ext_irec_t)); + kmem_free(ifp->if_u1.if_ext_irec); } ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; } @@ -4589,8 +4449,7 @@ xfs_iext_irec_remove( * compaction policy is as follows: * * Full Compaction: Extents fit into a single page (or inline buffer) - * Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space + * Partial Compaction: Extents occupy less than 50% of allocated space * No Compaction: Extents occupy at least 50% of allocated space */ void @@ -4611,8 +4470,6 @@ xfs_iext_irec_compact( xfs_iext_direct_to_inline(ifp, nextents); } else if (nextents <= XFS_LINEAR_EXTS) { xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { - xfs_iext_irec_compact_full(ifp); } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { xfs_iext_irec_compact_pages(ifp); } @@ -4636,7 +4493,7 @@ xfs_iext_irec_compact_pages( erp_next = erp + 1; if (erp_next->er_extcount <= (XFS_LINEAR_EXTS - erp->er_extcount)) { - memmove(&erp->er_extbuf[erp->er_extcount], + memcpy(&erp->er_extbuf[erp->er_extcount], erp_next->er_extbuf, erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); erp->er_extcount += erp_next->er_extcount; @@ -4645,7 +4502,7 @@ xfs_iext_irec_compact_pages( * so er_extoffs don't get modified in * xfs_iext_irec_remove. */ - kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ); + kmem_free(erp_next->er_extbuf); erp_next->er_extbuf = NULL; xfs_iext_irec_remove(ifp, erp_idx + 1); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; @@ -4656,68 +4513,6 @@ xfs_iext_irec_compact_pages( } /* - * Fully compact the extent records managed by the indirection array. - */ -void -xfs_iext_irec_compact_full( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_t *ep, *ep_next; /* extent record pointers */ - xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ - int erp_idx = 0; /* extent irec index */ - int ext_avail; /* empty entries in ex list */ - int ext_diff; /* number of exts to add */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = ifp->if_u1.if_ext_irec; - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - while (erp_idx < nlists - 1) { - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - ext_diff = MIN(ext_avail, erp_next->er_extcount); - memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += ext_diff; - erp_next->er_extcount -= ext_diff; - /* Remove next page */ - if (erp_next->er_extcount == 0) { - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf, - erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - /* Update next page */ - } else { - /* Move rest of page up to become next new page */ - memmove(erp_next->er_extbuf, ep_next, - erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); - ep_next = erp_next->er_extbuf; - memset(&ep_next[erp_next->er_extcount], 0, - (XFS_LINEAR_EXTS - erp_next->er_extcount) * - sizeof(xfs_bmbt_rec_t)); - } - if (erp->er_extcount == XFS_LINEAR_EXTS) { - erp_idx++; - if (erp_idx < nlists) - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - else - break; - } - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - } -} - -/* * This is called to update the er_extoff field in the indirection * array when extents have been added or removed from one of the * extent lists. erp_idx contains the irec index to begin updating diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index d10b76e..1420c49 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -18,6 +18,10 @@ #ifndef __XFS_INODE_H__ #define __XFS_INODE_H__ +struct xfs_dinode; +struct xfs_dinode_core; + + /* * Fork identifiers. */ @@ -44,7 +48,7 @@ * it is possible that a third level of indirection may be required. */ typedef struct xfs_ext_irec { - xfs_bmbt_rec_t *er_extbuf; /* block of extent records */ + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ xfs_extnum_t er_extoff; /* extent offset in file */ xfs_extnum_t er_extcount; /* number of extents in page/block */ } xfs_ext_irec_t; @@ -65,12 +69,12 @@ typedef struct xfs_ifork { unsigned char if_ext_max; /* max # of extent records */ xfs_extnum_t if_lastex; /* last if_extents used */ union { - xfs_bmbt_rec_t *if_extents; /* linear map file exts */ + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ char *if_data; /* inline file data */ } if_u1; union { - xfs_bmbt_rec_t if_inline_ext[XFS_INLINE_EXTS]; + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; /* very small file extents */ char if_inline_data[XFS_INLINE_DATA]; /* very small file data */ @@ -83,8 +87,7 @@ typedef struct xfs_ifork { * Flags for xfs_ichgtime(). */ #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */ -#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ /* * Per-fork incore inode flags. @@ -102,7 +105,6 @@ typedef struct xfs_ifork { #ifdef __KERNEL__ struct bhv_desc; -struct bhv_vnode; struct cred; struct ktrace; struct xfs_buf; @@ -129,81 +131,6 @@ typedef struct dm_attrs_s { __uint16_t da_pad; /* DMIG extra padding */ } dm_attrs_t; -typedef struct xfs_iocore { - void *io_obj; /* pointer to container - * inode or dcxvn structure */ - struct xfs_mount *io_mount; /* fs mount struct ptr */ -#ifdef DEBUG - mrlock_t *io_lock; /* inode IO lock */ - mrlock_t *io_iolock; /* inode IO lock */ -#endif - - /* I/O state */ - xfs_fsize_t io_new_size; /* sz when write completes */ - - /* Miscellaneous state. */ - unsigned int io_flags; /* IO related flags */ - - /* DMAPI state */ - dm_attrs_t io_dmattrs; - -} xfs_iocore_t; - -#define io_dmevmask io_dmattrs.da_dmevmask -#define io_dmstate io_dmattrs.da_dmstate - -#define XFS_IO_INODE(io) ((xfs_inode_t *) ((io)->io_obj)) -#define XFS_IO_DCXVN(io) ((dcxvn_t *) ((io)->io_obj)) - -/* - * Flags in the flags field - */ - -#define XFS_IOCORE_RT 0x1 - -/* - * xfs_iocore prototypes - */ - -extern void xfs_iocore_inode_init(struct xfs_inode *); -extern void xfs_iocore_inode_reinit(struct xfs_inode *); - - -/* - * This is the type used in the xfs inode hash table. - * An array of these is allocated for each mounted - * file system to hash the inodes for that file system. - */ -typedef struct xfs_ihash { - struct xfs_inode *ih_next; - rwlock_t ih_lock; - uint ih_version; -} xfs_ihash_t; - -#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize)) - -/* - * This is the xfs inode cluster hash. This hash is used by xfs_iflush to - * find inodes that share a cluster and can be flushed to disk at the same - * time. - */ -typedef struct xfs_chashlist { - struct xfs_chashlist *chl_next; - struct xfs_chashlist *chl_prev; - struct xfs_inode *chl_ip; - xfs_daddr_t chl_blkno; /* starting block number of - * the cluster */ - struct xfs_buf *chl_buf; /* the inode buffer */ -} xfs_chashlist_t; - -typedef struct xfs_chash { - xfs_chashlist_t *ch_list; - lock_t ch_lock; -} xfs_chash_t; - -#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize)) - - /* * This is the xfs in-core inode structure. * Most of the on-disk inode is embedded in the i_d field. @@ -227,25 +154,56 @@ typedef struct xfs_chash { * chain off the mount structure by xfs_sync calls. */ +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode_core + * in xfs_dinode.h except for the endianess annotations. + */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid; /* owner's project id */ + __uint8_t di_pad[8]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ +} xfs_icdinode_t; + typedef struct { - struct xfs_ihash *ip_hash; /* pointer to hash header */ - struct xfs_inode *ip_next; /* inode hash link forw */ struct xfs_inode *ip_mnext; /* next inode in mount list */ struct xfs_inode *ip_mprev; /* ptr to prev inode */ - struct xfs_inode **ip_prevp; /* ptr to prev i_next */ struct xfs_mount *ip_mount; /* fs mount struct ptr */ } xfs_iptr_t; typedef struct xfs_inode { /* Inode linking and identification information. */ - struct xfs_ihash *i_hash; /* pointer to hash header */ - struct xfs_inode *i_next; /* inode hash link forw */ struct xfs_inode *i_mnext; /* next inode in mount list */ struct xfs_inode *i_mprev; /* ptr to prev inode */ - struct xfs_inode **i_prevp; /* ptr to prev i_next */ struct xfs_mount *i_mount; /* fs mount struct ptr */ struct list_head i_reclaim; /* reclaim list */ - struct bhv_desc i_bhv_desc; /* inode behavior descriptor*/ + struct inode *i_vnode; /* vnode backpointer */ struct xfs_dquot *i_udquot; /* user dquot */ struct xfs_dquot *i_gdquot; /* group dquot */ @@ -264,16 +222,10 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ - sema_t i_flock; /* inode flush lock */ + struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ -#ifdef HAVE_REFCACHE - struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ - struct xfs_inode *i_release; /* inode to unref */ -#endif - /* I/O state */ - xfs_iocore_t i_iocore; /* I/O core */ - + spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ @@ -281,12 +233,15 @@ typedef struct xfs_inode { unsigned int i_gen; /* generation count */ unsigned int i_delayed_blks; /* count of delay alloc blks */ - xfs_dinode_core_t i_d; /* most of ondisk inode */ - xfs_chashlist_t *i_chash; /* cluster hash list header */ - struct xfs_inode *i_cnext; /* cluster hash link forward */ - struct xfs_inode *i_cprev; /* cluster hash link backward */ + xfs_icdinode_t i_d; /* most of ondisk inode */ + xfs_fsize_t i_size; /* in-memory size */ + xfs_fsize_t i_new_size; /* size when write completes */ + atomic_t i_iocount; /* outstanding I/O count */ /* Trace buffers per inode. */ +#ifdef XFS_INODE_TRACE + struct ktrace *i_trace; /* general inode trace */ +#endif #ifdef XFS_BMAP_TRACE struct ktrace *i_xtrace; /* inode extent list trace */ #endif @@ -304,23 +259,116 @@ typedef struct xfs_inode { #endif } xfs_inode_t; +#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ + (ip)->i_size : (ip)->i_d.di_size; + +/* Convert from vfs inode to xfs inode */ +static inline struct xfs_inode *XFS_I(struct inode *inode) +{ + return (struct xfs_inode *)inode->i_private; +} + +/* convert from xfs inode to vfs inode */ +static inline struct inode *VFS_I(struct xfs_inode *ip) +{ + return (struct inode *)ip->i_vnode; +} + +/* + * i_flags helper functions + */ +static inline void +__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + ip->i_flags |= flags; +} + +static inline void +xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + __xfs_iflags_set(ip, flags); + spin_unlock(&ip->i_flags_lock); +} + +static inline void +xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); +} + +static inline int +__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + return (ip->i_flags & flags); +} + +static inline int +xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + spin_lock(&ip->i_flags_lock); + ret = __xfs_iflags_test(ip, flags); + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (ret) + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} #endif /* __KERNEL__ */ /* * Fork handling. */ -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp) -#define XFS_IFORK_Q(ip) XFS_CFORK_Q(&(ip)->i_d) -#define XFS_IFORK_DSIZE(ip) XFS_CFORK_DSIZE(&ip->i_d, ip->i_mount) -#define XFS_IFORK_ASIZE(ip) XFS_CFORK_ASIZE(&ip->i_d, ip->i_mount) -#define XFS_IFORK_SIZE(ip,w) XFS_CFORK_SIZE(&ip->i_d, ip->i_mount, w) -#define XFS_IFORK_FORMAT(ip,w) XFS_CFORK_FORMAT(&ip->i_d, w) -#define XFS_IFORK_FMT_SET(ip,w,n) XFS_CFORK_FMT_SET(&ip->i_d, w, n) -#define XFS_IFORK_NEXTENTS(ip,w) XFS_CFORK_NEXTENTS(&ip->i_d, w) -#define XFS_IFORK_NEXT_SET(ip,w,n) XFS_CFORK_NEXT_SET(&ip->i_d, w, n) +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) #ifdef __KERNEL__ @@ -334,29 +382,57 @@ typedef struct xfs_inode { #define XFS_ISTALE 0x0010 /* inode has been staled */ #define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ #define XFS_INEW 0x0040 +#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */ +#define XFS_IMODIFIED 0x0100 /* XFS inode state possibly differs */ + /* to the Linux inode state. */ +#define XFS_ITRUNCATED 0x0200 /* truncated down so flush-on-close */ /* * Flags for inode locking. + * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) + * 1<<16 - 1<<32-1 -- lockdep annotation (integers) + */ +#define XFS_IOLOCK_EXCL (1<<0) +#define XFS_IOLOCK_SHARED (1<<1) +#define XFS_ILOCK_EXCL (1<<2) +#define XFS_ILOCK_SHARED (1<<3) +#define XFS_IUNLOCK_NONOTIFY (1<<4) + +#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) + +/* + * Flags for lockdep annotations. + * + * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes + * (ie directory operations that require locking a directory inode and + * an entry inode). The first inode gets locked with this flag so it + * gets a lockdep subclass of 1 and the second lock will have a lockdep + * subclass of 0. + * + * XFS_LOCK_INUMORDER - for locking several inodes at the some time + * with xfs_lock_inodes(). This flag is used as the starting subclass + * and each subsequent lock acquired will increment the subclass by one. + * So the first lock acquired will have a lockdep subclass of 2, the + * second lock will have a lockdep subclass of 3, and so on. It is + * the responsibility of the class builder to shift this to the correct + * portion of the lock_mode lockdep mask. */ -#define XFS_IOLOCK_EXCL 0x001 -#define XFS_IOLOCK_SHARED 0x002 -#define XFS_ILOCK_EXCL 0x004 -#define XFS_ILOCK_SHARED 0x008 -#define XFS_IUNLOCK_NONOTIFY 0x010 -/* XFS_IOLOCK_NESTED 0x020 */ -#define XFS_EXTENT_TOKEN_RD 0x040 -#define XFS_SIZE_TOKEN_RD 0x080 -#define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD) -#define XFS_WILLLEND 0x100 /* Always acquire tokens for lending */ -#define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND) -#define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND) -#define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND) -/* XFS_SIZE_TOKEN_WANT 0x200 */ - -#define XFS_LOCK_MASK \ - (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \ - XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \ - XFS_WILLLEND) +#define XFS_LOCK_PARENT 1 +#define XFS_LOCK_INUMORDER 2 + +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) + +#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) + +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) /* * Flags for xfs_iflush() @@ -366,6 +442,7 @@ typedef struct xfs_inode { #define XFS_IFLUSH_SYNC 3 #define XFS_IFLUSH_ASYNC 4 #define XFS_IFLUSH_DELWRI 5 +#define XFS_IFLUSH_ASYNC_NOBLOCK 6 /* * Flags for xfs_itruncate_start(). @@ -373,34 +450,28 @@ typedef struct xfs_inode { #define XFS_ITRUNC_DEFINITE 0x1 #define XFS_ITRUNC_MAYBE 0x2 -#define XFS_ITOV(ip) BHV_TO_VNODE(XFS_ITOBHV(ip)) -#define XFS_ITOV_NULL(ip) BHV_TO_VNODE_NULL(XFS_ITOBHV(ip)) -#define XFS_ITOBHV(ip) ((struct bhv_desc *)(&((ip)->i_bhv_desc))) -#define XFS_BHVTOI(bhvp) ((xfs_inode_t *)((char *)(bhvp) - \ - (char *)&(((xfs_inode_t *)0)->i_bhv_desc))) -#define BHV_IS_XFS(bdp) (BHV_OPS(bdp) == &xfs_vnodeops) - /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and * new subdirectory gets S_ISGID bit from parent. */ -#define XFS_INHERIT_GID(pip, vfsp) \ - (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID)) +#define XFS_INHERIT_GID(pip) \ + (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ + ((pip)->i_d.di_mode & S_ISGID)) /* - * xfs_iget.c prototypes. + * Flags for xfs_iget() */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_BULKSTAT 0x2 -#define IGET_CREATE 1 - +/* + * xfs_iget.c prototypes. + */ void xfs_ihash_init(struct xfs_mount *); void xfs_ihash_free(struct xfs_mount *); -void xfs_chash_init(struct xfs_mount *); -void xfs_chash_free(struct xfs_mount *); xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, struct xfs_trans *); -void xfs_inode_lock_init(xfs_inode_t *, struct bhv_vnode *); int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, uint, uint, xfs_inode_t **, xfs_daddr_t); void xfs_iput(xfs_inode_t *, uint); @@ -409,11 +480,9 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -void xfs_iflock(xfs_inode_t *); -int xfs_iflock_nowait(xfs_inode_t *); +int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_ifunlock(xfs_inode_t *); void xfs_ireclaim(xfs_inode_t *); int xfs_finish_reclaim(xfs_inode_t *, int, int); int xfs_finish_reclaim_all(struct xfs_mount *, int); @@ -422,27 +491,27 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int); * xfs_inode.c prototypes. */ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, - xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **, - xfs_daddr_t, uint); + xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **, + xfs_daddr_t, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - xfs_inode_t **, xfs_daddr_t); + xfs_inode_t **, xfs_daddr_t, uint); int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); -void xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *, - int); +void xfs_dinode_from_disk(struct xfs_icdinode *, + struct xfs_dinode_core *); +void xfs_dinode_to_disk(struct xfs_dinode_core *, + struct xfs_icdinode *); + uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode_core *); +uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); -void xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); +int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, xfs_fsize_t, int, int); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); -int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *); -void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *, - xfs_fsize_t, int); void xfs_idestroy_fork(xfs_inode_t *, int); void xfs_idestroy(xfs_inode_t *); @@ -455,17 +524,15 @@ void xfs_iunpin(xfs_inode_t *); int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int); int xfs_iflush(xfs_inode_t *, uint); void xfs_iflush_all(struct xfs_mount *); -int xfs_iaccess(xfs_inode_t *, mode_t, cred_t *); -uint xfs_iroundup(uint); void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); -void xfs_lock_inodes(xfs_inode_t **, int, int, uint); - -xfs_inode_t *xfs_vtoi(struct bhv_vnode *vp); +void xfs_lock_inodes(xfs_inode_t **, int, uint); +void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); void xfs_synchronize_atime(xfs_inode_t *); +void xfs_mark_inode_dirty_sync(xfs_inode_t *); -xfs_bmbt_rec_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); +xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, xfs_bmbt_irec_t *); void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); @@ -480,7 +547,7 @@ void xfs_iext_indirect_to_direct(xfs_ifork_t *); void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); void xfs_iext_inline_to_direct(xfs_ifork_t *, int); void xfs_iext_destroy(xfs_ifork_t *); -xfs_bmbt_rec_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); +xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *); xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int); void xfs_iext_irec_init(xfs_ifork_t *); @@ -505,11 +572,30 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ -extern struct kmem_zone *xfs_chashlist_zone; extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; +/* + * Manage the i_flush queue embedded in the inode. This completion + * queue synchronizes processes attempting to flush the in-core + * inode back to disk. + */ +static inline void xfs_iflock(xfs_inode_t *ip) +{ + wait_for_completion(&ip->i_flush); +} + +static inline int xfs_iflock_nowait(xfs_inode_t *ip) +{ + return try_wait_for_completion(&ip->i_flush); +} + +static inline void xfs_ifunlock(xfs_inode_t *ip) +{ + complete(&ip->i_flush); +} + #endif /* __KERNEL__ */ #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f8e80d8..97c7452 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -40,6 +40,7 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_rw.h" +#include "xfs_error.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -274,6 +275,11 @@ xfs_inode_item_format( */ xfs_synchronize_atime(ip); + /* + * make sure the linux inode is dirty + */ + xfs_mark_inode_dirty_sync(ip); + vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(xfs_dinode_core_t); XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); @@ -291,9 +297,9 @@ xfs_inode_item_format( */ mp = ip->i_mount; ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || - XFS_SB_VERSION_HASNLINK(&mp->m_sb)); + xfs_sb_version_hasnlink(&mp->m_sb)); if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { /* * Convert it back. */ @@ -541,7 +547,7 @@ STATIC void xfs_inode_item_pin( xfs_inode_log_item_t *iip) { - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); xfs_ipin(iip->ili_inode); } @@ -615,7 +621,7 @@ xfs_inode_item_trylock( return XFS_ITEM_PUSHBUF; } else { /* - * We hold the AIL_LOCK, so we must specify the + * We hold the AIL lock, so we must specify the * NONOTIFY flag so that we won't double trip. */ xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); @@ -658,13 +664,13 @@ xfs_inode_item_unlock( ASSERT(iip != NULL); ASSERT(iip->ili_inode->i_itemp != NULL); - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); ASSERT((!(iip->ili_inode->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE)); + xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL)); ASSERT((!(iip->ili_inode->i_itemp->ili_flags & XFS_ILI_IOLOCKED_SHARED)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS)); + xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED)); /* * Clear the transaction pointer in the inode. */ @@ -680,7 +686,7 @@ xfs_inode_item_unlock( ASSERT(ip->i_d.di_nextents > 0); ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes); + kmem_free(iip->ili_extents_buf); iip->ili_extents_buf = NULL; } if (iip->ili_aextents_buf != NULL) { @@ -688,7 +694,7 @@ xfs_inode_item_unlock( ASSERT(ip->i_d.di_anextents > 0); ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes); + kmem_free(iip->ili_aextents_buf); iip->ili_aextents_buf = NULL; } @@ -743,28 +749,13 @@ xfs_inode_item_committed( } /* - * The transaction with the inode locked has aborted. The inode - * must not be dirty within the transaction (unless we're forcibly - * shutting down). We simply unlock just as if the transaction - * had been cancelled. - */ -STATIC void -xfs_inode_item_abort( - xfs_inode_log_item_t *iip) -{ - xfs_inode_item_unlock(iip); - return; -} - - -/* * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK * failed to get the inode flush lock but did get the inode locked SHARED. * Here we're trying to see if the inode buffer is incore, and if so whether it's * marked delayed write. If that's the case, we'll initiate a bawrite on that * buffer to expedite the process. * - * We aren't holding the AIL_LOCK (or the flush lock) when this gets called, + * We aren't holding the AIL lock (or the flush lock) when this gets called, * so it is inherently race-y. */ STATIC void @@ -778,7 +769,7 @@ xfs_inode_item_pushbuf( ip = iip->ili_inode; - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); /* * The ili_pushbuf_flag keeps others from @@ -788,11 +779,10 @@ xfs_inode_item_pushbuf( ASSERT(iip->ili_push_owner == current_pid()); /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. + * If a flush is not in progress anymore, chances are that the + * inode was taken off the AIL. So, just get out. */ - if (!issemalocked(&(ip->i_flock)) || + if (completion_done(&ip->i_flush) || ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -807,14 +797,14 @@ xfs_inode_item_pushbuf( if (XFS_BUF_ISDELAYWRITE(bp)) { /* * We were racing with iflush because we don't hold - * the AIL_LOCK or the flush lock. However, at this point, + * the AIL lock or the flush lock. However, at this point, * we have the buffer, and we know that it's dirty. * So, it's possible that iflush raced with us, and * this item is already taken off the AIL. * If not, we can flush it async. */ dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && - issemalocked(&(ip->i_flock))); + !completion_done(&ip->i_flush)); iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_buftrace("INODE ITEM PUSH", bp); @@ -823,7 +813,12 @@ xfs_inode_item_pushbuf( XFS_LOG_FORCE); } if (dopush) { - xfs_bawrite(mp, bp); + int error; + error = xfs_bawrite(mp, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p", + error, iip, bp); } else { xfs_buf_relse(bp); } @@ -861,8 +856,8 @@ xfs_inode_item_push( ip = iip->ili_inode; - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); - ASSERT(issemalocked(&(ip->i_flock))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); /* * Since we were able to lock the inode's flush lock and * we found it on the AIL, the inode must be dirty. This @@ -902,7 +897,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -STATIC struct xfs_item_ops xfs_inode_item_ops = { +static struct xfs_item_ops xfs_inode_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_inode_item_format, @@ -915,7 +910,6 @@ STATIC struct xfs_item_ops xfs_inode_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_inode_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort, .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_inode_item_committing @@ -962,8 +956,7 @@ xfs_inode_item_destroy( { #ifdef XFS_TRANS_DEBUG if (ip->i_itemp->ili_root_size != 0) { - kmem_free(ip->i_itemp->ili_orig_root, - ip->i_itemp->ili_root_size); + kmem_free(ip->i_itemp->ili_orig_root); } #endif kmem_zone_free(xfs_ili_zone, ip->i_itemp); @@ -984,7 +977,6 @@ xfs_iflush_done( xfs_inode_log_item_t *iip) { xfs_inode_t *ip; - SPLDECL(s); ip = iip->ili_inode; @@ -999,15 +991,15 @@ xfs_iflush_done( */ if (iip->ili_logged && (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { - AIL_LOCK(ip->i_mount, s); + spin_lock(&ip->i_mount->m_ail_lock); if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { /* * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(ip->i_mount, - (xfs_log_item_t*)iip, s); + (xfs_log_item_t*)iip); } else { - AIL_UNLOCK(ip->i_mount, s); + spin_unlock(&ip->i_mount->m_ail_lock); } } @@ -1041,21 +1033,19 @@ xfs_iflush_abort( { xfs_inode_log_item_t *iip; xfs_mount_t *mp; - SPLDECL(s); iip = ip->i_itemp; mp = ip->i_mount; if (iip) { if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - AIL_LOCK(mp, s); + spin_lock(&mp->m_ail_lock); if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { /* * xfs_trans_delete_ail() drops the AIL lock. */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip, - s); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip); } else - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); } iip->ili_logged = 0; /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 5db6cd1..4051307 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -25,52 +25,54 @@ * must be added on to the end. */ typedef struct xfs_inode_log_format { - unsigned short ilf_type; /* inode log item type */ - unsigned short ilf_size; /* size of this item */ - uint ilf_fields; /* flags for fields logged */ - ushort ilf_asize; /* size of attr d/ext/root */ - ushort ilf_dsize; /* size of data/ext/root */ - xfs_ino_t ilf_ino; /* inode number */ + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ union { - xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/ + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; __int64_t ilf_blkno; /* blkno of inode buffer */ - int ilf_len; /* len of inode buffer */ - int ilf_boffset; /* off of inode in buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; +#ifndef HAVE_FORMAT32 typedef struct xfs_inode_log_format_32 { - unsigned short ilf_type; /* 16: inode log item type */ - unsigned short ilf_size; /* 16: size of this item */ - uint ilf_fields; /* 32: flags for fields logged */ - ushort ilf_asize; /* 32: size of attr d/ext/root */ - ushort ilf_dsize; /* 32: size of data/ext/root */ - xfs_ino_t ilf_ino; /* 64: inode number */ + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ union { - xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ - uuid_t ilfu_uuid; /* 128: mount point value */ + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* 64: blkno of inode buffer */ - int ilf_len; /* 32: len of inode buffer */ - int ilf_boffset; /* 32: off of inode in buffer */ + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ } __attribute__((packed)) xfs_inode_log_format_32_t; +#endif typedef struct xfs_inode_log_format_64 { - unsigned short ilf_type; /* 16: inode log item type */ - unsigned short ilf_size; /* 16: size of this item */ - uint ilf_fields; /* 32: flags for fields logged */ - ushort ilf_asize; /* 32: size of attr d/ext/root */ - ushort ilf_dsize; /* 32: size of data/ext/root */ - __uint32_t ilf_pad; /* 32: pad for 64 bit boundary */ - xfs_ino_t ilf_ino; /* 64: inode number */ + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ union { - xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ - uuid_t ilfu_uuid; /* 128: mount point value */ + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* 64: blkno of inode buffer */ - int ilf_len; /* 32: len of inode buffer */ - int ilf_boffset; /* 32: off of inode in buffer */ + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_64_t; /* @@ -166,6 +168,14 @@ static inline int xfs_ilog_fext(int w) return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); } +static inline int xfs_inode_clean(xfs_inode_t *ip) +{ + return (!ip->i_itemp || + !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && + !ip->i_update_core; +} + + #ifdef __KERNEL__ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c deleted file mode 100644 index 06d710c..0000000 --- a/fs/xfs/xfs_iocore.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dfrag.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" -#include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_quota.h" -#include "xfs_trans_space.h" -#include "xfs_iomap.h" - - -STATIC xfs_fsize_t -xfs_size_fn( - xfs_inode_t *ip) -{ - return (ip->i_d.di_size); -} - -STATIC int -xfs_ioinit( - struct bhv_vfs *vfsp, - struct xfs_mount_args *mntargs, - int flags) -{ - return xfs_mountfs(vfsp, XFS_VFSTOM(vfsp), flags); -} - -xfs_ioops_t xfs_iocore_xfs = { - .xfs_ioinit = (xfs_ioinit_t) xfs_ioinit, - .xfs_bmapi_func = (xfs_bmapi_t) xfs_bmapi, - .xfs_bunmapi_func = (xfs_bunmapi_t) xfs_bunmapi, - .xfs_bmap_eof_func = (xfs_bmap_eof_t) xfs_bmap_eof, - .xfs_iomap_write_direct = - (xfs_iomap_write_direct_t) xfs_iomap_write_direct, - .xfs_iomap_write_delay = - (xfs_iomap_write_delay_t) xfs_iomap_write_delay, - .xfs_iomap_write_allocate = - (xfs_iomap_write_allocate_t) xfs_iomap_write_allocate, - .xfs_iomap_write_unwritten = - (xfs_iomap_write_unwritten_t) xfs_iomap_write_unwritten, - .xfs_ilock = (xfs_lock_t) xfs_ilock, - .xfs_lck_map_shared = (xfs_lck_map_shared_t) xfs_ilock_map_shared, - .xfs_ilock_demote = (xfs_lock_demote_t) xfs_ilock_demote, - .xfs_ilock_nowait = (xfs_lock_nowait_t) xfs_ilock_nowait, - .xfs_unlock = (xfs_unlk_t) xfs_iunlock, - .xfs_size_func = (xfs_size_t) xfs_size_fn, - .xfs_iodone = (xfs_iodone_t) fs_noerr, - .xfs_swap_extents_func = (xfs_swap_extents_t) xfs_swap_extents, -}; - -void -xfs_iocore_inode_reinit( - xfs_inode_t *ip) -{ - xfs_iocore_t *io = &ip->i_iocore; - - io->io_flags = 0; - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) - io->io_flags |= XFS_IOCORE_RT; - io->io_dmevmask = ip->i_d.di_dmevmask; - io->io_dmstate = ip->i_d.di_dmstate; -} - -void -xfs_iocore_inode_init( - xfs_inode_t *ip) -{ - xfs_iocore_t *io = &ip->i_iocore; - xfs_mount_t *mp = ip->i_mount; - - io->io_mount = mp; -#ifdef DEBUG - io->io_lock = &ip->i_lock; - io->io_iolock = &ip->i_iolock; -#endif - - io->io_obj = (void *)ip; - - xfs_iocore_inode_reinit(ip); -} diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f1949c1..67f22b2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -55,12 +53,10 @@ void xfs_iomap_enter_trace( int tag, - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_off_t offset, ssize_t count) { - xfs_inode_t *ip = XFS_IO_INODE(io); - if (!ip->i_rwtrace) return; @@ -72,8 +68,8 @@ xfs_iomap_enter_trace( (void *)((unsigned long)((offset >> 32) & 0xffffffff)), (void *)((unsigned long)(offset & 0xffffffff)), (void *)((unsigned long)count), - (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(io->io_new_size & 0xffffffff)), + (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_new_size & 0xffffffff)), (void *)((unsigned long)current_pid()), (void *)NULL, (void *)NULL, @@ -86,15 +82,13 @@ xfs_iomap_enter_trace( void xfs_iomap_map_trace( int tag, - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_off_t offset, ssize_t count, xfs_iomap_t *iomapp, xfs_bmbt_irec_t *imapp, int flags) { - xfs_inode_t *ip = XFS_IO_INODE(io); - if (!ip->i_rwtrace) return; @@ -128,7 +122,7 @@ xfs_iomap_map_trace( STATIC int xfs_imap_to_bmap( - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_off_t offset, xfs_bmbt_irec_t *imap, xfs_iomap_t *iomapp, @@ -136,15 +130,10 @@ xfs_imap_to_bmap( int iomaps, /* Number of iomap entries */ int flags) { - xfs_mount_t *mp; - xfs_fsize_t nisize; + xfs_mount_t *mp = ip->i_mount; int pbm; xfs_fsblock_t start_block; - mp = io->io_mount; - nisize = XFS_SIZE(mp, io); - if (io->io_new_size > nisize) - nisize = io->io_new_size; for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); @@ -152,7 +141,7 @@ xfs_imap_to_bmap( iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); iomapp->iomap_flags = flags; - if (io->io_flags & XFS_IOCORE_RT) { + if (XFS_IS_REALTIME_INODE(ip)) { iomapp->iomap_flags |= IOMAP_REALTIME; iomapp->iomap_target = mp->m_rtdev_targp; } else { @@ -166,15 +155,11 @@ xfs_imap_to_bmap( iomapp->iomap_bn = IOMAP_DADDR_NULL; iomapp->iomap_flags |= IOMAP_DELAY; } else { - iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block); + iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block); if (ISUNWRITTEN(imap)) iomapp->iomap_flags |= IOMAP_UNWRITTEN; } - if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) { - iomapp->iomap_flags |= IOMAP_EOF; - } - offset += iomapp->iomap_bsize - iomapp->iomap_delta; } return pbm; /* Return the number filled */ @@ -182,14 +167,14 @@ xfs_imap_to_bmap( int xfs_iomap( - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_off_t offset, ssize_t count, int flags, xfs_iomap_t *iomapp, int *niomaps) { - xfs_mount_t *mp = io->io_mount; + xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb, end_fsb; int error = 0; int lockmode = 0; @@ -198,45 +183,37 @@ xfs_iomap( int bmapi_flags = 0; int iomap_flags = 0; + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - switch (flags & - (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE | - BMAPI_UNWRITTEN | BMAPI_DEVICE)) { + switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { case BMAPI_READ: - xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count); - lockmode = XFS_LCK_MAP_SHARED(mp, io); + xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, ip, offset, count); + lockmode = xfs_ilock_map_shared(ip); bmapi_flags = XFS_BMAPI_ENTIRE; break; case BMAPI_WRITE: - xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count); - lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR; + xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count); + lockmode = XFS_ILOCK_EXCL; if (flags & BMAPI_IGNSTATE) bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; - XFS_ILOCK(mp, io, lockmode); + xfs_ilock(ip, lockmode); break; case BMAPI_ALLOCATE: - xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, io, offset, count); - lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD; + xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count); + lockmode = XFS_ILOCK_SHARED; bmapi_flags = XFS_BMAPI_ENTIRE; + /* Attempt non-blocking lock */ if (flags & BMAPI_TRYLOCK) { - if (!XFS_ILOCK_NOWAIT(mp, io, lockmode)) + if (!xfs_ilock_nowait(ip, lockmode)) return XFS_ERROR(EAGAIN); } else { - XFS_ILOCK(mp, io, lockmode); + xfs_ilock(ip, lockmode); } break; - case BMAPI_UNWRITTEN: - goto phase2; - case BMAPI_DEVICE: - lockmode = XFS_LCK_MAP_SHARED(mp, io); - iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ? - mp->m_rtdev_targp : mp->m_ddev_targp; - error = 0; - *niomaps = 1; - goto out; default: BUG(); } @@ -247,7 +224,7 @@ xfs_iomap( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = XFS_BMAPI(mp, NULL, io, offset_fsb, + error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(end_fsb - offset_fsb), bmapi_flags, NULL, 0, &imap, &nimaps, NULL, NULL); @@ -255,54 +232,48 @@ xfs_iomap( if (error) goto out; -phase2: - switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) { + switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { case BMAPI_WRITE: /* If we found an extent, return it */ if (nimaps && (imap.br_startblock != HOLESTARTBLOCK) && (imap.br_startblock != DELAYSTARTBLOCK)) { - xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, + xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, offset, count, iomapp, &imap, flags); break; } if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { - error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset, - count, flags, &imap, &nimaps, nimaps); + error = xfs_iomap_write_direct(ip, offset, count, flags, + &imap, &nimaps, nimaps); } else { - error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, - flags, &imap, &nimaps); + error = xfs_iomap_write_delay(ip, offset, count, flags, + &imap, &nimaps); } if (!error) { - xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, io, + xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, ip, offset, count, iomapp, &imap, flags); } iomap_flags = IOMAP_NEW; break; case BMAPI_ALLOCATE: /* If we found an extent, return it */ - XFS_IUNLOCK(mp, io, lockmode); + xfs_iunlock(ip, lockmode); lockmode = 0; if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) { - xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, + xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, offset, count, iomapp, &imap, flags); break; } - error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, offset, count, + error = xfs_iomap_write_allocate(ip, offset, count, &imap, &nimaps); break; - case BMAPI_UNWRITTEN: - lockmode = 0; - error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count); - nimaps = 0; - break; } if (nimaps) { - *niomaps = xfs_imap_to_bmap(io, offset, &imap, + *niomaps = xfs_imap_to_bmap(ip, offset, &imap, iomapp, nimaps, *niomaps, iomap_flags); } else if (niomaps) { *niomaps = 0; @@ -310,14 +281,15 @@ phase2: out: if (lockmode) - XFS_IUNLOCK(mp, io, lockmode); + xfs_iunlock(ip, lockmode); return XFS_ERROR(error); } + STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_fsize_t isize, xfs_extlen_t extsize, xfs_fileoff_t *last_fsb) @@ -326,7 +298,7 @@ xfs_iomap_eof_align_last_fsb( xfs_extlen_t align; int eof, error; - if (io->io_flags & XFS_IOCORE_RT) + if (XFS_IS_REALTIME_INODE(ip)) ; /* * If mounted with the "-o swalloc" option, roundup the allocation @@ -357,7 +329,7 @@ xfs_iomap_eof_align_last_fsb( } if (new_last_fsb) { - error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof); + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); if (error) return error; if (eof) @@ -398,6 +370,23 @@ xfs_flush_space( return 1; } +STATIC int +xfs_cmn_err_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return EFSCORRUPTED; +} + int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -409,7 +398,6 @@ xfs_iomap_write_direct( int found) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; xfs_filblks_t count_fsb, resaligned; @@ -436,21 +424,16 @@ xfs_iomap_write_direct( return XFS_ERROR(error); rt = XFS_IS_REALTIME_INODE(ip); - if (unlikely(rt)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } + extsz = xfs_get_extsz_hint(ip); - isize = ip->i_d.di_size; - if (io->io_new_size > isize) - isize = io->io_new_size; + isize = ip->i_size; + if (ip->i_new_size > isize) + isize = ip->i_new_size; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); if ((offset + count) > isize) { - error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, + error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, &last_fsb); if (error) goto error_out; @@ -474,13 +457,13 @@ xfs_iomap_write_direct( if (unlikely(rt)) { resrtextents = qblocks = resaligned; resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; - } else { - resrtextents = 0; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - quota_flag = XFS_QMOPT_RES_REGBLKS; - } + quota_flag = XFS_QMOPT_RES_REGBLKS; + } /* * Allocate and setup the transaction @@ -509,7 +492,7 @@ xfs_iomap_write_direct( xfs_trans_ihold(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; - if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz)) + if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -517,7 +500,7 @@ xfs_iomap_write_direct( */ XFS_BMAP_INIT(&free_list, &firstfsb); nimaps = 1; - error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, bmapi_flag, + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, &imap, &nimaps, &free_list, NULL); if (error) goto error0; @@ -525,10 +508,10 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto error0; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) goto error_out; @@ -536,23 +519,17 @@ xfs_iomap_write_direct( * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = (ENOSPC); + error = ENOSPC; + goto error_out; + } + + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) { + error = xfs_cmn_err_fsblock_zero(ip, &imap); goto error_out; } *ret_imap = imap; *nmaps = 1; - if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)ret_imap->br_startblock, - (unsigned long long)ret_imap->br_startoff, - (unsigned long long)ret_imap->br_blockcount, - ret_imap->br_state); - } return 0; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ @@ -581,7 +558,7 @@ error_out: STATIC int xfs_iomap_eof_want_preallocate( xfs_mount_t *mp, - xfs_iocore_t *io, + xfs_inode_t *ip, xfs_fsize_t isize, xfs_off_t offset, size_t count, @@ -608,7 +585,7 @@ xfs_iomap_eof_want_preallocate( while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; - error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, 0, + error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, &firstblock, 0, imap, &imaps, NULL, NULL); if (error) return error; @@ -634,7 +611,6 @@ xfs_iomap_write_delay( int *nmaps) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; xfs_off_t aligned_offset; @@ -647,7 +623,7 @@ xfs_iomap_write_delay( int prealloc, fsynced = 0; int error; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Make sure that the dquots are there. This doesn't hold @@ -657,21 +633,15 @@ xfs_iomap_write_delay( if (error) return XFS_ERROR(error); - if (XFS_IS_REALTIME_INODE(ip)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - + extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); retry: - isize = ip->i_d.di_size; - if (io->io_new_size > isize) - isize = io->io_new_size; + isize = ip->i_size; + if (ip->i_new_size > isize) + isize = ip->i_new_size; - error = xfs_iomap_eof_want_preallocate(mp, io, isize, offset, count, + error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count, ioflag, imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; @@ -685,7 +655,7 @@ retry: } if (prealloc || extsz) { - error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, + error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, &last_fsb); if (error) return error; @@ -693,7 +663,7 @@ retry: nimaps = XFS_WRITE_IMAPS; firstblock = NULLFSBLOCK; - error = XFS_BMAPI(mp, NULL, io, offset_fsb, + error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(last_fsb - offset_fsb), XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, @@ -707,7 +677,7 @@ retry: */ if (nimaps == 0) { xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, - io, offset, count); + ip, offset, count); if (xfs_flush_space(ip, &fsynced, &ioflag)) return XFS_ERROR(ENOSPC); @@ -715,17 +685,8 @@ retry: goto retry; } - if (!(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)ret_imap->br_startblock, - (unsigned long long)ret_imap->br_startoff, - (unsigned long long)ret_imap->br_blockcount, - ret_imap->br_state); - } + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; *nmaps = 1; @@ -739,6 +700,9 @@ retry: * the originating callers request. * * Called without a lock on the inode. + * + * We no longer bother to look at the incoming map - all we have to + * guarantee is that whatever we allocate fills the required range. */ int xfs_iomap_write_allocate( @@ -749,15 +713,14 @@ xfs_iomap_write_allocate( int *retmap) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb, last_block; xfs_fileoff_t end_fsb, map_start_fsb; xfs_fsblock_t first_block; xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; - xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS]; + xfs_bmbt_irec_t imap; xfs_trans_t *tp; - int i, nimaps, committed; + int nimaps, committed; int error = 0; int nres; @@ -788,18 +751,12 @@ xfs_iomap_write_allocate( nimaps = 0; while (nimaps == 0) { tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_reserve(tp, nres, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); - if (error == ENOSPC) { - error = xfs_trans_reserve(tp, 0, - XFS_WRITE_LOG_RES(mp), - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - } if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); @@ -810,16 +767,44 @@ xfs_iomap_write_allocate( XFS_BMAP_INIT(&free_list, &first_block); - nimaps = XFS_STRAT_WRITE_IMAPS; /* - * Ensure we don't go beyond eof - it is possible - * the extents changed since we did the read call, - * we dropped the ilock in the interim. + * it is possible that the extents have changed since + * we did the read call as we dropped the ilock for a + * while. We have to be careful about truncates or hole + * punchs here - we are not allowed to allocate + * non-delalloc blocks here. + * + * The only protection against truncation is the pages + * for the range we are being asked to convert are + * locked and hence a truncate will block on them + * first. + * + * As a result, if we go beyond the range we really + * need and hit an delalloc extent boundary followed by + * a hole while we have excess blocks in the map, we + * will fill the hole incorrectly and overrun the + * transaction reservation. + * + * Using a single map prevents this as we are forced to + * check each map we look for overlap with the desired + * range and abort as soon as we find it. Also, given + * that we only return a single map, having one beyond + * what we can return is probably a bit silly. + * + * We also need to check that we don't go beyond EOF; + * this is a truncate optimisation as a truncate sets + * the new file size before block on the pages we + * currently have locked under writeback. Because they + * are about to be tossed, we don't need to write them + * back.... */ + nimaps = 1; + end_fsb = XFS_B_TO_FSB(mp, ip->i_size); + error = xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + if (error) + goto trans_cancel; - end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size); - xfs_bmap_last_offset(NULL, ip, &last_block, - XFS_DATA_FORK); last_block = XFS_FILEOFF_MAX(last_block, end_fsb); if ((map_start_fsb + count_fsb) > last_block) { count_fsb = last_block - map_start_fsb; @@ -830,19 +815,17 @@ xfs_iomap_write_allocate( } /* Go get the actual blocks */ - error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb, + error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list, NULL); + &imap, &nimaps, &free_list, NULL); if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, - first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto trans_cancel; - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) goto error0; @@ -853,41 +836,24 @@ xfs_iomap_write_allocate( * See if we were able to allocate an extent that * covers at least part of the callers request */ - - for (i = 0; i < nimaps; i++) { - if (!(io->io_flags & XFS_IOCORE_RT) && - !imap[i].br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: " - "fs <%s> inode: %lld " - "start_block : %llx start_off : %llx " - "blkcnt : %llx extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long) - imap[i].br_startblock, - (unsigned long long) - imap[i].br_startoff, - (unsigned long long) - imap[i].br_blockcount, - imap[i].br_state); - } - if ((offset_fsb >= imap[i].br_startoff) && - (offset_fsb < (imap[i].br_startoff + - imap[i].br_blockcount))) { - *map = imap[i]; - *retmap = 1; - XFS_STATS_INC(xs_xstrat_quick); - return 0; - } - count_fsb -= imap[i].br_blockcount; + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_cmn_err_fsblock_zero(ip, &imap); + + if ((offset_fsb >= imap.br_startoff) && + (offset_fsb < (imap.br_startoff + + imap.br_blockcount))) { + *map = imap; + *retmap = 1; + XFS_STATS_INC(xs_xstrat_quick); + return 0; } - /* So far we have not mapped the requested part of the + /* + * So far we have not mapped the requested part of the * file, just surrounding data, try again. */ - nimaps--; - map_start_fsb = imap[nimaps].br_startoff + - imap[nimaps].br_blockcount; + count_fsb -= imap.br_blockcount; + map_start_fsb = imap.br_startoff + imap.br_blockcount; } trans_cancel: @@ -905,7 +871,6 @@ xfs_iomap_write_unwritten( size_t count) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_filblks_t count_fsb; xfs_filblks_t numblks_fsb; @@ -918,13 +883,22 @@ xfs_iomap_write_unwritten( int committed; int error; - xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, - &ip->i_iocore, offset, count); + xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, ip, offset, count); offset_fsb = XFS_B_TO_FSBT(mp, offset); count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + /* + * Reserve enough blocks in this transaction for two complete extent + * btree splits. We may be converting the middle part of an unwritten + * extent and in this case we will insert two new extents in the btree + * each of which could cause a full split. + * + * This reservation amount will be used in the first call to + * xfs_bmbt_split() to select an AG with enough space to satisfy the + * rest of the operation. + */ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; do { @@ -933,15 +907,15 @@ xfs_iomap_write_unwritten( * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); - goto error0; + return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -953,33 +927,23 @@ xfs_iomap_write_unwritten( */ XFS_BMAP_INIT(&free_list, &firstfsb); nimaps = 1; - error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list, NULL); if (error) goto error_on_bmapi_transaction; - error = xfs_bmap_finish(&(tp), &(free_list), - firstfsb, &committed); + error = xfs_bmap_finish(&(tp), &(free_list), &committed); if (error) goto error_on_bmapi_transaction; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) - goto error0; - - if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> " - "inode: %lld start_block : %llx start_off : " - "%llx blkcnt : %llx extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)imap.br_startblock, - (unsigned long long)imap.br_startoff, - (unsigned long long)imap.br_blockcount, - imap.br_state); - } + return XFS_ERROR(error); + + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_cmn_err_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { /* @@ -999,6 +963,5 @@ error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: return XFS_ERROR(error); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 3ce204a..ee1a0c1 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -22,7 +22,7 @@ typedef enum { /* iomap_flags values */ - IOMAP_EOF = 0x01, /* mapping contains EOF */ + IOMAP_READ = 0, /* mapping for a read */ IOMAP_HOLE = 0x02, /* mapping covers a hole */ IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ IOMAP_REALTIME = 0x10, /* mapping on the realtime device */ @@ -36,14 +36,12 @@ typedef enum { BMAPI_READ = (1 << 0), /* read extents */ BMAPI_WRITE = (1 << 1), /* create extents */ BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */ - BMAPI_UNWRITTEN = (1 << 3), /* unwritten extents to real extents */ /* modifiers */ BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */ BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */ - BMAPI_DEVICE = (1 << 9), /* we only want to know the device */ } bmapi_flags_t; @@ -73,11 +71,10 @@ typedef struct xfs_iomap { iomap_flags_t iomap_flags; } xfs_iomap_t; -struct xfs_iocore; struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap(struct xfs_iocore *, xfs_off_t, ssize_t, int, +extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, struct xfs_iomap *, int *); extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, int, struct xfs_bmbt_irec *, int *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 46249e4..cf6754a 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -39,6 +39,16 @@ #include "xfs_error.h" #include "xfs_btree.h" +int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); +} + STATIC int xfs_bulkstat_one_iget( xfs_mount_t *mp, /* mount point for filesystem */ @@ -47,12 +57,12 @@ xfs_bulkstat_one_iget( xfs_bstat_t *buf, /* return buffer */ int *stat) /* BULKSTAT_RV_... */ { - xfs_dinode_core_t *dic; /* dinode core info pointer */ + xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ - bhv_vnode_t *vp; int error; - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); + error = xfs_iget(mp, NULL, ino, + XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno); if (error) { *stat = BULKSTAT_RV_NOTHING; return error; @@ -60,13 +70,7 @@ xfs_bulkstat_one_iget( ASSERT(ip != NULL); ASSERT(ip->i_blkno != (xfs_daddr_t)0); - if (ip->i_d.di_mode == 0) { - *stat = BULKSTAT_RV_NOTHING; - error = XFS_ERROR(ENOENT); - goto out_iput; - } - vp = XFS_ITOV(ip); dic = &ip->i_d; /* xfs_iget returns the following without needing @@ -79,7 +83,7 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - vn_atime_to_bstime(vp, &buf->bs_atime); + vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; @@ -113,12 +117,11 @@ xfs_bulkstat_one_iget( break; } - out_iput: xfs_iput(ip, XFS_ILOCK_SHARED); return error; } -STATIC int +STATIC void xfs_bulkstat_one_dinode( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ @@ -140,37 +143,37 @@ xfs_bulkstat_one_dinode( * the new format. We don't change the version number so that we * can distinguish this from a real new format inode. */ - if (INT_GET(dic->di_version, ARCH_CONVERT) == XFS_DINODE_VERSION_1) { - buf->bs_nlink = INT_GET(dic->di_onlink, ARCH_CONVERT); + if (dic->di_version == XFS_DINODE_VERSION_1) { + buf->bs_nlink = be16_to_cpu(dic->di_onlink); buf->bs_projid = 0; } else { - buf->bs_nlink = INT_GET(dic->di_nlink, ARCH_CONVERT); - buf->bs_projid = INT_GET(dic->di_projid, ARCH_CONVERT); + buf->bs_nlink = be32_to_cpu(dic->di_nlink); + buf->bs_projid = be16_to_cpu(dic->di_projid); } buf->bs_ino = ino; - buf->bs_mode = INT_GET(dic->di_mode, ARCH_CONVERT); - buf->bs_uid = INT_GET(dic->di_uid, ARCH_CONVERT); - buf->bs_gid = INT_GET(dic->di_gid, ARCH_CONVERT); - buf->bs_size = INT_GET(dic->di_size, ARCH_CONVERT); - buf->bs_atime.tv_sec = INT_GET(dic->di_atime.t_sec, ARCH_CONVERT); - buf->bs_atime.tv_nsec = INT_GET(dic->di_atime.t_nsec, ARCH_CONVERT); - buf->bs_mtime.tv_sec = INT_GET(dic->di_mtime.t_sec, ARCH_CONVERT); - buf->bs_mtime.tv_nsec = INT_GET(dic->di_mtime.t_nsec, ARCH_CONVERT); - buf->bs_ctime.tv_sec = INT_GET(dic->di_ctime.t_sec, ARCH_CONVERT); - buf->bs_ctime.tv_nsec = INT_GET(dic->di_ctime.t_nsec, ARCH_CONVERT); - buf->bs_xflags = xfs_dic2xflags(dic); - buf->bs_extsize = INT_GET(dic->di_extsize, ARCH_CONVERT) << mp->m_sb.sb_blocklog; - buf->bs_extents = INT_GET(dic->di_nextents, ARCH_CONVERT); - buf->bs_gen = INT_GET(dic->di_gen, ARCH_CONVERT); + buf->bs_mode = be16_to_cpu(dic->di_mode); + buf->bs_uid = be32_to_cpu(dic->di_uid); + buf->bs_gid = be32_to_cpu(dic->di_gid); + buf->bs_size = be64_to_cpu(dic->di_size); + buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec); + buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec); + buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec); + buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); + buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); + buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); + buf->bs_xflags = xfs_dic2xflags(dip); + buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; + buf->bs_extents = be32_to_cpu(dic->di_nextents); + buf->bs_gen = be32_to_cpu(dic->di_gen); memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); - buf->bs_dmevmask = INT_GET(dic->di_dmevmask, ARCH_CONVERT); - buf->bs_dmstate = INT_GET(dic->di_dmstate, ARCH_CONVERT); - buf->bs_aextents = INT_GET(dic->di_anextents, ARCH_CONVERT); + buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); + buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); + buf->bs_aextents = be16_to_cpu(dic->di_anextents); - switch (INT_GET(dic->di_format, ARCH_CONVERT)) { + switch (dic->di_format) { case XFS_DINODE_FMT_DEV: - buf->bs_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); + buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev); buf->bs_blksize = BLKDEV_IOSIZE; buf->bs_blocks = 0; break; @@ -184,11 +187,19 @@ xfs_bulkstat_one_dinode( case XFS_DINODE_FMT_BTREE: buf->bs_rdev = 0; buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = INT_GET(dic->di_nblocks, ARCH_CONVERT); + buf->bs_blocks = be64_to_cpu(dic->di_nblocks); break; } +} - return 0; +STATIC int +xfs_bulkstat_one_fmt( + void __user *ubuffer, + const xfs_bstat_t *buffer) +{ + if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) + return -EFAULT; + return sizeof(*buffer); } /* @@ -210,19 +221,15 @@ xfs_bulkstat_one( xfs_bstat_t *buf; /* return buffer */ int error = 0; /* error value */ xfs_dinode_t *dip; /* dinode inode pointer */ + bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt; dip = (xfs_dinode_t *)dibuff; + *stat = BULKSTAT_RV_NOTHING; - if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && - (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) { - *stat = BULKSTAT_RV_NOTHING; + if (!buffer || xfs_internal_inum(mp, ino)) return XFS_ERROR(EINVAL); - } - if (ubsize < sizeof(*buf)) { - *stat = BULKSTAT_RV_NOTHING; + if (ubsize < sizeof(*buf)) return XFS_ERROR(ENOMEM); - } buf = kmem_alloc(sizeof(*buf), KM_SLEEP); @@ -237,22 +244,71 @@ xfs_bulkstat_one( xfs_bulkstat_one_dinode(mp, ino, dip, buf); } - if (copy_to_user(buffer, buf, sizeof(*buf))) { - *stat = BULKSTAT_RV_NOTHING; - error = EFAULT; + error = formatter(buffer, buf); + if (error < 0) { + error = EFAULT; goto out_free; } *stat = BULKSTAT_RV_DIDONE; if (ubused) - *ubused = sizeof(*buf); + *ubused = error; out_free: - kmem_free(buf, sizeof(*buf)); + kmem_free(buf); return error; } /* + * Test to see whether we can use the ondisk inode directly, based + * on the given bulkstat flags, filling in dipp accordingly. + * Returns zero if the inode is dodgey. + */ +STATIC int +xfs_bulkstat_use_dinode( + xfs_mount_t *mp, + int flags, + xfs_buf_t *bp, + int clustidx, + xfs_dinode_t **dipp) +{ + xfs_dinode_t *dip; + unsigned int aformat; + + *dipp = NULL; + if (!bp || (flags & BULKSTAT_FG_IGET)) + return 1; + dip = (xfs_dinode_t *) + xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog); + /* + * Check the buffer containing the on-disk inode for di_mode == 0. + * This is to prevent xfs_bulkstat from picking up just reclaimed + * inodes that have their in-core state initialized but not flushed + * to disk yet. This is a temporary hack that would require a proper + * fix in the future. + */ + if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || + !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) || + !dip->di_core.di_mode) + return 0; + if (flags & BULKSTAT_FG_QUICK) { + *dipp = dip; + return 1; + } + /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ + aformat = dip->di_core.di_aformat; + if ((XFS_DFORK_Q(dip) == 0) || + (aformat == XFS_DINODE_FMT_LOCAL) || + (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) { + *dipp = dip; + return 1; + } + return 1; +} + +#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) + +/* * Return stat information in bulk (by-inode) for the filesystem. */ int /* error status */ @@ -284,11 +340,12 @@ xfs_bulkstat( xfs_agino_t gino; /* current btree rec's start inode */ int i; /* loop index */ int icount; /* count of inodes good in irbuf */ + size_t irbsize; /* size of irec buffer in bytes */ xfs_ino_t ino; /* inode number (filesystem) */ - xfs_inobt_rec_t *irbp; /* current irec buffer pointer */ - xfs_inobt_rec_t *irbuf; /* start of irec buffer */ - xfs_inobt_rec_t *irbufend; /* end of good irec buffer entries */ - xfs_ino_t lastino=0; /* last inode number returned */ + xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ + xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ + xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ + xfs_ino_t lastino; /* last inode number returned */ int nbcluster; /* # of blocks in a cluster */ int nicluster; /* # of inodes in a cluster */ int nimask; /* mask for inode clusters */ @@ -308,6 +365,7 @@ xfs_bulkstat( * Get the last inode value, see if there's nothing to do. */ ino = (xfs_ino_t)*lastinop; + lastino = ino; dip = NULL; agno = XFS_INO_TO_AGNO(mp, ino); agino = XFS_INO_TO_AGINO(mp, ino); @@ -317,6 +375,9 @@ xfs_bulkstat( *ubcountp = 0; return 0; } + if (!ubcountp || *ubcountp <= 0) { + return EINVAL; + } ubcount = *ubcountp; /* statstruct's */ ubleft = ubcount * statstruct_size; /* bytes */ *ubcountp = ubelem = 0; @@ -328,19 +389,17 @@ xfs_bulkstat( (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); nimask = ~(nicluster - 1); nbcluster = nicluster >> mp->m_sb.sb_inopblog; - /* - * Allocate a page-sized buffer for inode btree records. - * We could try allocating something smaller, but for normal - * calls we'll always (potentially) need the whole page. - */ - irbuf = kmem_alloc(NBPC, KM_SLEEP); - nirbuf = NBPC / sizeof(*irbuf); + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, + KM_SLEEP | KM_MAYFAIL | KM_LARGE); + nirbuf = irbsize / sizeof(*irbuf); + /* * Loop over the allocation groups, starting from the last * inode returned; 0 means start of the allocation group. */ rval = 0; - while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) { + while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { + cond_resched(); bp = NULL; down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); @@ -358,7 +417,7 @@ xfs_bulkstat( * Allocate and initialize a btree cursor for ialloc btree. */ cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); + (xfs_inode_t *)0, 0); irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; @@ -395,9 +454,9 @@ xfs_bulkstat( gcnt++; } gfree |= XFS_INOBT_MASKN(0, chunkidx); - INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); - INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); - INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + irbp->ir_startino = gino; + irbp->ir_freecount = gcnt; + irbp->ir_free = gfree; irbp++; agino = gino + XFS_INODES_PER_CHUNK; icount = XFS_INODES_PER_CHUNK - gcnt; @@ -437,6 +496,7 @@ xfs_bulkstat( break; error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); + cond_resched(); } /* * If ran off the end of the ag either with an error, @@ -451,11 +511,27 @@ xfs_bulkstat( } /* * If this chunk has any allocated inodes, save it. + * Also start read-ahead now for this chunk. */ if (gcnt < XFS_INODES_PER_CHUNK) { - INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); - INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); - INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + /* + * Loop over all clusters in the next chunk. + * Do a readahead if there are any allocated + * inodes in that cluster. + */ + for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), + chunkidx = 0; + chunkidx < XFS_INODES_PER_CHUNK; + chunkidx += nicluster, + agbno += nbcluster) { + if (XFS_INOBT_MASKN(chunkidx, + nicluster) & ~gfree) + xfs_btree_reada_bufs(mp, agno, + agbno, nbcluster); + } + irbp->ir_startino = gino; + irbp->ir_freecount = gcnt; + irbp->ir_free = gfree; irbp++; icount += XFS_INODES_PER_CHUNK - gcnt; } @@ -464,6 +540,7 @@ xfs_bulkstat( */ agino = gino + XFS_INODES_PER_CHUNK; error = xfs_inobt_increment(cur, 0, &tmp); + cond_resched(); } /* * Drop the btree buffers and the agi buffer. @@ -477,35 +554,13 @@ xfs_bulkstat( */ irbufend = irbp; for (irbp = irbuf; - irbp < irbufend && ubleft >= statstruct_size; irbp++) { - /* - * Read-ahead the next chunk's worth of inodes. - */ - if (&irbp[1] < irbufend) { - /* - * Loop over all clusters in the next chunk. - * Do a readahead if there are any allocated - * inodes in that cluster. - */ - for (agbno = XFS_AGINO_TO_AGBNO(mp, - INT_GET(irbp[1].ir_startino, ARCH_CONVERT)), - chunkidx = 0; - chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (XFS_INOBT_MASKN(chunkidx, - nicluster) & - ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT))) - xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); - } - } + irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { /* * Now process this chunk of inodes. */ - for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0; - ubleft > 0 && - INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK; + for (agino = irbp->ir_startino, chunkidx = clustidx = 0; + XFS_BULKSTAT_UBLEFT(ubleft) && + irbp->ir_freecount < XFS_INODES_PER_CHUNK; chunkidx++, clustidx++, agino++) { ASSERT(chunkidx < XFS_INODES_PER_CHUNK); /* @@ -525,11 +580,12 @@ xfs_bulkstat( */ if ((chunkidx & (nicluster - 1)) == 0) { agbno = XFS_AGINO_TO_AGBNO(mp, - INT_GET(irbp->ir_startino, ARCH_CONVERT)) + + irbp->ir_startino) + ((chunkidx & nimask) >> mp->m_sb.sb_inopblog); - if (flags & BULKSTAT_FG_QUICK) { + if (flags & (BULKSTAT_FG_QUICK | + BULKSTAT_FG_INLINE)) { ino = XFS_AGINO_TO_INO(mp, agno, agino); bno = XFS_AGB_TO_DADDR(mp, agno, @@ -543,11 +599,13 @@ xfs_bulkstat( KM_SLEEP); ip->i_ino = ino; ip->i_mount = mp; + spin_lock_init(&ip->i_flags_lock); if (bp) xfs_buf_relse(bp); error = xfs_itobp(mp, NULL, ip, &dip, &bp, bno, - XFS_IMAP_BULKSTAT); + XFS_IMAP_BULKSTAT, + XFS_BUF_LOCK); if (!error) clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; kmem_zone_free(xfs_inode_zone, ip); @@ -561,33 +619,41 @@ xfs_bulkstat( } } } + ino = XFS_AGINO_TO_INO(mp, agno, agino); + bno = XFS_AGB_TO_DADDR(mp, agno, agbno); /* * Skip if this inode is free. */ - if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT)) + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { + lastino = ino; continue; + } /* * Count used inodes as free so we can tell * when the chunk is used up. */ - INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1); - ino = XFS_AGINO_TO_INO(mp, agno, agino); - bno = XFS_AGB_TO_DADDR(mp, agno, agbno); - if (flags & BULKSTAT_FG_QUICK) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (clustidx << mp->m_sb.sb_inodelog)); - - if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) - != XFS_DINODE_MAGIC - || !XFS_DINODE_GOOD_VERSION( - INT_GET(dip->di_core.di_version, ARCH_CONVERT))) - continue; + irbp->ir_freecount++; + if (!xfs_bulkstat_use_dinode(mp, flags, bp, + clustidx, &dip)) { + lastino = ino; + continue; + } + /* + * If we need to do an iget, cannot hold bp. + * Drop it, until starting the next cluster. + */ + if ((flags & BULKSTAT_FG_INLINE) && !dip) { + if (bp) + xfs_buf_relse(bp); + bp = NULL; } /* * Get the inode and fill in a single buffer. * BULKSTAT_FG_QUICK uses dip to fill it in. * BULKSTAT_FG_IGET uses igets. + * BULKSTAT_FG_INLINE uses dip if we have an + * inline attr fork, else igets. * See: xfs_bulkstat_one & xfs_dm_bulkstat_one. * This is also used to count inodes/blks, etc * in xfs_qm_quotacheck. @@ -597,8 +663,13 @@ xfs_bulkstat( ubleft, private_data, bno, &ubused, dip, &fmterror); if (fmterror == BULKSTAT_RV_NOTHING) { - if (error == ENOMEM) + if (error && error != ENOENT && + error != EINVAL) { ubleft = 0; + rval = error; + break; + } + lastino = ino; continue; } if (fmterror == BULKSTAT_RV_GIVEUP) { @@ -613,6 +684,8 @@ xfs_bulkstat( ubelem++; lastino = ino; } + + cond_resched(); } if (bp) @@ -621,7 +694,7 @@ xfs_bulkstat( /* * Set up for the next loop iteration. */ - if (ubleft > 0) { + if (XFS_BULKSTAT_UBLEFT(ubleft)) { if (end_of_ag) { agno++; agino = 0; @@ -633,8 +706,13 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free(irbuf, NBPC); + kmem_free(irbuf); *ubcountp = ubelem; + /* + * Found some inodes, return them now and return the error next time. + */ + if (ubelem) + rval = 0; if (agno >= mp->m_sb.sb_agcount) { /* * If we ran out of filesystem, mark lastino as off @@ -698,6 +776,19 @@ xfs_bulkstat_single( return 0; } +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written) /* # of bytes written */ +{ + if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) + return -EFAULT; + *written = count * sizeof(*buffer); + return 0; +} + /* * Return inode number table for the filesystem. */ @@ -706,7 +797,8 @@ xfs_inumbers( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t *lastino, /* last inode returned */ int *count, /* size of buffer/count returned */ - xfs_inogrp_t __user *ubuffer)/* buffer with inode descriptions */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) { xfs_buf_t *agbp; xfs_agino_t agino; @@ -729,7 +821,7 @@ xfs_inumbers( agino = XFS_INO_TO_AGINO(mp, ino); left = *count; *count = 0; - bcount = MIN(left, (int)(NBPP / sizeof(*buffer))); + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); error = bufidx = 0; cur = NULL; @@ -759,7 +851,7 @@ xfs_inumbers( xfs_buf_relse(agbp); agbp = NULL; /* - * Move up the the last inode in the current + * Move up the last inode in the current * chunk. The lookup_ge will always get * us the first inode in the next chunk. */ @@ -785,12 +877,12 @@ xfs_inumbers( bufidx++; left--; if (bufidx == bcount) { - if (copy_to_user(ubuffer, buffer, - bufidx * sizeof(*buffer))) { + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) { error = XFS_ERROR(EFAULT); break; } - ubuffer += bufidx; + ubuffer += written; *count += bufidx; bufidx = 0; } @@ -812,15 +904,15 @@ xfs_inumbers( } if (!error) { if (bufidx) { - if (copy_to_user(ubuffer, buffer, - bufidx * sizeof(*buffer))) + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) error = XFS_ERROR(EFAULT); else *count += bufidx; } *lastino = XFS_AGINO_TO_INO(mp, agno, agino); } - kmem_free(buffer, bcount * sizeof(*buffer)); + kmem_free(buffer); if (cur) xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR)); diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index be5f12e..a1f18fc 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -36,15 +36,16 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, /* * Values for stat return value. */ -#define BULKSTAT_RV_NOTHING 0 -#define BULKSTAT_RV_DIDONE 1 -#define BULKSTAT_RV_GIVEUP 2 +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 /* * Values for bulkstat flag argument. */ -#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ -#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ +#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ +#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ +#define BULKSTAT_FG_INLINE 0x4 /* No iget if inline attrs */ /* * Return stat information in bulk (by-inode) for the filesystem. @@ -68,6 +69,10 @@ xfs_bulkstat_single( char __user *buffer, int *done); +typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ + void __user *ubuffer, /* buffer to write to */ + const xfs_bstat_t *buffer); /* buffer to read from */ + int xfs_bulkstat_one( xfs_mount_t *mp, @@ -80,11 +85,30 @@ xfs_bulkstat_one( void *dibuff, int *stat); +int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino); + +typedef int (*inumbers_fmt_pf)( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + int /* error status */ xfs_inumbers( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t *last, /* last inode returned */ int *count, /* size of buffer/count returned */ - xfs_inogrp_t __user *buffer);/* buffer with inode info */ + void __user *buffer, /* buffer with inode info */ + inumbers_fmt_pf formatter); #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 21ac1a6..3608a0f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -41,6 +41,7 @@ #include "xfs_inode.h" #include "xfs_rw.h" +kmem_zone_t *xfs_log_ticket_zone; #define xlog_write_adv_cnt(ptr, len, off, bytes) \ { (ptr) += (bytes); \ @@ -73,8 +74,6 @@ STATIC int xlog_state_get_iclog_space(xlog_t *log, xlog_ticket_t *ticket, int *continued_write, int *logoffsetp); -STATIC void xlog_state_put_ticket(xlog_t *log, - xlog_ticket_t *tic); STATIC int xlog_state_release_iclog(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_state_switch_iclogs(xlog_t *log, @@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, /* local ticket functions */ -STATIC void xlog_state_ticket_alloc(xlog_t *log); STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, int unit_bytes, int count, @@ -126,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE 2048 +#define XLOG_TRACE_ICLOG_SIZE 256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ + ktrace_free(log->l_grant_trace); +} + void xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) { unsigned long cnts; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } /* ticket counts are 1 byte each */ cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -159,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) } void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void +xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ + ktrace_free(iclog->ic_trace); +} + +void xlog_trace_iclog(xlog_in_core_t *iclog, uint state) { - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_SLEEP); ktrace_enter(iclog->ic_trace, (void *)((unsigned long)state), (void *)((unsigned long)current_pid()), @@ -172,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) (void *)NULL, (void *)NULL); } #else + +#define xlog_trace_loggrant_alloc(log) +#define xlog_trace_loggrant_dealloc(log) #define xlog_trace_loggrant(log,tic,string) + +#define xlog_trace_iclog_alloc(iclog) +#define xlog_trace_iclog_dealloc(iclog) #define xlog_trace_iclog(iclog,state) + #endif /* XFS_LOG_TRACE */ @@ -228,20 +254,24 @@ xlog_grant_sub_space(struct log *log, int bytes) static void xlog_grant_add_space_write(struct log *log, int bytes) { - log->l_grant_write_bytes += bytes; - if (log->l_grant_write_bytes > log->l_logsize) { - log->l_grant_write_bytes -= log->l_logsize; + int tmp = log->l_logsize - log->l_grant_write_bytes; + if (tmp > bytes) + log->l_grant_write_bytes += bytes; + else { log->l_grant_write_cycle++; + log->l_grant_write_bytes = bytes - tmp; } } static void xlog_grant_add_space_reserve(struct log *log, int bytes) { - log->l_grant_reserve_bytes += bytes; - if (log->l_grant_reserve_bytes > log->l_logsize) { - log->l_grant_reserve_bytes -= log->l_logsize; + int tmp = log->l_logsize - log->l_grant_reserve_bytes; + if (tmp > bytes) + log->l_grant_reserve_bytes += bytes; + else { log->l_grant_reserve_cycle++; + log->l_grant_reserve_bytes = bytes - tmp; } } @@ -252,6 +282,29 @@ xlog_grant_add_space(struct log *log, int bytes) xlog_grant_add_space_reserve(log, bytes); } +static void +xlog_tic_reset_res(xlog_ticket_t *tic) +{ + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + tic->t_res_num_ophdrs = 0; +} + +static void +xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) +{ + if (tic->t_res_num == XLOG_TIC_LEN_MAX) { + /* add to overflow and start again */ + tic->t_res_o_flow += tic->t_res_arr_sum; + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + } + + tic->t_res_arr[tic->t_res_num].r_len = len; + tic->t_res_arr[tic->t_res_num].r_type = type; + tic->t_res_arr_sum += len; + tic->t_res_num++; +} /* * NOTES: @@ -307,19 +360,16 @@ xfs_log_done(xfs_mount_t *mp, */ xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); xlog_ungrant_log_space(log, ticket); - xlog_state_put_ticket(log, ticket); + xlog_ticket_put(log, ticket); } else { xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); xlog_regrant_reserve_log_space(log, ticket); - } - - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) && - (flags & XFS_LOG_REL_PERM_RESERV) == 0) + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ ticket->t_flags |= XLOG_TIC_INITED; + } return lsn; } /* xfs_log_done */ @@ -332,11 +382,11 @@ xfs_log_done(xfs_mount_t *mp, * Asynchronous forces are implemented by setting the WANT_SYNC * bit in the appropriate in-core log and then returning. * - * Synchronous forces are implemented with a semaphore. All callers - * to force a given lsn to disk will wait on a semaphore attached to the + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the * specific in-core log. When given in-core log finally completes its * write to disk, that thread will wake up all threads waiting on the - * semaphore. + * sv. */ int _xfs_log_force( @@ -361,7 +411,27 @@ _xfs_log_force( return xlog_state_sync_all(log, flags, log_flushed); else return xlog_state_sync(log, lsn, flags, log_flushed); -} /* xfs_log_force */ +} /* _xfs_log_force */ + +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + error = _xfs_log_force(mp, lsn, flags, NULL); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " + "error %d returned.", error); + } +} + /* * Attaches a new iclog I/O completion callback routine during @@ -374,12 +444,10 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ void *iclog_hndl, /* iclog to hang callback off */ xfs_log_callback_t *cb) { - xlog_t *log = mp->m_log; xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; - int abortflg, spl; + int abortflg; - cb->cb_next = NULL; - spl = LOG_LOCK(log); + spin_lock(&iclog->ic_callback_lock); abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); if (!abortflg) { ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || @@ -388,7 +456,7 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ *(iclog->ic_callback_tail) = cb; iclog->ic_callback_tail = &(cb->cb_next); } - LOG_UNLOCK(log, spl); + spin_unlock(&iclog->ic_callback_lock); return abortflg; } /* xfs_log_notify */ @@ -448,6 +516,8 @@ xfs_log_reserve(xfs_mount_t *mp, /* may sleep if need to allocate more tickets */ internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, client, flags); + if (!internal_ticket) + return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; *ticket = internal_ticket; xlog_trace_loggrant(log, internal_ticket, @@ -475,41 +545,57 @@ xfs_log_reserve(xfs_mount_t *mp, * Return error or zero. */ int -xfs_log_mount(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) +xfs_log_mount( + xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { + int error; + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); else { cmn_err(CE_NOTE, "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", mp->m_fsname); - ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY); + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (!mp->m_log) { + cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!"); + error = ENOMEM; + goto out; + } + + /* + * Initialize the AIL now we have a log. + */ + spin_lock_init(&mp->m_ail_lock); + error = xfs_trans_ail_init(mp); + if (error) { + cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); + goto error; + } /* * skip log recovery on a norecovery mount. pretend it all * just worked. */ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - bhv_vfs_t *vfsp = XFS_MTOVFS(mp); - int error, readonly = (vfsp->vfs_flag & VFS_RDONLY); + int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (readonly) - vfsp->vfs_flag &= ~VFS_RDONLY; + mp->m_flags &= ~XFS_MOUNT_RDONLY; error = xlog_recover(mp->m_log); if (readonly) - vfsp->vfs_flag |= VFS_RDONLY; + mp->m_flags |= XFS_MOUNT_RDONLY; if (error) { cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); - xlog_dealloc_log(mp->m_log); - return error; + goto error; } } @@ -518,6 +604,10 @@ xfs_log_mount(xfs_mount_t *mp, /* End mounting message in xfs_log_mount_finish */ return 0; +error: + xfs_log_unmount_dealloc(mp); +out: + return error; } /* xfs_log_mount */ /* @@ -529,15 +619,15 @@ xfs_log_mount(xfs_mount_t *mp, * mp - ubiquitous xfs mount point structure */ int -xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) +xfs_log_mount_finish(xfs_mount_t *mp) { int error; if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - error = xlog_recover_finish(mp->m_log, mfsi_flags); + error = xlog_recover_finish(mp->m_log); else { error = 0; - ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY); + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } return error; @@ -584,7 +674,6 @@ xfs_log_unmount_write(xfs_mount_t *mp) xfs_log_ticket_t tic = NULL; xfs_lsn_t lsn; int error; - SPLDECL(s); /* the data section must be 32 bit size aligned */ struct { @@ -597,10 +686,11 @@ xfs_log_unmount_write(xfs_mount_t *mp) * Don't write out unmount record on read-only mounts. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; - xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC); + error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG first_iclog = iclog = log->l_iclog; @@ -617,7 +707,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) reg[0].i_len = sizeof(magic); XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_UNMOUNT); - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, + XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); if (!error) { /* remove inited flag */ ((xlog_ticket_t *)tic)->t_flags = 0; @@ -636,27 +727,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) } - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; - LOG_UNLOCK(log, s); + atomic_inc(&iclog->ic_refcnt); + spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - sv_wait(&iclog->ic_forcesema, PMEM, + sv_wait(&iclog->ic_force_wait, PMEM, &log->l_icloglock, s); } else { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } } else { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); + } + if (tic) { + xlog_trace_loggrant(log, tic, "unmount rec"); + xlog_ungrant_log_space(log, tic); + xlog_ticket_put(log, tic); } - if (tic) - xlog_state_put_ticket(log, tic); } else { /* * We're already in forced_shutdown mode, couldn't @@ -671,36 +765,40 @@ xfs_log_unmount_write(xfs_mount_t *mp) * a file system that went into forced_shutdown as * the result of an unmount.. */ - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; - LOG_UNLOCK(log, s); + atomic_inc(&iclog->ic_refcnt); + spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - sv_wait(&iclog->ic_forcesema, PMEM, + sv_wait(&iclog->ic_force_wait, PMEM, &log->l_icloglock, s); } else { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } } - return 0; + return error; } /* xfs_log_unmount_write */ /* * Deallocate log structures for unmount/relocation. + * + * We need to stop the aild from running before we destroy + * and deallocate the log as the aild references the log. */ void xfs_log_unmount_dealloc(xfs_mount_t *mp) { + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -736,20 +834,18 @@ xfs_log_move_tail(xfs_mount_t *mp, xlog_ticket_t *tic; xlog_t *log = mp->m_log; int need_bytes, free_bytes, cycle, bytes; - SPLDECL(s); if (XLOG_FORCED_SHUTDOWN(log)) return; - ASSERT(!XFS_FORCED_SHUTDOWN(mp)); if (tail_lsn == 0) { /* needed since sync_lsn is 64 bits */ - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); tail_lsn = log->l_last_sync_lsn; - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); /* Also an invalid lsn. 1 implies that we aren't passing in a valid * tail_lsn. @@ -773,7 +869,7 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= tic->t_unit_res; - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_write_headq); } @@ -794,11 +890,11 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= need_bytes; - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_reserve_headq); } - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); } /* xfs_log_move_tail */ /* @@ -810,16 +906,13 @@ xfs_log_move_tail(xfs_mount_t *mp, int xfs_log_need_covered(xfs_mount_t *mp) { - SPLDECL(s); int needed = 0, gen; xlog_t *log = mp->m_log; - bhv_vfs_t *vfsp = XFS_MTOVFS(mp); - if (vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) || - (vfsp->vfs_flag & VFS_RDONLY)) + if (!xfs_fs_writable(mp)) return 0; - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || (log->l_covered_state == XLOG_STATE_COVER_NEED2)) && !xfs_trans_first_ail(mp, &gen) @@ -832,7 +925,7 @@ xfs_log_need_covered(xfs_mount_t *mp) } needed = 1; } - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return needed; } @@ -857,17 +950,16 @@ xfs_lsn_t xlog_assign_tail_lsn(xfs_mount_t *mp) { xfs_lsn_t tail_lsn; - SPLDECL(s); xlog_t *log = mp->m_log; tail_lsn = xfs_trans_tail_ail(mp); - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); if (tail_lsn != 0) { log->l_tail_lsn = tail_lsn; } else { tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; } - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); return tail_lsn; } /* xlog_assign_tail_lsn */ @@ -887,7 +979,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp) * the tail. The details of this case are described below, but the end * result is that we return the size of the log as the amount of space left. */ -int +STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes) { int free_bytes; @@ -947,6 +1039,20 @@ xlog_iodone(xfs_buf_t *bp) l = iclog->ic_log; /* + * If the _XFS_BARRIER_FAILED flag was set by a lower + * layer, it means the underlying device no longer supports + * barrier I/O. Warn loudly and turn off barriers. + */ + if (bp->b_flags & _XFS_BARRIER_FAILED) { + bp->b_flags &= ~_XFS_BARRIER_FAILED; + l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; + xfs_fs_cmn_err(CE_WARN, l->l_mp, + "xlog_iodone: Barriers are no longer supported" + " by device. Disabling barriers\n"); + xfs_buftrace("XLOG_IODONE BARRIERS OFF", bp); + } + + /* * Race to shutdown the filesystem if we see an error. */ if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, @@ -963,14 +1069,16 @@ xlog_iodone(xfs_buf_t *bp) } else if (iclog->ic_state & XLOG_STATE_IOERROR) { aborted = XFS_LI_ABORTED; } + + /* log I/O is always issued ASYNC */ + ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); - if (!(XFS_BUF_ISASYNC(bp))) { - /* - * Corresponding psema() will be done in bwrite(). If we don't - * vsema() here, panic. - */ - XFS_BUF_V_IODONESEMA(bp); - } + /* + * do not reference the buffer (bp) here as we could race + * with it being freed after writing the unmount record to the + * log. + */ + } /* xlog_iodone */ /* @@ -1008,10 +1116,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp) /* * Return size of each in-core log record buffer. * - * Low memory machines only get 2 16KB buffers. We don't want to waste - * memory here. However, all other machines get at least 2 32KB buffers. - * The number is hard coded because we don't care about the minimum - * memory size, just 32MB systems. + * All machines get 8 x 32KB buffers by default, unless tuned otherwise. * * If the filesystem blocksize is too large, we may need to choose a * larger size since the directory code currently logs entire blocks. @@ -1024,17 +1129,10 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, int size; int xhdrs; - if (mp->m_logbufs <= 0) { - if (xfs_physmem <= btoc(128*1024*1024)) { - log->l_iclog_bufs = XLOG_MIN_ICLOGS; - } else if (xfs_physmem <= btoc(400*1024*1024)) { - log->l_iclog_bufs = XLOG_MED_ICLOGS; - } else { /* 256K with 32K bufs */ - log->l_iclog_bufs = XLOG_MAX_ICLOGS; - } - } else { + if (mp->m_logbufs <= 0) + log->l_iclog_bufs = XLOG_MAX_ICLOGS; + else log->l_iclog_bufs = mp->m_logbufs; - } /* * Buffer size passed in from mount system call. @@ -1047,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, size >>= 1; } - if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { + if (xfs_sb_version_haslogv2(&mp->m_sb)) { /* # headers = size / 32K * one header holds cycles from 32K of data */ @@ -1065,18 +1163,9 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, goto done; } - /* - * Special case machines that have less than 32MB of memory. - * All machines with more memory use 32KB buffers. - */ - if (xfs_physmem <= btoc(32*1024*1024)) { - /* Don't change; min configuration */ - log->l_iclog_size = XLOG_RECORD_BSIZE; /* 16k */ - log->l_iclog_size_log = XLOG_RECORD_BSHIFT; - } else { - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; /* 32k */ - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; - } + /* All machines use 32KB buffers by default. */ + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; /* the default log size is 16k or 32k which is one header sector */ log->l_iclog_hsize = BBSIZE; @@ -1134,7 +1223,9 @@ xlog_alloc_log(xfs_mount_t *mp, int i; int iclogsize; - log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP); + log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + if (!log) + return NULL; log->l_mp = mp; log->l_targ = log_target; @@ -1145,20 +1236,20 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_flags |= XLOG_ACTIVE_RECOVERY; log->l_prev_block = -1; - ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0); + log->l_tail_lsn = xlog_assign_lsn(1, 0); /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ log->l_last_sync_lsn = log->l_tail_lsn; log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ log->l_grant_reserve_cycle = 1; log->l_grant_write_cycle = 1; - if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) { + if (xfs_sb_version_hassector(&mp->m_sb)) { log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; ASSERT(log->l_sectbb_log <= mp->m_sectbb_log); /* for larger sector sizes, must have v2 or external log */ ASSERT(log->l_sectbb_log == 0 || log->l_logBBstart == 0 || - XFS_SB_VERSION_HASLOGV2(&mp->m_sb)); + xfs_sb_version_haslogv2(&mp->m_sb)); ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT); } log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; @@ -1166,6 +1257,8 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + if (!bp) + goto out_free_log; XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); @@ -1173,11 +1266,11 @@ xlog_alloc_log(xfs_mount_t *mp, ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); log->l_xbuf = bp; - spinlock_init(&log->l_icloglock, "iclog"); - spinlock_init(&log->l_grant_lock, "grhead_iclog"); - initnsema(&log->l_flushsema, 0, "ic-flush"); - xlog_state_ticket_alloc(log); /* wait until after icloglock inited */ + spin_lock_init(&log->l_icloglock); + spin_lock_init(&log->l_grant_lock); + sv_init(&log->l_flush_wait, 0, "flush_wait"); + xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1192,42 +1285,51 @@ xlog_alloc_log(xfs_mount_t *mp, iclogsize = log->l_iclog_size; ASSERT(log->l_iclog_size >= 4096); for (i=0; i < log->l_iclog_bufs; i++) { - *iclogp = (xlog_in_core_t *) - kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); - iclog = *iclogp; - iclog->hic_data = (xlog_in_core_2_t *) - kmem_zalloc(iclogsize, KM_SLEEP); + *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); + if (!*iclogp) + goto out_free_iclog; + iclog = *iclogp; iclog->ic_prev = prev_iclog; prev_iclog = iclog; - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); - - head = &iclog->ic_header; - memset(head, 0, sizeof(xlog_rec_header_t)); - INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM); - INT_SET(head->h_version, ARCH_CONVERT, - XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1); - INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size); - /* new fields */ - INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT); - memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); + if (!bp) + goto out_free_iclog; + if (!XFS_BUF_CPSEMA(bp)) + ASSERT(0); XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); iclog->ic_bp = bp; + iclog->hic_data = bp->b_addr; +#ifdef DEBUG + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); +#endif + head = &iclog->ic_header; + memset(head, 0, sizeof(xlog_rec_header_t)); + head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + head->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + head->h_size = cpu_to_be32(log->l_iclog_size); + /* new fields */ + head->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; + atomic_set(&iclog->ic_refcnt, 0); + spin_lock_init(&iclog->ic_callback_lock); iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); - sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); + sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); + sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + + xlog_trace_iclog_alloc(iclog); iclogp = &iclog->ic_next; } @@ -1235,6 +1337,25 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ return log; + +out_free_iclog: + for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { + prev_iclog = iclog->ic_next; + if (iclog->ic_bp) { + sv_destroy(&iclog->ic_force_wait); + sv_destroy(&iclog->ic_write_wait); + xfs_buf_free(iclog->ic_bp); + xlog_trace_iclog_dealloc(iclog); + } + kmem_free(iclog); + } + spinlock_destroy(&log->l_icloglock); + spinlock_destroy(&log->l_grant_lock); + xlog_trace_loggrant_dealloc(log); + xfs_buf_free(log->l_xbuf); +out_free_log: + kmem_free(log); + return NULL; } /* xlog_alloc_log */ @@ -1271,7 +1392,7 @@ xlog_commit_record(xfs_mount_t *mp, * pushes on an lsn which is further along in the log once we reach the high * water mark. In this manner, we would be creating a low water mark. */ -void +STATIC void xlog_grant_push_ail(xfs_mount_t *mp, int need_bytes) { @@ -1283,11 +1404,10 @@ xlog_grant_push_ail(xfs_mount_t *mp, int threshold_block; /* block in lsn we'd like to be at */ int threshold_cycle; /* lsn cycle we'd like to be at */ int free_threshold; - SPLDECL(s); ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, log->l_grant_reserve_bytes); @@ -1309,8 +1429,7 @@ xlog_grant_push_ail(xfs_mount_t *mp, threshold_block -= log->l_logBBsize; threshold_cycle += 1; } - ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle, - threshold_block); + threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); /* Don't pass in an lsn greater than the lsn of the last * log record known to be on disk. @@ -1318,7 +1437,7 @@ xlog_grant_push_ail(xfs_mount_t *mp, if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) threshold_lsn = log->l_last_sync_lsn; } - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); /* * Get the transaction layer to kick the dirty buffers out to @@ -1356,23 +1475,22 @@ xlog_grant_push_ail(xfs_mount_t *mp, * is added immediately before calling bwrite(). */ -int +STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog) { xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; - int i, ops; + int i; uint count; /* byte count of bwrite */ uint count_init; /* initial count before roundup */ int roundoff; /* roundoff to BB or stripe */ int split = 0; /* split write into two regions */ int error; - SPLDECL(s); - int v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb); + int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); XFS_STATS_INC(xs_log_writes); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; @@ -1393,30 +1511,26 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); xlog_grant_add_space(log, roundoff); - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); /* real byte length */ if (v2) { - INT_SET(iclog->ic_header.h_len, - ARCH_CONVERT, - iclog->ic_offset + roundoff); + iclog->ic_header.h_len = + cpu_to_be32(iclog->ic_offset + roundoff); } else { - INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset); + iclog->ic_header.h_len = + cpu_to_be32(iclog->ic_offset); } - /* put ops count in correct order */ - ops = iclog->ic_header.h_num_logops; - INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); - bp = iclog->ic_bp; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); - XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); + XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); @@ -1428,7 +1542,7 @@ xlog_sync(xlog_t *log, } else { iclog->ic_bwritecnt = 1; } - XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); + XFS_BUF_SET_COUNT(bp, count); XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); @@ -1479,10 +1593,10 @@ xlog_sync(xlog_t *log, * a new cycle. Watch out for the header magic number * case, though. */ - for (i=0; i<split; i += BBSIZE) { - INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1); - if (INT_GET(*(uint *)dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) - INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1); + for (i = 0; i < split; i += BBSIZE) { + be32_add_cpu((__be32 *)dptr, 1); + if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) + be32_add_cpu((__be32 *)dptr, 1); dptr += BBSIZE; } @@ -1505,60 +1619,29 @@ xlog_sync(xlog_t *log, /* * Deallocate a log structure */ -void +STATIC void xlog_dealloc_log(xlog_t *log) { xlog_in_core_t *iclog, *next_iclog; - xlog_ticket_t *tic, *next_tic; int i; - iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { - sv_destroy(&iclog->ic_forcesema); - sv_destroy(&iclog->ic_writesema); + sv_destroy(&iclog->ic_force_wait); + sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif + xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; - kmem_free(iclog->hic_data, log->l_iclog_size); - kmem_free(iclog, sizeof(xlog_in_core_t)); + kmem_free(iclog); iclog = next_iclog; } - freesema(&log->l_flushsema); spinlock_destroy(&log->l_icloglock); spinlock_destroy(&log->l_grant_lock); - /* XXXsup take a look at this again. */ - if ((log->l_ticket_cnt != log->l_ticket_tcnt) && - !XLOG_FORCED_SHUTDOWN(log)) { - xfs_fs_cmn_err(CE_WARN, log->l_mp, - "xlog_dealloc_log: (cnt: %d, total: %d)", - log->l_ticket_cnt, log->l_ticket_tcnt); - /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */ - - } else { - tic = log->l_unmount_free; - while (tic) { - next_tic = tic->t_next; - kmem_free(tic, NBPP); - tic = next_tic; - } - } xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif + xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; - kmem_free(log, sizeof(xlog_t)); + kmem_free(log); } /* xlog_dealloc_log */ /* @@ -1571,14 +1654,12 @@ xlog_state_finish_copy(xlog_t *log, int record_cnt, int copy_bytes) { - SPLDECL(s); - - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); - iclog->ic_header.h_num_logops += record_cnt; + be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } /* xlog_state_finish_copy */ @@ -1731,7 +1812,7 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) * we don't update ic_offset until the end when we know exactly how many * bytes have been written out. */ -int +STATIC int xlog_write(xfs_mount_t * mp, xfs_log_iovec_t reg[], int nentries, @@ -1766,14 +1847,14 @@ xlog_write(xfs_mount_t * mp, len = 0; if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */ len += sizeof(xlog_op_header_t); - XLOG_TIC_ADD_OPHDR(ticket); + ticket->t_res_num_ophdrs++; } for (index = 0; index < nentries; index++) { len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ - XLOG_TIC_ADD_OPHDR(ticket); + ticket->t_res_num_ophdrs++; len += reg[index].i_len; - XLOG_TIC_ADD_REGION(ticket, reg[index].i_len, reg[index].i_type); + xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type); } contwr = *start_lsn = 0; @@ -1802,7 +1883,7 @@ xlog_write(xfs_mount_t * mp, /* start_lsn is the first lsn written to. That's all we need. */ if (! *start_lsn) - *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); /* This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). @@ -1818,7 +1899,7 @@ xlog_write(xfs_mount_t * mp, */ if (ticket->t_flags & XLOG_TIC_INITED) { logop_head = (xlog_op_header_t *)ptr; - INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); + logop_head->oh_tid = cpu_to_be32(ticket->t_tid); logop_head->oh_clientid = ticket->t_clientid; logop_head->oh_len = 0; logop_head->oh_flags = XLOG_START_TRANS; @@ -1832,7 +1913,7 @@ xlog_write(xfs_mount_t * mp, /* Copy log operation header directly into data section */ logop_head = (xlog_op_header_t *)ptr; - INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); + logop_head->oh_tid = cpu_to_be32(ticket->t_tid); logop_head->oh_clientid = ticket->t_clientid; logop_head->oh_res2 = 0; @@ -1867,13 +1948,14 @@ xlog_write(xfs_mount_t * mp, copy_off = partial_copy_len; if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ - INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy); + copy_len = need_copy; + logop_head->oh_len = cpu_to_be32(copy_len); if (partial_copy) logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); partial_copy_len = partial_copy = 0; } else { /* partial write */ copy_len = iclog->ic_size - log_offset; - INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len); + logop_head->oh_len = cpu_to_be32(copy_len); logop_head->oh_flags |= XLOG_CONTINUE_TRANS; if (partial_copy) logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; @@ -1882,7 +1964,7 @@ xlog_write(xfs_mount_t * mp, len += sizeof(xlog_op_header_t); /* from splitting of region */ /* account for new log op header */ ticket->t_curr_res -= sizeof(xlog_op_header_t); - XLOG_TIC_ADD_OPHDR(ticket); + ticket->t_res_num_ophdrs++; } xlog_verify_dest_ptr(log, ptr); @@ -1945,7 +2027,7 @@ xlog_write(xfs_mount_t * mp, /* Clean iclogs starting from the head. This ordering must be * maintained, so an iclog doesn't become ACTIVE beyond one that * is SYNCING. This is also required to maintain the notion that we use - * a counting semaphore to hold off would be writers to the log when every + * a ordered wait queue to hold off would be writers to the log when every * iclog is trying to sync to disk. * * State Change: DIRTY -> ACTIVE @@ -1961,7 +2043,7 @@ xlog_state_clean_log(xlog_t *log) if (iclog->ic_state == XLOG_STATE_DIRTY) { iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - iclog->ic_callback = NULL; /* don't need to free */ + ASSERT(iclog->ic_callback == NULL); /* * If the number of ops in this iclog indicate it just * contains the dummy transaction, we can @@ -1971,7 +2053,8 @@ xlog_state_clean_log(xlog_t *log) * We don't need to cover the dummy. */ if (!changed && - (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) { + (be32_to_cpu(iclog->ic_header.h_num_logops) == + XLOG_COVER_OPS)) { changed = 1; } else { /* @@ -2039,7 +2122,7 @@ xlog_get_lowest_lsn( lowest_lsn = 0; do { if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { - lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT); + lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); if ((lsn && !lowest_lsn) || (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { lowest_lsn = lsn; @@ -2068,9 +2151,9 @@ xlog_state_do_callback( int funcdidcallbacks; /* flag: function did callbacks */ int repeats; /* for issuing console warnings if * looping too many times */ - SPLDECL(s); + int wake = 0; - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); first_iclog = iclog = log->l_iclog; ioerrors = 0; funcdidcallbacks = 0; @@ -2115,7 +2198,7 @@ xlog_state_do_callback( * to DO_CALLBACK, we will not process it when * we retry since a previous iclog is in the * CALLBACK and the state cannot change since - * we are holding the LOG_LOCK. + * we are holding the l_icloglock. */ if (!(iclog->ic_state & (XLOG_STATE_DONE_SYNC | @@ -2141,11 +2224,9 @@ xlog_state_do_callback( */ lowest_lsn = xlog_get_lowest_lsn(log); - if (lowest_lsn && ( - XFS_LSN_CMP( - lowest_lsn, - INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) - )<0)) { + if (lowest_lsn && + XFS_LSN_CMP(lowest_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { iclog = iclog->ic_next; continue; /* Leave this iclog for * another thread */ @@ -2153,51 +2234,53 @@ xlog_state_do_callback( iclog->ic_state = XLOG_STATE_CALLBACK; - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); /* l_last_sync_lsn field protected by - * GRANT_LOCK. Don't worry about iclog's lsn. + * l_grant_lock. Don't worry about iclog's lsn. * No one else can be here except us. */ - s = GRANT_LOCK(log); - ASSERT(XFS_LSN_CMP( - log->l_last_sync_lsn, - INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) - )<=0); - log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); - GRANT_UNLOCK(log, s); + spin_lock(&log->l_grant_lock); + ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + log->l_last_sync_lsn = + be64_to_cpu(iclog->ic_header.h_lsn); + spin_unlock(&log->l_grant_lock); - /* - * Keep processing entries in the callback list - * until we come around and it is empty. We - * need to atomically see that the list is - * empty and change the state to DIRTY so that - * we don't miss any more callbacks being added. - */ - s = LOG_LOCK(log); } else { + spin_unlock(&log->l_icloglock); ioerrors++; } - cb = iclog->ic_callback; - while (cb != 0) { + /* + * Keep processing entries in the callback list until + * we come around and it is empty. We need to + * atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more + * callbacks being added. + */ + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + while (cb) { iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_callback = NULL; - LOG_UNLOCK(log, s); + spin_unlock(&iclog->ic_callback_lock); /* perform callbacks in the order given */ - for (; cb != 0; cb = cb_next) { + for (; cb; cb = cb_next) { cb_next = cb->cb_next; cb->cb_func(cb->cb_arg, aborted); } - s = LOG_LOCK(log); + spin_lock(&iclog->ic_callback_lock); cb = iclog->ic_callback; } loopdidcallbacks++; funcdidcallbacks++; - ASSERT(iclog->ic_callback == 0); + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_callback == NULL); + spin_unlock(&iclog->ic_callback_lock); if (!(iclog->ic_state & XLOG_STATE_IOERROR)) iclog->ic_state = XLOG_STATE_DIRTY; @@ -2208,13 +2291,17 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - sv_broadcast(&iclog->ic_forcesema); + sv_broadcast(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); - if (repeats && (repeats % 10) == 0) { + + if (repeats > 5000) { + flushcnt += repeats; + repeats = 0; xfs_fs_cmn_err(CE_WARN, log->l_mp, - "xlog_state_do_callback: looping %d", repeats); + "%s: possible infinite loop (%d iterations)", + __func__, flushcnt); } } while (!ioerrors && loopdidcallbacks); @@ -2233,7 +2320,7 @@ xlog_state_do_callback( * * SYNCING - i/o completion will go through logs * DONE_SYNC - interrupt thread should be waiting for - * LOG_LOCK + * l_icloglock * IOERROR - give up hope all ye who enter here */ if (iclog->ic_state == XLOG_STATE_WANT_SYNC || @@ -2246,14 +2333,13 @@ xlog_state_do_callback( } #endif - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { - flushcnt = log->l_flushcnt; - log->l_flushcnt = 0; - } - LOG_UNLOCK(log, s); - while (flushcnt--) - vsema(&log->l_flushsema); -} /* xlog_state_do_callback */ + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; + spin_unlock(&log->l_icloglock); + + if (wake) + sv_broadcast(&log->l_flush_wait); +} /* @@ -2267,22 +2353,20 @@ xlog_state_do_callback( * the second completion goes through. * * Callbacks could take time, so they are done outside the scope of the - * global state machine log lock. Assume that the calls to cvsema won't - * take a long time. At least we know it won't sleep. + * global state machine log lock. */ -void +STATIC void xlog_state_done_syncing( xlog_in_core_t *iclog, int aborted) { xlog_t *log = iclog->ic_log; - SPLDECL(s); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_IOERROR); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); @@ -2294,7 +2378,7 @@ xlog_state_done_syncing( */ if (iclog->ic_state != XLOG_STATE_IOERROR) { if (--iclog->ic_bwritecnt == 1) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return; } iclog->ic_state = XLOG_STATE_DONE_SYNC; @@ -2305,19 +2389,17 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - sv_broadcast(&iclog->ic_writesema); - LOG_UNLOCK(log, s); + sv_broadcast(&iclog->ic_write_wait); + spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must - * sleep. The flush semaphore is set to the number of in-core buffers and - * decremented around disk syncing. Therefore, if all buffers are syncing, - * this semaphore will cause new writes to sleep until a sync completes. - * Otherwise, this code just does p() followed by v(). This approximates - * a sleep/wakeup except we can't race. + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. * * The in-core logs are used in a circular fashion. They are not used * out-of-order even when an iclog past the head is free. @@ -2331,7 +2413,7 @@ xlog_state_done_syncing( * needs to be incremented, depending on the amount of data which * is copied. */ -int +STATIC int xlog_state_get_iclog_space(xlog_t *log, int len, xlog_in_core_t **iclogp, @@ -2339,33 +2421,31 @@ xlog_state_get_iclog_space(xlog_t *log, int *continued_write, int *logoffsetp) { - SPLDECL(s); int log_offset; xlog_rec_header_t *head; xlog_in_core_t *iclog; int error; restart: - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } iclog = log->l_iclog; - if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) { - log->l_flushcnt++; - LOG_UNLOCK(log, s); + if (iclog->ic_state != XLOG_STATE_ACTIVE) { xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); XFS_STATS_INC(xs_log_noiclogs); - /* Ensure that log writes happen */ - psema(&log->l_flushsema, PINOD); + + /* Wait for log writes to have flushed */ + sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); goto restart; } - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + head = &iclog->ic_header; - iclog->ic_refcnt++; /* prevents sync */ + atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; /* On the 1st write to an iclog, figure out lsn. This works @@ -2375,11 +2455,12 @@ restart: */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; - XLOG_TIC_ADD_REGION(ticket, + xlog_tic_add_region(ticket, log->l_iclog_hsize, XLOG_REG_TYPE_LRHEADER); - INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle); - ASSIGN_LSN(head->h_lsn, log); + head->h_cycle = cpu_to_be32(log->l_curr_cycle); + head->h_lsn = cpu_to_be64( + xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); ASSERT(log->l_curr_block >= 0); } @@ -2395,14 +2476,21 @@ restart: if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - /* If I'm the only one writing to this iclog, sync it to disk */ - if (iclog->ic_refcnt == 1) { - LOG_UNLOCK(log, s); - if ((error = xlog_state_release_iclog(log, iclog))) + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + if (error) return error; } else { - iclog->ic_refcnt--; - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } goto restart; } @@ -2423,7 +2511,7 @@ restart: *iclogp = iclog; ASSERT(iclog->ic_offset <= iclog->ic_size); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); *logoffsetp = log_offset; return 0; @@ -2441,7 +2529,6 @@ xlog_grant_log_space(xlog_t *log, { int free_bytes; int need_bytes; - SPLDECL(s); #ifdef DEBUG xfs_lsn_t tail_lsn; #endif @@ -2453,7 +2540,7 @@ xlog_grant_log_space(xlog_t *log, #endif /* Is there space or do we need to sleep? */ - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); /* something is already sleeping; insert new transaction at end */ @@ -2469,14 +2556,14 @@ xlog_grant_log_space(xlog_t *log, goto error_return; XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... */ xlog_trace_loggrant(log, tic, "xlog_grant_log_space: wake 1"); - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); } if (tic->t_flags & XFS_LOG_PERM_RESERV) need_bytes = tic->t_unit_res*tic->t_ocnt; @@ -2495,17 +2582,17 @@ redo: xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); if (XLOG_FORCED_SHUTDOWN(log)) { - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); goto error_return; } xlog_trace_loggrant(log, tic, "xlog_grant_log_space: wake 2"); xlog_grant_push_ail(log->l_mp, need_bytes); - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); @@ -2527,7 +2614,7 @@ redo: #endif xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); xlog_verify_grant_head(log, 1); - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); return 0; error_return: @@ -2541,7 +2628,7 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_grant_log_space */ @@ -2555,7 +2642,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log, xlog_ticket_t *tic) { - SPLDECL(s); int free_bytes, need_bytes; xlog_ticket_t *ntic; #ifdef DEBUG @@ -2563,7 +2649,7 @@ xlog_regrant_write_log_space(xlog_t *log, #endif tic->t_curr_res = tic->t_unit_res; - XLOG_TIC_RESET_RES(tic); + xlog_tic_reset_res(tic); if (tic->t_cnt > 0) return 0; @@ -2573,7 +2659,7 @@ xlog_regrant_write_log_space(xlog_t *log, panic("regrant Recovery problem"); #endif - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); if (XLOG_FORCED_SHUTDOWN(log)) @@ -2595,7 +2681,7 @@ xlog_regrant_write_log_space(xlog_t *log, if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; - sv_signal(&ntic->t_sema); + sv_signal(&ntic->t_wait); ntic = ntic->t_next; } while (ntic != log->l_write_headq); @@ -2606,20 +2692,20 @@ xlog_regrant_write_log_space(xlog_t *log, xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already * off the queue */ if (XLOG_FORCED_SHUTDOWN(log)) { - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); goto error_return; } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 1"); xlog_grant_push_ail(log->l_mp, tic->t_unit_res); - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); } } @@ -2635,18 +2721,18 @@ redo: if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ if (XLOG_FORCED_SHUTDOWN(log)) { - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); goto error_return; } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 2"); xlog_grant_push_ail(log->l_mp, need_bytes); - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_write_headq, tic); @@ -2663,7 +2749,7 @@ redo: xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); xlog_verify_grant_head(log, 1); - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); return 0; @@ -2678,7 +2764,7 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_regrant_write_log_space */ @@ -2694,24 +2780,22 @@ STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket) { - SPLDECL(s); - xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: enter"); if (ticket->t_cnt > 0) ticket->t_cnt--; - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); xlog_grant_sub_space(log, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; - XLOG_TIC_RESET_RES(ticket); + xlog_tic_reset_res(ticket); xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: sub current res"); xlog_verify_grant_head(log, 1); /* just return if we still have some of the pre-reserved space */ if (ticket->t_cnt > 0) { - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); return; } @@ -2719,9 +2803,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: exit"); xlog_verify_grant_head(log, 0); - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); ticket->t_curr_res = ticket->t_unit_res; - XLOG_TIC_RESET_RES(ticket); + xlog_tic_reset_res(ticket); } /* xlog_regrant_reserve_log_space */ @@ -2743,12 +2827,10 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket) { - SPLDECL(s); - if (ticket->t_cnt > 0) ticket->t_cnt--; - s = GRANT_LOCK(log); + spin_lock(&log->l_grant_lock); xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); xlog_grant_sub_space(log, ticket->t_curr_res); @@ -2765,26 +2847,12 @@ xlog_ungrant_log_space(xlog_t *log, xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); xlog_verify_grant_head(log, 1); - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); xfs_log_move_tail(log->l_mp, 1); } /* xlog_ungrant_log_space */ /* - * Atomically put back used ticket. - */ -void -xlog_state_put_ticket(xlog_t *log, - xlog_ticket_t *tic) -{ - unsigned long s; - - s = LOG_LOCK(log); - xlog_ticket_put(log, tic); - LOG_UNLOCK(log, s); -} /* xlog_state_put_ticket */ - -/* * Flush iclog to disk if this is the last reference to the given iclog and * the WANT_SYNC bit is set. * @@ -2793,36 +2861,37 @@ xlog_state_put_ticket(xlog_t *log, * * */ -int -xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog) +STATIC int +xlog_state_release_iclog( + xlog_t *log, + xlog_in_core_t *iclog) { - SPLDECL(s); int sync = 0; /* do we sync? */ - xlog_assign_tail_lsn(log->l_mp); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); - s = LOG_LOCK(log); + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } - - ASSERT(iclog->ic_refcnt > 0); ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_WANT_SYNC); - if (--iclog->ic_refcnt == 0 && - iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; - INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); /* cycle incremented when incrementing curr_block */ } - - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); /* * We let the log lock go, so it's possible that we hit a log I/O @@ -2831,11 +2900,9 @@ xlog_state_release_iclog(xlog_t *log, * this iclog has consistent data, so we ignore IOERROR * flags after this point. */ - if (sync) { + if (sync) return xlog_sync(log, iclog); - } return 0; - } /* xlog_state_release_iclog */ @@ -2855,7 +2922,7 @@ xlog_state_switch_iclogs(xlog_t *log, if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; - INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block); + iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); log->l_prev_block = log->l_curr_block; log->l_prev_cycle = log->l_curr_cycle; @@ -2863,7 +2930,7 @@ xlog_state_switch_iclogs(xlog_t *log, log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); /* Round up to next log-sunit */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); log->l_curr_block = roundup(log->l_curr_block, sunit_bb); @@ -2897,7 +2964,7 @@ xlog_state_switch_iclogs(xlog_t *log, * 2. the current iclog is drity, and the previous iclog is in the * active or dirty state. * - * We may sleep (call psema) if: + * We may sleep if: * * 1. the current iclog is not in the active nor dirty state. * 2. the current iclog dirty, and the previous iclog is not in the @@ -2913,13 +2980,12 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) { xlog_in_core_t *iclog; xfs_lsn_t lsn; - SPLDECL(s); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } @@ -2936,7 +3002,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) * previous iclog and go to sleep. */ if (iclog->ic_state == XLOG_STATE_DIRTY || - (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { iclog = iclog->ic_prev; if (iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY) @@ -2944,23 +3011,23 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) else goto maybe_sleep; } else { - if (iclog->ic_refcnt == 0) { + if (atomic_read(&iclog->ic_refcnt) == 0) { /* We are the only one with access to this * iclog. Flush it out now. There should * be a roundoff of zero to show that someone * has already taken care of the roundoff from * the previous sync. */ - iclog->ic_refcnt++; - lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); *log_flushed = 1; - s = LOG_LOCK(log); - if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn && + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && iclog->ic_state != XLOG_STATE_DIRTY) goto maybe_sleep; else @@ -2985,16 +3052,16 @@ maybe_sleep: if (flags & XFS_LOG_SYNC) { /* * We must check if we're shutting down here, before - * we wait, while we're holding the LOG_LOCK. + * we wait, while we're holding the l_icloglock. * Then we check again after waking up, in case our * sleep was disturbed by a bad news. */ if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); + sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3007,7 +3074,7 @@ maybe_sleep: } else { no_sleep: - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } return 0; } /* xlog_state_sync_all */ @@ -3025,7 +3092,7 @@ no_sleep: * If filesystem activity goes to zero, the iclog will get flushed only by * bdflush(). */ -int +STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags, @@ -3033,26 +3100,24 @@ xlog_state_sync(xlog_t *log, { xlog_in_core_t *iclog; int already_slept = 0; - SPLDECL(s); - try_again: - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } do { - if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) { - iclog = iclog->ic_next; - continue; + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; } if (iclog->ic_state == XLOG_STATE_DIRTY) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return 0; } @@ -3079,19 +3144,19 @@ try_again: XLOG_STATE_SYNCING))) { ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_writesema, PSWP, + sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, &log->l_icloglock, s); *log_flushed = 1; already_slept = 1; goto try_again; } else { - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); *log_flushed = 1; - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); } } @@ -3099,15 +3164,15 @@ try_again: !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { /* - * Don't wait on the forcesema if we know that we've + * Don't wait on completion if we know that we've * gotten a log write error. */ if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); + sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3117,13 +3182,13 @@ try_again: return XFS_ERROR(EIO); *log_flushed = 1; } else { /* just return */ - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } return 0; } while (iclog != log->l_iclog); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return 0; } /* xlog_state_sync */ @@ -3132,12 +3197,10 @@ try_again: * Called when we want to mark the current iclog as being ready to sync to * disk. */ -void +STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { - SPLDECL(s); - - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); @@ -3146,7 +3209,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); } - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } /* xlog_state_want_sync */ @@ -3159,95 +3222,21 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) */ /* - * Algorithm doesn't take into account page size. ;-( - */ -STATIC void -xlog_state_ticket_alloc(xlog_t *log) -{ - xlog_ticket_t *t_list; - xlog_ticket_t *next; - xfs_caddr_t buf; - uint i = (NBPP / sizeof(xlog_ticket_t)) - 2; - SPLDECL(s); - - /* - * The kmem_zalloc may sleep, so we shouldn't be holding the - * global lock. XXXmiken: may want to use zone allocator. - */ - buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP); - - s = LOG_LOCK(log); - - /* Attach 1st ticket to Q, so we can keep track of allocated memory */ - t_list = (xlog_ticket_t *)buf; - t_list->t_next = log->l_unmount_free; - log->l_unmount_free = t_list++; - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - - /* Next ticket becomes first ticket attached to ticket free list */ - if (log->l_freelist != NULL) { - ASSERT(log->l_tail != NULL); - log->l_tail->t_next = t_list; - } else { - log->l_freelist = t_list; - } - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - - /* Cycle through rest of alloc'ed memory, building up free Q */ - for ( ; i > 0; i--) { - next = t_list + 1; - t_list->t_next = next; - t_list = next; - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - } - t_list->t_next = NULL; - log->l_tail = t_list; - LOG_UNLOCK(log, s); -} /* xlog_state_ticket_alloc */ - - -/* - * Put ticket into free list - * - * Assumption: log lock is held around this call. + * Free a used ticket. */ STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket) { - sv_destroy(&ticket->t_sema); - - /* - * Don't think caching will make that much difference. It's - * more important to make debug easier. - */ -#if 0 - /* real code will want to use LIFO for caching */ - ticket->t_next = log->l_freelist; - log->l_freelist = ticket; - /* no need to clear fields */ -#else - /* When we debug, it is easier if tickets are cycled */ - ticket->t_next = NULL; - if (log->l_tail != 0) { - log->l_tail->t_next = ticket; - } else { - ASSERT(log->l_freelist == 0); - log->l_freelist = ticket; - } - log->l_tail = ticket; -#endif /* DEBUG */ - log->l_ticket_cnt++; + sv_destroy(&ticket->t_wait); + kmem_zone_free(xfs_log_ticket_zone, ticket); } /* xlog_ticket_put */ /* - * Grab ticket off freelist or allocation some more + * Allocate and initialise a new log ticket. */ -xlog_ticket_t * +STATIC xlog_ticket_t * xlog_ticket_get(xlog_t *log, int unit_bytes, int cnt, @@ -3256,23 +3245,10 @@ xlog_ticket_get(xlog_t *log, { xlog_ticket_t *tic; uint num_headers; - SPLDECL(s); - - alloc: - if (log->l_freelist == NULL) - xlog_state_ticket_alloc(log); /* potentially sleep */ - s = LOG_LOCK(log); - if (log->l_freelist == NULL) { - LOG_UNLOCK(log, s); - goto alloc; - } - tic = log->l_freelist; - log->l_freelist = tic->t_next; - if (log->l_freelist == NULL) - log->l_tail = NULL; - log->l_ticket_cnt--; - LOG_UNLOCK(log, s); + tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); + if (!tic) + return NULL; /* * Permanent reservations have up to 'cnt'-1 active log operations @@ -3323,7 +3299,7 @@ xlog_ticket_get(xlog_t *log, unit_bytes += sizeof(xlog_op_header_t) * num_headers; /* for roundoff padding for transaction data and one for commit record */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { /* log su roundoff */ unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; @@ -3342,9 +3318,9 @@ xlog_ticket_get(xlog_t *log, tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); + sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); - XLOG_TIC_RESET_RES(tic); + xlog_tic_reset_res(tic); return tic; } /* xlog_ticket_get */ @@ -3447,33 +3423,32 @@ xlog_verify_iclog(xlog_t *log, __uint8_t clientid; int len, i, j, k, op_len; int idx; - SPLDECL(s); /* check validity of iclog pointers */ - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); icptr = log->l_iclog; for (i=0; i < log->l_iclog_bufs; i++) { - if (icptr == 0) + if (icptr == NULL) xlog_panic("xlog_verify_iclog: invalid ptr"); icptr = icptr->ic_next; } if (icptr != log->l_iclog) xlog_panic("xlog_verify_iclog: corrupt iclog ring"); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); /* check log magic numbers */ - ptr = (xfs_caddr_t) &(iclog->ic_header); - if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) + if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) xlog_panic("xlog_verify_iclog: invalid magic num"); - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count; + ptr = (xfs_caddr_t) &iclog->ic_header; + for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; ptr += BBSIZE) { - if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) xlog_panic("xlog_verify_iclog: unexpected magic num"); } /* check fields */ - len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT); + len = be32_to_cpu(iclog->ic_header.h_num_logops); ptr = iclog->ic_datap; base_ptr = ptr; ophead = (xlog_op_header_t *)ptr; @@ -3491,9 +3466,11 @@ xlog_verify_iclog(xlog_t *log, if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); + clientid = xlog_get_client_id( + xhdr[j].hic_xheader.xh_cycle_data[k]); } else { - clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); + clientid = xlog_get_client_id( + iclog->ic_header.h_cycle_data[idx]); } } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) @@ -3505,16 +3482,16 @@ xlog_verify_iclog(xlog_t *log, field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); if (syncing == B_FALSE || (field_offset & 0x1ff)) { - op_len = INT_GET(ophead->oh_len, ARCH_CONVERT); + op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - (__psint_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); + op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); } else { - op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); + op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } ptr += sizeof(xlog_op_header_t) + op_len; @@ -3523,7 +3500,7 @@ xlog_verify_iclog(xlog_t *log, #endif /* - * Mark all iclogs IOERROR. LOG_LOCK is held by the caller. + * Mark all iclogs IOERROR. l_icloglock is held by the caller. */ STATIC int xlog_state_ioerror( @@ -3571,8 +3548,6 @@ xfs_log_force_umount( xlog_t *log; int retval; int dummy; - SPLDECL(s); - SPLDECL(s2); log = mp->m_log; @@ -3601,8 +3576,8 @@ xfs_log_force_umount( * before we mark the filesystem SHUTDOWN and wake * everybody up to tell the bad news. */ - s = GRANT_LOCK(log); - s2 = LOG_LOCK(log); + spin_lock(&log->l_icloglock); + spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; XFS_BUF_DONE(mp->m_sb_bp); /* @@ -3618,7 +3593,7 @@ xfs_log_force_umount( */ if (logerror) retval = xlog_state_ioerror(log); - LOG_UNLOCK(log, s2); + spin_unlock(&log->l_icloglock); /* * We don't want anybody waiting for log reservations @@ -3630,18 +3605,18 @@ xfs_log_force_umount( */ if ((tic = log->l_reserve_headq)) { do { - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_reserve_headq); } if ((tic = log->l_write_headq)) { do { - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_write_headq); } - GRANT_UNLOCK(log, s); + spin_unlock(&log->l_grant_lock); if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); @@ -3650,9 +3625,9 @@ xfs_log_force_umount( * log down completely. */ xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); - s2 = LOG_LOCK(log); + spin_lock(&log->l_icloglock); retval = xlog_state_ioerror(log); - LOG_UNLOCK(log, s2); + spin_unlock(&log->l_icloglock); } /* * Wake up everybody waiting on xfs_log_force. @@ -3665,13 +3640,13 @@ xfs_log_force_umount( { xlog_in_core_t *iclog; - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { ASSERT(iclog->ic_callback == 0); iclog = iclog->ic_next; } while (iclog != log->l_iclog); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } #endif /* return non-zero if log IOERROR transition had already happened */ diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index eacb3d4..d47b91f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -22,8 +22,9 @@ #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) #define BLOCK_LSN(lsn) ((uint)(lsn)) + /* this is used in a spot where we might otherwise double-endian-flip */ -#define CYCLE_LSN_DISK(lsn) (((uint *)&(lsn))[0]) +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) #ifdef __KERNEL__ /* @@ -48,16 +49,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) */ /* - * Flags to xfs_log_mount - */ -#define XFS_LOG_RECOVER 0x1 - -/* * Flags to xfs_log_done() */ #define XFS_LOG_REL_PERM_RESERV 0x1 - /* * Flags to xfs_log_reserve() * @@ -70,8 +65,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LOG_SLEEP 0x0 #define XFS_LOG_NOSLEEP 0x1 #define XFS_LOG_PERM_RESERV 0x2 -#define XFS_LOG_RESV_ALL (XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV) - /* * Flags to xfs_log_force() @@ -149,13 +142,14 @@ int _xfs_log_force(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, int *log_forced); -#define xfs_log_force(mp, lsn, flags) \ - _xfs_log_force(mp, lsn, flags, NULL); +void xfs_log_force(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, int num_bblocks); -int xfs_log_mount_finish(struct xfs_mount *mp, int); +int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_move_tail(struct xfs_mount *mp, xfs_lsn_t tail_lsn); int xfs_log_notify(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 34bcbf5..e7d8f84 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -30,18 +30,16 @@ struct xfs_mount; */ #define XLOG_MIN_ICLOGS 2 -#define XLOG_MED_ICLOGS 4 #define XLOG_MAX_ICLOGS 8 -#define XLOG_CALLBACK_SIZE 10 #define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ #define XLOG_VERSION_1 1 #define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ #define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) -#define XLOG_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ #define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ #define XLOG_MAX_RECORD_BSIZE (256*1024) #define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ -#define XLOG_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ #define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ @@ -51,38 +49,27 @@ struct xfs_mount; #define XLOG_HEADER_SIZE 512 #define XLOG_REC_SHIFT(log) \ - BTOBB(1 << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) #define XLOG_TOTAL_REC_SHIFT(log) \ - BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) -/* - * set lsns - */ -#define ASSIGN_ANY_LSN_HOST(lsn,cycle,block) \ - { \ - (lsn) = ((xfs_lsn_t)(cycle)<<32)|(block); \ - } -#define ASSIGN_ANY_LSN_DISK(lsn,cycle,block) \ - { \ - INT_SET(((uint *)&(lsn))[0], ARCH_CONVERT, (cycle)); \ - INT_SET(((uint *)&(lsn))[1], ARCH_CONVERT, (block)); \ - } -#define ASSIGN_LSN(lsn,log) \ - ASSIGN_ANY_LSN_DISK(lsn,(log)->l_curr_cycle,(log)->l_curr_block); - -#define XLOG_SET(f,b) (((f) & (b)) == (b)) - -#define GET_CYCLE(ptr, arch) \ - (INT_GET(*(uint *)(ptr), arch) == XLOG_HEADER_MAGIC_NUM ? \ - INT_GET(*((uint *)(ptr)+1), arch) : \ - INT_GET(*(uint *)(ptr), arch) \ - ) +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} -#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) #ifdef __KERNEL__ @@ -98,19 +85,10 @@ struct xfs_mount; * * this has endian issues, of course. */ - -#ifndef XFS_NATIVE_HOST -#define GET_CLIENT_ID(i,arch) \ - ((i) & 0xff) -#else -#define GET_CLIENT_ID(i,arch) \ - ((i) >> 24) -#endif - -#define GRANT_LOCK(log) mutex_spinlock(&(log)->l_grant_lock) -#define GRANT_UNLOCK(log, s) mutex_spinunlock(&(log)->l_grant_lock, s) -#define LOG_LOCK(log) mutex_spinlock(&(log)->l_icloglock) -#define LOG_UNLOCK(log, s) mutex_spinunlock(&(log)->l_icloglock, s) +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} #define xlog_panic(args...) cmn_err(CE_PANIC, ## args) #define xlog_exit(args...) cmn_err(CE_PANIC, ## args) @@ -149,9 +127,6 @@ struct xfs_mount; #define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ -#define XLOG_SKIP_TRANS (XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \ - XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \ - XLOG_UNMOUNT_TRANS) #ifdef __KERNEL__ /* @@ -254,22 +229,6 @@ typedef __uint32_t xlog_tid_t; /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 -#define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \ - (t)->t_res_arr_sum = (t)->t_res_num_ophdrs = 0) -#define XLOG_TIC_ADD_OPHDR(t) ((t)->t_res_num_ophdrs++) -#define XLOG_TIC_ADD_REGION(t, len, type) \ - do { \ - if ((t)->t_res_num == XLOG_TIC_LEN_MAX) { \ - /* add to overflow and start again */ \ - (t)->t_res_o_flow += (t)->t_res_arr_sum; \ - (t)->t_res_num = 0; \ - (t)->t_res_arr_sum = 0; \ - } \ - (t)->t_res_arr[(t)->t_res_num].r_len = (len); \ - (t)->t_res_arr[(t)->t_res_num].r_type = (type); \ - (t)->t_res_arr_sum += (len); \ - (t)->t_res_num++; \ - } while (0) /* * Reservation region @@ -282,8 +241,8 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - sv_t t_sema; /* sleep on this semaphore : 20 */ - struct xlog_ticket *t_next; /* :4|8 */ + sv_t t_wait; /* ticket wait queue : 20 */ + struct xlog_ticket *t_next; /* :4|8 */ struct xlog_ticket *t_prev; /* :4|8 */ xlog_tid_t t_tid; /* transaction identifier : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -306,11 +265,11 @@ typedef struct xlog_ticket { typedef struct xlog_op_header { - xlog_tid_t oh_tid; /* transaction id of operation : 4 b */ - int oh_len; /* bytes in data region : 4 b */ - __uint8_t oh_clientid; /* who sent me this : 1 b */ - __uint8_t oh_flags; /* : 1 b */ - ushort oh_res2; /* 32 bit align : 2 b */ + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ } xlog_op_header_t; @@ -328,25 +287,25 @@ typedef struct xlog_op_header { #endif typedef struct xlog_rec_header { - uint h_magicno; /* log record (LR) identifier : 4 */ - uint h_cycle; /* write cycle of log : 4 */ - int h_version; /* LR version : 4 */ - int h_len; /* len in bytes; should be 64-bit aligned: 4 */ - xfs_lsn_t h_lsn; /* lsn of this LR : 8 */ - xfs_lsn_t h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - uint h_chksum; /* may not be used; non-zero if used : 4 */ - int h_prev_block; /* block number to previous LR : 4 */ - int h_num_logops; /* number of log operations in this LR : 4 */ - uint h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* new fields */ - int h_fmt; /* format of log record : 4 */ - uuid_t h_fs_uuid; /* uuid of FS : 16 */ - int h_size; /* iclog size : 4 */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ } xlog_rec_header_t; typedef struct xlog_rec_ext_header { - uint xh_cycle; /* write cycle of log : 4 */ - uint xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ } xlog_rec_ext_header_t; #ifdef __KERNEL__ @@ -355,7 +314,7 @@ typedef struct xlog_rec_ext_header { * xlog_rec_header_t into the reserved space. * - ic_data follows, so a write to disk can start at the beginning of * the iclog. - * - ic_forcesema is used to implement synchronous forcing of the iclog to disk. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_bp is a pointer to the buffer used to write this incore log to disk. * - ic_log is a pointer back to the global log structure. @@ -365,25 +324,43 @@ typedef struct xlog_rec_ext_header { * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. */ typedef struct xlog_iclog_fields { - sv_t ic_forcesema; - sv_t ic_writesema; + sv_t ic_force_wait; + sv_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; struct log *ic_log; - xfs_log_callback_t *ic_callback; - xfs_log_callback_t **ic_callback_tail; -#ifdef XFS_LOG_TRACE - struct ktrace *ic_trace; -#endif int ic_size; int ic_offset; - int ic_refcnt; int ic_bwritecnt; ushort_t ic_state; char *ic_datap; /* pointer to iclog data */ +#ifdef XFS_LOG_TRACE + struct ktrace *ic_trace; +#endif + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + xfs_log_callback_t *ic_callback; + xfs_log_callback_t **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; } xlog_iclog_fields_t; typedef union xlog_in_core2 { @@ -400,13 +377,14 @@ typedef struct xlog_in_core { /* * Defines to save our code from this glop. */ -#define ic_forcesema hic_fields.ic_forcesema -#define ic_writesema hic_fields.ic_writesema +#define ic_force_wait hic_fields.ic_force_wait +#define ic_write_wait hic_fields.ic_write_wait #define ic_next hic_fields.ic_next #define ic_prev hic_fields.ic_prev #define ic_bp hic_fields.ic_bp #define ic_log hic_fields.ic_log #define ic_callback hic_fields.ic_callback +#define ic_callback_lock hic_fields.ic_callback_lock #define ic_callback_tail hic_fields.ic_callback_tail #define ic_trace hic_fields.ic_trace #define ic_size hic_fields.ic_size @@ -424,43 +402,44 @@ typedef struct xlog_in_core { * that round off problems won't occur when releasing partial reservations. */ typedef struct log { - /* The following block of fields are changed while holding icloglock */ - sema_t l_flushsema; /* iclog flushing semaphore */ - int l_flushcnt; /* # of procs waiting on this - * sema */ - int l_ticket_cnt; /* free ticket count */ - int l_ticket_tcnt; /* total ticket count */ - int l_covered_state;/* state of "covering disk - * log entries" */ - xlog_ticket_t *l_freelist; /* free list of tickets */ - xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */ - xlog_ticket_t *l_tail; /* free list of tickets */ - xlog_in_core_t *l_iclog; /* head log queue */ - lock_t l_icloglock; /* grab to change iclog state */ - xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed - * buffers */ - xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ + /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct xfs_buf_cancel **l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectbb_log; /* log2 of sector size in BBs */ + uint l_sectbb_mask; /* sector size (in BBs) + * alignment mask */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ xfs_daddr_t l_logBBstart; /* start block of log */ int l_logsize; /* size of log in bytes */ int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + sv_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed + * buffers */ + xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ - int l_iclog_size; /* size of log in bytes */ - int l_iclog_size_log; /* log power size of log */ - int l_iclog_bufs; /* number of iclog buffers */ - - /* The following field are used for debugging; need to hold icloglock */ - char *l_iclog_bak[XLOG_MAX_ICLOGS]; /* The following block of fields are changed while holding grant_lock */ - lock_t l_grant_lock; + spinlock_t l_grant_lock ____cacheline_aligned_in_smp; xlog_ticket_t *l_reserve_headq; xlog_ticket_t *l_write_headq; int l_grant_reserve_cycle; @@ -468,19 +447,15 @@ typedef struct log { int l_grant_write_cycle; int l_grant_write_bytes; - /* The following fields don't need locking */ #ifdef XFS_LOG_TRACE - struct ktrace *l_trace; struct ktrace *l_grant_trace; #endif - uint l_flags; - uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ - struct xfs_buf_cancel **l_buf_cancel_table; - int l_iclog_hsize; /* size of iclog header */ - int l_iclog_heads; /* # of iclog header sectors */ - uint l_sectbb_log; /* log2 of sector size in BBs */ - uint l_sectbb_mask; /* sector size (in BBs) - * alignment mask */ + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + char *l_iclog_bak[XLOG_MAX_ICLOGS]; +#endif + } xlog_t; #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) @@ -492,7 +467,7 @@ extern int xlog_find_tail(xlog_t *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk); extern int xlog_recover(xlog_t *log); -extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); +extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern void xlog_recover_process_iunlinks(xlog_t *log); @@ -500,12 +475,20 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); +extern kmem_zone_t *xfs_log_ticket_zone; + /* iclog tracing */ #define XLOG_TRACE_GRAB_FLUSH 1 #define XLOG_TRACE_REL_FLUSH 2 #define XLOG_TRACE_SLEEP_FLUSH 3 #define XLOG_TRACE_WAKE_FLUSH 4 +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + #endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3cb678e..70e3ba3 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -46,6 +46,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_rw.h" +#include "xfs_utils.h" STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); @@ -120,7 +121,8 @@ xlog_bread( XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); xfsbdstrat(log->l_mp, bp); - if ((error = xfs_iowait(bp))) + error = xfs_iowait(bp); + if (error) xfs_ioerror_alert("xlog_bread", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; @@ -191,14 +193,14 @@ xlog_header_check_dump( { int b; - cmn_err(CE_DEBUG, "%s: SB : uuid = ", __FUNCTION__); + cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); for (b = 0; b < 16; b++) cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); cmn_err(CE_DEBUG, " log : uuid = "); for (b = 0; b < 16; b++) cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); - cmn_err(CE_DEBUG, ", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT)); + cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); } #else #define xlog_header_check_dump(mp, head) @@ -212,14 +214,14 @@ xlog_header_check_recover( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM); + ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); /* * IRIX doesn't write the h_fmt field and leaves it zeroed * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. */ - if (unlikely(INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT)) { + if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { xlog_warn( "XFS: dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); @@ -245,7 +247,7 @@ xlog_header_check_mount( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM); + ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); if (uuid_is_nil(&head->h_fs_uuid)) { /* @@ -293,7 +295,7 @@ xlog_recover_iodone( * Note that the algorithm can not be perfect because the disk will not * necessarily be perfect. */ -int +STATIC int xlog_find_cycle_start( xlog_t *log, xfs_buf_t *bp, @@ -311,7 +313,7 @@ xlog_find_cycle_start( if ((error = xlog_bread(log, mid_blk, 1, bp))) return error; offset = xlog_align(log, mid_blk, 1, bp); - mid_cycle = GET_CYCLE(offset, ARCH_CONVERT); + mid_cycle = xlog_get_cycle(offset); if (mid_cycle == cycle) { *last_blk = mid_blk; /* last_half_cycle == mid_cycle */ @@ -371,7 +373,7 @@ xlog_find_verify_cycle( buf = xlog_align(log, i, bcount, bp); for (j = 0; j < bcount; j++) { - cycle = GET_CYCLE(buf, ARCH_CONVERT); + cycle = xlog_get_cycle(buf); if (cycle == stop_on_cycle_no) { *new_blk = i+j; goto out; @@ -447,8 +449,7 @@ xlog_find_verify_log_record( head = (xlog_rec_header_t *)offset; - if (XLOG_HEADER_MAGIC_NUM == - INT_GET(head->h_magicno, ARCH_CONVERT)) + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) break; if (!smallmem) @@ -479,8 +480,8 @@ xlog_find_verify_log_record( * reset last_blk. Only when last_blk points in the middle of a log * record do we update last_blk. */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { - uint h_size = INT_GET(head->h_size, ARCH_CONVERT); + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + uint h_size = be32_to_cpu(head->h_size); xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; if (h_size % XLOG_HEADER_CYCLE_SIZE) @@ -489,8 +490,8 @@ xlog_find_verify_log_record( xhdrs = 1; } - if (*last_blk - i + extra_bblks - != BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + xhdrs) + if (*last_blk - i + extra_bblks != + BTOBB(be32_to_cpu(head->h_len)) + xhdrs) *last_blk = i; out: @@ -550,13 +551,13 @@ xlog_find_head( if ((error = xlog_bread(log, 0, 1, bp))) goto bp_err; offset = xlog_align(log, 0, 1, bp); - first_half_cycle = GET_CYCLE(offset, ARCH_CONVERT); + first_half_cycle = xlog_get_cycle(offset); last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ if ((error = xlog_bread(log, last_blk, 1, bp))) goto bp_err; offset = xlog_align(log, last_blk, 1, bp); - last_half_cycle = GET_CYCLE(offset, ARCH_CONVERT); + last_half_cycle = xlog_get_cycle(offset); ASSERT(last_half_cycle != 0); /* @@ -808,7 +809,7 @@ xlog_find_tail( if ((error = xlog_bread(log, 0, 1, bp))) goto bread_err; offset = xlog_align(log, 0, 1, bp); - if (GET_CYCLE(offset, ARCH_CONVERT) == 0) { + if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ goto exit; @@ -823,8 +824,7 @@ xlog_find_tail( if ((error = xlog_bread(log, i, 1, bp))) goto bread_err; offset = xlog_align(log, i, 1, bp); - if (XLOG_HEADER_MAGIC_NUM == - INT_GET(*(uint *)offset, ARCH_CONVERT)) { + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 1; break; } @@ -841,7 +841,7 @@ xlog_find_tail( goto bread_err; offset = xlog_align(log, i, 1, bp); if (XLOG_HEADER_MAGIC_NUM == - INT_GET(*(uint*)offset, ARCH_CONVERT)) { + be32_to_cpu(*(__be32 *)offset)) { found = 2; break; } @@ -855,7 +855,7 @@ xlog_find_tail( /* find blk_no of tail of log */ rhead = (xlog_rec_header_t *)offset; - *tail_blk = BLOCK_LSN(INT_GET(rhead->h_tail_lsn, ARCH_CONVERT)); + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); /* * Reset log values according to the state of the log when we @@ -869,11 +869,11 @@ xlog_find_tail( */ log->l_prev_block = i; log->l_curr_block = (int)*head_blk; - log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT); + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT); - log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT); + log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); + log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); log->l_grant_reserve_cycle = log->l_curr_cycle; log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); log->l_grant_write_cycle = log->l_curr_cycle; @@ -890,9 +890,9 @@ xlog_find_tail( * unmount record if there is one, so we pass the lsn of the * unmount record rather than the block after it. */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { - int h_size = INT_GET(rhead->h_size, ARCH_CONVERT); - int h_version = INT_GET(rhead->h_version, ARCH_CONVERT); + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); if ((h_version & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { @@ -906,10 +906,10 @@ xlog_find_tail( hblks = 1; } after_umount_blk = (i + hblks + (int) - BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize; + BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; tail_lsn = log->l_tail_lsn; if (*head_blk == after_umount_blk && - INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) { + be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { goto bread_err; @@ -922,11 +922,21 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. */ - ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, log->l_curr_cycle, - after_umount_blk); - ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle, - after_umount_blk); + log->l_tail_lsn = + xlog_assign_lsn(log->l_curr_cycle, + after_umount_blk); + log->l_last_sync_lsn = + xlog_assign_lsn(log->l_curr_cycle, + after_umount_blk); *tail_blk = after_umount_blk; + + /* + * Note that the unmount was clean. If the unmount + * was not clean, we need to know this to rebuild the + * superblock counters from the perag headers if we + * have a filesystem using non-persistent counters. + */ + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; } } @@ -978,7 +988,7 @@ exit: * -1 => use *blk_no as the first block of the log * >0 => error has occurred */ -int +STATIC int xlog_find_zeroed( xlog_t *log, xfs_daddr_t *blk_no) @@ -999,7 +1009,7 @@ xlog_find_zeroed( if ((error = xlog_bread(log, 0, 1, bp))) goto bp_err; offset = xlog_align(log, 0, 1, bp); - first_cycle = GET_CYCLE(offset, ARCH_CONVERT); + first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; xlog_put_bp(bp); @@ -1010,7 +1020,7 @@ xlog_find_zeroed( if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) goto bp_err; offset = xlog_align(log, log_bbnum-1, 1, bp); - last_cycle = GET_CYCLE(offset, ARCH_CONVERT); + last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); return 0; @@ -1090,13 +1100,13 @@ xlog_add_record( xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; memset(buf, 0, BBSIZE); - INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM); - INT_SET(recp->h_cycle, ARCH_CONVERT, cycle); - INT_SET(recp->h_version, ARCH_CONVERT, - XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1); - ASSIGN_ANY_LSN_DISK(recp->h_lsn, cycle, block); - ASSIGN_ANY_LSN_DISK(recp->h_tail_lsn, tail_cycle, tail_block); - INT_SET(recp->h_fmt, ARCH_CONVERT, XLOG_FMT); + recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + recp->h_cycle = cpu_to_be32(cycle); + recp->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); + recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); + recp->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); } @@ -1152,10 +1162,14 @@ xlog_write_log_records( if (j == 0 && (start_block + endcount > ealign)) { offset = XFS_BUF_PTR(bp); balign = BBTOB(ealign - start_block); - XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); - if ((error = xlog_bread(log, ealign, sectbb, bp))) + error = XFS_BUF_SET_PTR(bp, offset + balign, + BBTOB(sectbb)); + if (!error) + error = xlog_bread(log, ealign, sectbb, bp); + if (!error) + error = XFS_BUF_SET_PTR(bp, offset, bufblks); + if (error) break; - XFS_BUF_SET_PTR(bp, offset, bufblks); } offset = xlog_align(log, start_block, endcount, bp); @@ -1358,7 +1372,7 @@ xlog_recover_add_to_cont_trans( int old_len; item = trans->r_itemq; - if (item == 0) { + if (item == NULL) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); ptr = (xfs_caddr_t) &trans->r_theader + @@ -1404,8 +1418,14 @@ xlog_recover_add_to_trans( if (!len) return 0; item = trans->r_itemq; - if (item == 0) { - ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC); + if (item == NULL) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xlog_warn("XFS: xlog_recover_add_to_trans: " + "bad header magic number"); + ASSERT(0); + return XFS_ERROR(EIO); + } if (len == sizeof(xfs_trans_header_t)) xlog_recover_add_item(&trans->r_itemq); memcpy(&trans->r_theader, dp, len); /* d, s, l */ @@ -1459,12 +1479,12 @@ xlog_recover_unlink_tid( xlog_recover_t *tp; int found = 0; - ASSERT(trans != 0); + ASSERT(trans != NULL); if (trans == *q) { *q = (*q)->r_next; } else { tp = *q; - while (tp != 0) { + while (tp) { if (tp->r_next == trans) { found = 1; break; @@ -1487,7 +1507,7 @@ xlog_recover_insert_item_backq( xlog_recover_item_t **q, xlog_recover_item_t *item) { - if (*q == 0) { + if (*q == NULL) { item->ri_prev = item->ri_next = item; *q = item; } else { @@ -1509,12 +1529,10 @@ xlog_recover_insert_item_frontq( STATIC int xlog_recover_reorder_trans( - xlog_t *log, xlog_recover_t *trans) { xlog_recover_item_t *first_item, *itemq, *itemq_next; xfs_buf_log_format_t *buf_f; - xfs_buf_log_format_v1_t *obuf_f; ushort flags = 0; first_item = itemq = trans->r_itemq; @@ -1522,29 +1540,16 @@ xlog_recover_reorder_trans( do { itemq_next = itemq->ri_next; buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; - switch (ITEM_TYPE(itemq)) { - case XFS_LI_BUF: - flags = buf_f->blf_flags; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - flags = obuf_f->blf_flags; - break; - } switch (ITEM_TYPE(itemq)) { case XFS_LI_BUF: - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: + flags = buf_f->blf_flags; if (!(flags & XFS_BLI_CANCEL)) { xlog_recover_insert_item_frontq(&trans->r_itemq, itemq); break; } case XFS_LI_INODE: - case XFS_LI_6_1_INODE: - case XFS_LI_5_3_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: case XFS_LI_EFD: @@ -1583,7 +1588,6 @@ xlog_recover_do_buffer_pass1( xfs_buf_cancel_t *nextp; xfs_buf_cancel_t *prevp; xfs_buf_cancel_t **bucket; - xfs_buf_log_format_v1_t *obuf_f; xfs_daddr_t blkno = 0; uint len = 0; ushort flags = 0; @@ -1594,13 +1598,6 @@ xlog_recover_do_buffer_pass1( len = buf_f->blf_len; flags = buf_f->blf_flags; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = (xfs_daddr_t) obuf_f->blf_blkno; - len = obuf_f->blf_len; - flags = obuf_f->blf_flags; - break; } /* @@ -1724,8 +1721,7 @@ xlog_check_buffer_cancelled( } else { prevp->bc_next = bcp->bc_next; } - kmem_free(bcp, - sizeof(xfs_buf_cancel_t)); + kmem_free(bcp); } } return 1; @@ -1746,7 +1742,6 @@ xlog_recover_do_buffer_pass2( xlog_t *log, xfs_buf_log_format_t *buf_f) { - xfs_buf_log_format_v1_t *obuf_f; xfs_daddr_t blkno = 0; ushort flags = 0; uint len = 0; @@ -1757,13 +1752,6 @@ xlog_recover_do_buffer_pass2( flags = buf_f->blf_flags; len = buf_f->blf_len; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = (xfs_daddr_t) obuf_f->blf_blkno; - flags = obuf_f->blf_flags; - len = (xfs_daddr_t) obuf_f->blf_len; - break; } return xlog_check_buffer_cancelled(log, blkno, len, flags); @@ -1799,7 +1787,6 @@ xlog_recover_do_inode_buffer( int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - xfs_buf_log_format_v1_t *obuf_f; unsigned int *data_map = NULL; unsigned int map_size = 0; @@ -1808,12 +1795,6 @@ xlog_recover_do_inode_buffer( data_map = buf_f->blf_data_map; map_size = buf_f->blf_map_size; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - data_map = obuf_f->blf_data_map; - map_size = obuf_f->blf_map_size; - break; } /* * Set the variables corresponding to the current region to @@ -1904,7 +1885,6 @@ xlog_recover_do_inode_buffer( /*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( - xfs_mount_t *mp, xlog_recover_item_t *item, xfs_buf_t *bp, xfs_buf_log_format_t *buf_f) @@ -1912,7 +1892,6 @@ xlog_recover_do_reg_buffer( int i; int bit; int nbits; - xfs_buf_log_format_v1_t *obuf_f; unsigned int *data_map = NULL; unsigned int map_size = 0; int error; @@ -1922,12 +1901,6 @@ xlog_recover_do_reg_buffer( data_map = buf_f->blf_data_map; map_size = buf_f->blf_map_size; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - data_map = obuf_f->blf_data_map; - map_size = obuf_f->blf_map_size; - break; } bit = 0; i = 1; /* 0 is the buf format structure */ @@ -1937,7 +1910,7 @@ xlog_recover_do_reg_buffer( break; nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != 0); + ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); ASSERT(XFS_BUF_COUNT(bp) >= ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); @@ -2127,7 +2100,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return; - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(item, bp, buf_f); } /* @@ -2160,7 +2133,6 @@ xlog_recover_do_buffer_trans( int pass) { xfs_buf_log_format_t *buf_f; - xfs_buf_log_format_v1_t *obuf_f; xfs_mount_t *mp; xfs_buf_t *bp; int error; @@ -2197,13 +2169,6 @@ xlog_recover_do_buffer_trans( len = buf_f->blf_len; flags = buf_f->blf_flags; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = obuf_f->blf_blkno; - len = obuf_f->blf_len; - flags = obuf_f->blf_flags; - break; default: xfs_fs_cmn_err(CE_ALERT, log->l_mp, "xfs_log_recover: unknown buffer type 0x%x, logdev %s", @@ -2236,7 +2201,7 @@ xlog_recover_do_buffer_trans( (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(item, bp, buf_f); } if (error) return XFS_ERROR(error); @@ -2257,7 +2222,7 @@ xlog_recover_do_buffer_trans( * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == - INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) && + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { XFS_BUF_STALE(bp); @@ -2291,7 +2256,7 @@ xlog_recover_do_inode_trans( int error; int attr_index; uint fields; - xfs_dinode_core_t *dicp; + xfs_icdinode_t *dicp; int need_free = 0; if (pass == XLOG_RECOVER_PASS1) { @@ -2326,7 +2291,9 @@ xlog_recover_do_inode_trans( * invalidate the buffer when we write it out below. */ imap.im_blkno = 0; - xfs_imap(log->l_mp, NULL, ino, &imap, 0); + error = xfs_imap(log->l_mp, NULL, ino, &imap, 0); + if (error) + goto error; } /* @@ -2355,7 +2322,7 @@ xlog_recover_do_inode_trans( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC)) { + if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) { xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", @@ -2365,7 +2332,7 @@ xlog_recover_do_inode_trans( error = EFSCORRUPTED; goto error; } - dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr); + dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, @@ -2378,15 +2345,13 @@ xlog_recover_do_inode_trans( } /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < - INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)) { + if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers */ - if ((INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT) - == DI_MAX_FLUSH) && - (dicp->di_flushiter < (DI_MAX_FLUSH>>1))) { + if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH && + dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { xfs_buf_relse(bp); @@ -2457,8 +2422,8 @@ xlog_recover_do_inode_trans( } /* The core is in in-core format */ - xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, - (xfs_dinode_core_t*)item->ri_buf[1].i_addr, -1); + xfs_dinode_to_disk(&dip->di_core, + (xfs_icdinode_t *)item->ri_buf[1].i_addr); /* the rest is in on-disk format */ if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { @@ -2470,8 +2435,7 @@ xlog_recover_do_inode_trans( fields = in_f->ilf_fields; switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { case XFS_ILOG_DEV: - INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev); - + dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev); break; case XFS_ILOG_UUID: dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; @@ -2560,7 +2524,7 @@ write_inode_buffer: error: if (need_free) - kmem_free(in_f, sizeof(*in_f)); + kmem_free(in_f); return XFS_ERROR(error); } @@ -2630,8 +2594,7 @@ xlog_recover_do_dquot_trans( /* * This type of quotas was turned off, so ignore this record. */ - type = INT_GET(recddq->d_flags, ARCH_CONVERT) & - (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); ASSERT(type); if (log->l_quotaoffs_flag & type) return (0); @@ -2709,7 +2672,6 @@ xlog_recover_do_efi_trans( xfs_mount_t *mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; - SPLDECL(s); if (pass == XLOG_RECOVER_PASS1) { return 0; @@ -2727,11 +2689,11 @@ xlog_recover_do_efi_trans( efip->efi_next_extent = efi_formatp->efi_nextents; efip->efi_flags |= XFS_EFI_COMMITTED; - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); /* * xfs_trans_update_ail() drops the AIL lock. */ - xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s); + xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn); return 0; } @@ -2756,7 +2718,6 @@ xlog_recover_do_efd_trans( xfs_log_item_t *lip; int gen; __uint64_t efi_id; - SPLDECL(s); if (pass == XLOG_RECOVER_PASS1) { return; @@ -2774,7 +2735,7 @@ xlog_recover_do_efd_trans( * in the AIL. */ mp = log->l_mp; - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); lip = xfs_trans_first_ail(mp, &gen); while (lip != NULL) { if (lip->li_type == XFS_LI_EFI) { @@ -2784,22 +2745,14 @@ xlog_recover_do_efd_trans( * xfs_trans_delete_ail() drops the * AIL lock. */ - xfs_trans_delete_ail(mp, lip, s); - break; + xfs_trans_delete_ail(mp, lip); + xfs_efi_item_free(efip); + return; } } lip = xfs_trans_next_ail(mp, lip, &gen, NULL); } - - /* - * If we found it, then free it up. If it wasn't there, it - * must have been overwritten in the log. Oh well. - */ - if (lip != NULL) { - xfs_efi_item_free(efip); - } else { - AIL_UNLOCK(mp, s); - } + spin_unlock(&mp->m_ail_lock); } /* @@ -2817,7 +2770,7 @@ xlog_recover_do_trans( int error = 0; xlog_recover_item_t *item, *first_item; - if ((error = xlog_recover_reorder_trans(log, trans))) + if ((error = xlog_recover_reorder_trans(trans))) return error; first_item = item = trans->r_itemq; do { @@ -2830,9 +2783,7 @@ xlog_recover_do_trans( * where xfs_daddr_t is 32-bits but mount will warn us * off a > 1 TB filesystem before we get here. */ - if ((ITEM_TYPE(item) == XFS_LI_BUF) || - (ITEM_TYPE(item) == XFS_LI_6_1_BUF) || - (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) { + if ((ITEM_TYPE(item) == XFS_LI_BUF)) { if ((error = xlog_recover_do_buffer_trans(log, item, pass))) break; @@ -2884,16 +2835,14 @@ xlog_recover_free_trans( item = item->ri_next; /* Free the regions in the item. */ for (i = 0; i < free_item->ri_cnt; i++) { - kmem_free(free_item->ri_buf[i].i_addr, - free_item->ri_buf[i].i_len); + kmem_free(free_item->ri_buf[i].i_addr); } /* Free the item itself */ - kmem_free(free_item->ri_buf, - (free_item->ri_total * sizeof(xfs_log_iovec_t))); - kmem_free(free_item, sizeof(xlog_recover_item_t)); + kmem_free(free_item->ri_buf); + kmem_free(free_item); } while (first_item != item); /* Free the transaction recover structure */ - kmem_free(trans, sizeof(xlog_recover_t)); + kmem_free(trans); } STATIC int @@ -2948,8 +2897,8 @@ xlog_recover_process_data( unsigned long hash; uint flags; - lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT); - num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT); + lp = dp + be32_to_cpu(rhead->h_len); + num_logops = be32_to_cpu(rhead->h_num_logops); /* check the log format matches our own - else we can't recover */ if (xlog_header_check_recover(log->l_mp, rhead)) @@ -2966,15 +2915,20 @@ xlog_recover_process_data( ASSERT(0); return (XFS_ERROR(EIO)); } - tid = INT_GET(ohead->oh_tid, ARCH_CONVERT); + tid = be32_to_cpu(ohead->oh_tid); hash = XLOG_RHASH(tid); trans = xlog_recover_find_tid(rhash[hash], tid); if (trans == NULL) { /* not found; add new tid */ if (ohead->oh_flags & XLOG_START_TRANS) xlog_recover_new_tid(&rhash[hash], tid, - INT_GET(rhead->h_lsn, ARCH_CONVERT)); + be64_to_cpu(rhead->h_lsn)); } else { - ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp); + if (dp + be32_to_cpu(ohead->oh_len) > lp) { + xlog_warn( + "XFS: xlog_recover_process_data: bad length"); + WARN_ON(1); + return (XFS_ERROR(EIO)); + } flags = ohead->oh_flags & ~XLOG_END_TRANS; if (flags & XLOG_WAS_CONT_TRANS) flags &= ~XLOG_CONTINUE_TRANS; @@ -2988,8 +2942,7 @@ xlog_recover_process_data( break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(trans, - dp, INT_GET(ohead->oh_len, - ARCH_CONVERT)); + dp, be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: xlog_warn( @@ -3000,8 +2953,7 @@ xlog_recover_process_data( case 0: case XLOG_CONTINUE_TRANS: error = xlog_recover_add_to_trans(trans, - dp, INT_GET(ohead->oh_len, - ARCH_CONVERT)); + dp, be32_to_cpu(ohead->oh_len)); break; default: xlog_warn( @@ -3013,7 +2965,7 @@ xlog_recover_process_data( if (error) return error; } - dp += INT_GET(ohead->oh_len, ARCH_CONVERT); + dp += be32_to_cpu(ohead->oh_len); num_logops--; } return 0; @@ -3023,7 +2975,7 @@ xlog_recover_process_data( * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. */ -STATIC void +STATIC int xlog_recover_process_efi( xfs_mount_t *mp, xfs_efi_log_item_t *efip) @@ -3031,6 +2983,7 @@ xlog_recover_process_efi( xfs_efd_log_item_t *efdp; xfs_trans_t *tp; int i; + int error = 0; xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; @@ -3054,23 +3007,32 @@ xlog_recover_process_efi( * free the memory associated with it. */ xfs_efi_release(efip, efip->efi_format.efi_nextents); - return; + return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); - xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + if (error) + goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); - xfs_free_extent(tp, extp->ext_start, extp->ext_len); + error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + if (error) + goto abort_error; xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, extp->ext_len); } efip->efi_flags |= XFS_EFI_RECOVERED; - xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); + return error; + +abort_error: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; } /* @@ -3118,7 +3080,7 @@ xlog_recover_check_ail( * everything already in the AIL, we stop processing as soon as * we see something other than an EFI in the AIL. */ -STATIC void +STATIC int xlog_recover_process_efis( xlog_t *log) { @@ -3126,10 +3088,10 @@ xlog_recover_process_efis( xfs_efi_log_item_t *efip; int gen; xfs_mount_t *mp; - SPLDECL(s); + int error = 0; mp = log->l_mp; - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); lip = xfs_trans_first_ail(mp, &gen); while (lip != NULL) { @@ -3150,12 +3112,15 @@ xlog_recover_process_efis( continue; } - AIL_UNLOCK(mp, s); - xlog_recover_process_efi(mp, efip); - AIL_LOCK(mp,s); + spin_unlock(&mp->m_ail_lock); + error = xlog_recover_process_efi(mp, efip); + if (error) + return error; + spin_lock(&mp->m_ail_lock); lip = xfs_trans_next_ail(mp, lip, &gen, NULL); } - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); + return error; } /* @@ -3175,21 +3140,18 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); - - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); + if (!error) + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agibp); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); - return; - } + if (error) + goto out_abort; + error = EINVAL; agi = XFS_BUF_TO_AGI(agibp); - if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); - return; - } + if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) + goto out_abort; agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + @@ -3197,7 +3159,17 @@ xlog_recover_clear_agi_bucket( xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - (void) xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); + if (error) + goto out_error; + return; + +out_abort: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); +out_error: + xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " + "failed to clear agi %d. Continuing.", agno); + return; } /* @@ -3274,7 +3246,8 @@ xlog_recover_process_iunlinks( * next inode in the bucket. */ error = xfs_itobp(mp, NULL, ip, &dip, - &ibp, 0, 0); + &ibp, 0, 0, + XFS_BUF_LOCK); ASSERT(error || (dip != NULL)); } @@ -3282,8 +3255,8 @@ xlog_recover_process_iunlinks( ASSERT(ip->i_d.di_nlink == 0); /* setup for the next pass */ - agino = INT_GET(dip->di_next_unlinked, - ARCH_CONVERT); + agino = be32_to_cpu( + dip->di_next_unlinked); xfs_buf_relse(ibp); /* * Prevent any DMAPI event from @@ -3307,7 +3280,7 @@ xlog_recover_process_iunlinks( if (ip->i_d.di_mode == 0) xfs_iput_new(ip, 0); else - VN_RELE(XFS_ITOV(ip)); + IRELE(ip); } else { /* * We can't read in the inode @@ -3366,16 +3339,16 @@ xlog_pack_data_checksum( int size) { int i; - uint *up; + __be32 *up; uint chksum = 0; - up = (uint *)iclog->ic_datap; + up = (__be32 *)iclog->ic_datap; /* divide length by 4 to get # words */ for (i = 0; i < (size >> 2); i++) { - chksum ^= INT_GET(*up, ARCH_CONVERT); + chksum ^= be32_to_cpu(*up); up++; } - INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum); + iclog->ic_header.h_chksum = cpu_to_be32(chksum); } #else #define xlog_pack_data_checksum(log, iclog, size) @@ -3392,7 +3365,7 @@ xlog_pack_data( { int i, j, k; int size = iclog->ic_offset + roundoff; - uint cycle_lsn; + __be32 cycle_lsn; xfs_caddr_t dp; xlog_in_core_2_t *xhdr; @@ -3403,18 +3376,18 @@ xlog_pack_data( dp = iclog->ic_datap; for (i = 0; i < BTOBB(size) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(uint *)dp; - *(uint *)dp = cycle_lsn; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; dp += BBSIZE; } - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { xhdr = (xlog_in_core_2_t *)&iclog->ic_header; for ( ; i < BTOBB(size); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp; - *(uint *)dp = cycle_lsn; + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; dp += BBSIZE; } @@ -3431,24 +3404,24 @@ xlog_unpack_data_checksum( xfs_caddr_t dp, xlog_t *log) { - uint *up = (uint *)dp; + __be32 *up = (__be32 *)dp; uint chksum = 0; int i; /* divide length by 4 to get # words */ - for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) { - chksum ^= INT_GET(*up, ARCH_CONVERT); + for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { + chksum ^= be32_to_cpu(*up); up++; } - if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) { + if (chksum != be32_to_cpu(rhead->h_chksum)) { if (rhead->h_chksum || ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { cmn_err(CE_DEBUG, "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", - INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum); + be32_to_cpu(rhead->h_chksum), chksum); cmn_err(CE_DEBUG, "XFS: Disregard message if filesystem was created with non-DEBUG kernel"); - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { cmn_err(CE_DEBUG, "XFS: LogR this is a LogV2 filesystem\n"); } @@ -3469,18 +3442,18 @@ xlog_unpack_data( int i, j, k; xlog_in_core_2_t *xhdr; - for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) && + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(uint *)dp = *(uint *)&rhead->h_cycle_data[i]; + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; dp += BBSIZE; } - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) { + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; dp += BBSIZE; } } @@ -3496,24 +3469,21 @@ xlog_valid_rec_header( { int hlen; - if (unlikely( - (INT_GET(rhead->h_magicno, ARCH_CONVERT) != - XLOG_HEADER_MAGIC_NUM))) { + if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { XFS_ERROR_REPORT("xlog_valid_rec_header(1)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); } if (unlikely( (!rhead->h_version || - (INT_GET(rhead->h_version, ARCH_CONVERT) & - (~XLOG_VERSION_OKBITS)) != 0))) { + (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { xlog_warn("XFS: %s: unrecognised log version (%d).", - __FUNCTION__, INT_GET(rhead->h_version, ARCH_CONVERT)); + __func__, be32_to_cpu(rhead->h_version)); return XFS_ERROR(EIO); } /* LR body must have data or it wouldn't have been written */ - hlen = INT_GET(rhead->h_len, ARCH_CONVERT); + hlen = be32_to_cpu(rhead->h_len); if (unlikely( hlen <= 0 || hlen > INT_MAX )) { XFS_ERROR_REPORT("xlog_valid_rec_header(2)", XFS_ERRLEVEL_LOW, log->l_mp); @@ -3557,7 +3527,7 @@ xlog_do_recovery_pass( * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { /* * When using variable length iclogs, read first sector of * iclog header and extract the header size from it. Get a @@ -3573,9 +3543,8 @@ xlog_do_recovery_pass( error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) goto bread_err1; - h_size = INT_GET(rhead->h_size, ARCH_CONVERT); - if ((INT_GET(rhead->h_version, ARCH_CONVERT) - & XLOG_VERSION_2) && + h_size = be32_to_cpu(rhead->h_size); + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { hblks = h_size / XLOG_HEADER_CYCLE_SIZE; if (h_size % XLOG_HEADER_CYCLE_SIZE) @@ -3612,7 +3581,7 @@ xlog_do_recovery_pass( goto bread_err2; /* blocks in data section */ - bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); error = xlog_bread(log, blk_no + hblks, bblks, dbp); if (error) goto bread_err2; @@ -3668,15 +3637,19 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ + wrapped_hblks = hblks - split_hblks; bufaddr = XFS_BUF_PTR(hbp); - XFS_BUF_SET_PTR(hbp, + error = XFS_BUF_SET_PTR(hbp, bufaddr + BBTOB(split_hblks), BBTOB(hblks - split_hblks)); - wrapped_hblks = hblks - split_hblks; - error = xlog_bread(log, 0, wrapped_hblks, hbp); + if (!error) + error = xlog_bread(log, 0, + wrapped_hblks, hbp); + if (!error) + error = XFS_BUF_SET_PTR(hbp, bufaddr, + BBTOB(hblks)); if (error) goto bread_err2; - XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks)); if (!offset) offset = xlog_align(log, 0, wrapped_hblks, hbp); @@ -3687,7 +3660,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; /* Read in data for log record */ @@ -3728,13 +3701,18 @@ xlog_do_recovery_pass( * - order is important. */ bufaddr = XFS_BUF_PTR(dbp); - XFS_BUF_SET_PTR(dbp, + error = XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks), BBTOB(bblks - split_bblks)); - if ((error = xlog_bread(log, wrapped_hblks, - bblks - split_bblks, dbp))) + if (!error) + error = xlog_bread(log, wrapped_hblks, + bblks - split_bblks, + dbp); + if (!error) + error = XFS_BUF_SET_PTR(dbp, bufaddr, + h_size); + if (error) goto bread_err2; - XFS_BUF_SET_PTR(dbp, bufaddr, h_size); if (!offset) offset = xlog_align(log, wrapped_hblks, bblks - split_bblks, dbp); @@ -3758,7 +3736,7 @@ xlog_do_recovery_pass( error = xlog_valid_rec_header(log, rhead, blk_no); if (error) goto bread_err2; - bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) goto bread_err2; offset = xlog_align(log, blk_no+hblks, bblks, dbp); @@ -3811,8 +3789,7 @@ xlog_do_log_recovery( error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { - kmem_free(log->l_buf_cancel_table, - XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; return error; } @@ -3831,8 +3808,7 @@ xlog_do_log_recovery( } #endif /* DEBUG */ - kmem_free(log->l_buf_cancel_table, - XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; return error; @@ -3885,9 +3861,13 @@ xlog_do_recover( */ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); + ASSERT(!(XFS_BUF_ISWRITE(bp))); + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); XFS_BUF_READ(bp); + XFS_BUF_UNASYNC(bp); xfsbdstrat(log->l_mp, bp); - if ((error = xfs_iowait(bp))) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xlog_do_recover", log->l_mp, bp, XFS_BUF_ADDR(bp)); ASSERT(0); @@ -3897,11 +3877,14 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, XFS_SB_ALL_BITS); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); - ASSERT(XFS_SB_GOOD_VERSION(sbp)); + ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); + /* We've re-read the superblock so re-initialize per-cpu counters */ + xfs_icsb_reinit_counters(log->l_mp); + xlog_recover_check_summary(log); /* Normal transactions can now occur */ @@ -3937,8 +3920,7 @@ xlog_recover( * under the vfs layer, so we can get away with it unless * the device itself is read-only, in which case we fail. */ - if ((error = xfs_dev_is_read_only(log->l_mp, - "recovery required"))) { + if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { return error; } @@ -3964,8 +3946,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log, - int mfsi_flags) + xlog_t *log) { /* * Now we're ready to do the transactions needed for the @@ -3976,7 +3957,14 @@ xlog_recover_finish( * rather than accepting new requests. */ if (log->l_flags & XLOG_RECOVERY_NEEDED) { - xlog_recover_process_efis(log); + int error; + error = xlog_recover_process_efis(log); + if (error) { + cmn_err(CE_ALERT, + "Failed to recover EFIs on filesystem: %s", + log->l_mp->m_fsname); + return error; + } /* * Sync the log to get all the EFIs out of the AIL. * This isn't absolutely necessary, but it helps in @@ -3986,9 +3974,7 @@ xlog_recover_finish( xfs_log_force(log->l_mp, (xfs_lsn_t)0, (XFS_LOG_FORCE | XFS_LOG_SYNC)); - if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) { - xlog_recover_process_iunlinks(log); - } + xlog_recover_process_iunlinks(log); xlog_recover_check_summary(log); @@ -4073,7 +4059,7 @@ xlog_recover_check_summary( sbbp = xfs_getsb(mp, 0); #ifdef XFS_LOUD_RECOVERY sbp = &mp->m_sb; - xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, XFS_SB_ALL_BITS); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); cmn_err(CE_NOTE, "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", sbp->sb_icount, itotal); diff --git a/fs/xfs/xfs_mac.h b/fs/xfs/xfs_mac.h deleted file mode 100644 index 18e0e98..0000000 --- a/fs/xfs/xfs_mac.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_MAC_H__ -#define __XFS_MAC_H__ - -/* - * Mandatory Access Control - * - * Layout of a composite MAC label: - * ml_list contains the list of categories (MSEN) followed by the list of - * divisions (MINT). This is actually a header for the data structure which - * will have an ml_list with more than one element. - * - * ------------------------------- - * | ml_msen_type | ml_mint_type | - * ------------------------------- - * | ml_level | ml_grade | - * ------------------------------- - * | ml_catcount | - * ------------------------------- - * | ml_divcount | - * ------------------------------- - * | category 1 | - * | . . . | - * | category N | (where N = ml_catcount) - * ------------------------------- - * | division 1 | - * | . . . | - * | division M | (where M = ml_divcount) - * ------------------------------- - */ -#define XFS_MAC_MAX_SETS 250 -typedef struct xfs_mac_label { - __uint8_t ml_msen_type; /* MSEN label type */ - __uint8_t ml_mint_type; /* MINT label type */ - __uint8_t ml_level; /* Hierarchical level */ - __uint8_t ml_grade; /* Hierarchical grade */ - __uint16_t ml_catcount; /* Category count */ - __uint16_t ml_divcount; /* Division count */ - /* Category set, then Division set */ - __uint16_t ml_list[XFS_MAC_MAX_SETS]; -} xfs_mac_label_t; - -/* MSEN label type names. Choose an upper case ASCII character. */ -#define XFS_MSEN_ADMIN_LABEL 'A' /* Admin: low<admin != tcsec<high */ -#define XFS_MSEN_EQUAL_LABEL 'E' /* Wildcard - always equal */ -#define XFS_MSEN_HIGH_LABEL 'H' /* System High - always dominates */ -#define XFS_MSEN_MLD_HIGH_LABEL 'I' /* System High, multi-level dir */ -#define XFS_MSEN_LOW_LABEL 'L' /* System Low - always dominated */ -#define XFS_MSEN_MLD_LABEL 'M' /* TCSEC label on a multi-level dir */ -#define XFS_MSEN_MLD_LOW_LABEL 'N' /* System Low, multi-level dir */ -#define XFS_MSEN_TCSEC_LABEL 'T' /* TCSEC label */ -#define XFS_MSEN_UNKNOWN_LABEL 'U' /* unknown label */ - -/* MINT label type names. Choose a lower case ASCII character. */ -#define XFS_MINT_BIBA_LABEL 'b' /* Dual of a TCSEC label */ -#define XFS_MINT_EQUAL_LABEL 'e' /* Wildcard - always equal */ -#define XFS_MINT_HIGH_LABEL 'h' /* High Grade - always dominates */ -#define XFS_MINT_LOW_LABEL 'l' /* Low Grade - always dominated */ - -/* On-disk XFS extended attribute names */ -#define SGI_MAC_FILE "SGI_MAC_FILE" -#define SGI_MAC_FILE_SIZE (sizeof(SGI_MAC_FILE)-1) - - -#ifdef __KERNEL__ - -#ifdef CONFIG_FS_POSIX_MAC - -/* NOT YET IMPLEMENTED */ - -#define MACEXEC 00100 -#define MACWRITE 00200 -#define MACREAD 00400 - -struct xfs_inode; -extern int xfs_mac_iaccess(struct xfs_inode *, mode_t, cred_t *); - -#define _MAC_XFS_IACCESS(i,m,c) (xfs_mac_iaccess(i,m,c)) -#define _MAC_VACCESS(v,c,m) (xfs_mac_vaccess(v,c,m)) -#define _MAC_EXISTS xfs_mac_vhaslabel - -#else -#define _MAC_XFS_IACCESS(i,m,c) (0) -#define _MAC_VACCESS(v,c,m) (0) -#define _MAC_EXISTS (NULL) -#endif - -#endif /* __KERNEL__ */ - -#endif /* __XFS_MAC_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9dfae18..15f5dd2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,30 +43,27 @@ #include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_fsops.h" +#include "xfs_utils.h" -STATIC void xfs_mount_log_sbunit(xfs_mount_t *, __int64_t); +STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); STATIC int xfs_uuid_mount(xfs_mount_t *); -STATIC void xfs_uuid_unmount(xfs_mount_t *mp); STATIC void xfs_unmountfs_wait(xfs_mount_t *); #ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_destroy_counters(xfs_mount_t *); -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); -STATIC void xfs_icsb_sync_counters(xfs_mount_t *); +STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, + int); +STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, + int); STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, - int, int); -STATIC int xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t, - int, int); -STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); + int64_t, int); +STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #else -#define xfs_icsb_destroy_counters(mp) do { } while (0) #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) -#define xfs_icsb_sync_counters(mp) do { } while (0) +#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) -#define xfs_icsb_modify_counters_locked(mp, a, b, c) do { } while (0) #endif @@ -121,90 +118,51 @@ static const struct { { offsetof(xfs_sb_t, sb_logsectsize),0 }, { offsetof(xfs_sb_t, sb_logsunit), 0 }, { offsetof(xfs_sb_t, sb_features2), 0 }, + { offsetof(xfs_sb_t, sb_bad_features2), 0 }, { sizeof(xfs_sb_t), 0 } }; /* - * Return a pointer to an initialized xfs_mount structure. - */ -xfs_mount_t * -xfs_mount_init(void) -{ - xfs_mount_t *mp; - - mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP); - - if (xfs_icsb_init_counters(mp)) { - mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; - } - - AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail"); - spinlock_init(&mp->m_sb_lock, "xfs_sb"); - mutex_init(&mp->m_ilock); - initnsema(&mp->m_growlock, 1, "xfs_grow"); - /* - * Initialize the AIL. - */ - xfs_trans_ail_init(mp); - - atomic_set(&mp->m_active_trans, 0); - - return mp; -} - -/* * Free up the resources associated with a mount structure. Assume that * the structure was initially zeroed, so we can tell which fields got * initialized. */ -void -xfs_mount_free( - xfs_mount_t *mp, - int remove_bhv) +STATIC void +xfs_free_perag( + xfs_mount_t *mp) { - if (mp->m_ihash) - xfs_ihash_free(mp); - if (mp->m_chash) - xfs_chash_free(mp); - if (mp->m_perag) { int agno; for (agno = 0; agno < mp->m_maxagi; agno++) if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list, - sizeof(xfs_perag_busy_t) * - XFS_PAGB_NUM_SLOTS); - kmem_free(mp->m_perag, - sizeof(xfs_perag_t) * mp->m_sb.sb_agcount); + kmem_free(mp->m_perag[agno].pagb_list); + kmem_free(mp->m_perag); } +} - AIL_LOCK_DESTROY(&mp->m_ail_lock); - spinlock_destroy(&mp->m_sb_lock); - mutex_destroy(&mp->m_ilock); - freesema(&mp->m_growlock); - if (mp->m_quotainfo) - XFS_QM_DONE(mp); - - if (mp->m_fsname != NULL) - kmem_free(mp->m_fsname, mp->m_fsname_len); - if (mp->m_rtname != NULL) - kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1); - if (mp->m_logname != NULL) - kmem_free(mp->m_logname, strlen(mp->m_logname) + 1); - - if (remove_bhv) { - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); - - bhv_remove_all_vfsops(vfsp, 0); - VFS_REMOVEBHV(vfsp, &mp->m_bhv); - } +/* + * Check size of device based on the (data/realtime) block count. + * Note: this check is used by the growfs code as well as mount. + */ +int +xfs_sb_validate_fsb_count( + xfs_sb_t *sbp, + __uint64_t nblocks) +{ + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); + ASSERT(sbp->sb_blocklog >= BBSHIFT); - xfs_icsb_destroy_counters(mp); - kmem_free(mp, sizeof(xfs_mount_t)); +#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ + if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + return E2BIG; +#else /* Limited by UINT_MAX of sectors */ + if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) + return E2BIG; +#endif + return 0; } - /* * Check the validity of the SB found. */ @@ -226,7 +184,7 @@ xfs_mount_validate_sb( return XFS_ERROR(EWRONGFS); } - if (!XFS_SB_GOOD_VERSION(sbp)) { + if (!xfs_sb_good_version(sbp)) { xfs_fs_mount_cmn_err(flags, "bad version"); return XFS_ERROR(EWRONGFS); } @@ -286,18 +244,21 @@ xfs_mount_validate_sb( return XFS_ERROR(EFSCORRUPTED); } - ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); - ASSERT(sbp->sb_blocklog >= BBSHIFT); + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + xfs_fs_mount_cmn_err(flags, + "file system with blocksize %d bytes", + sbp->sb_blocksize); + xfs_fs_mount_cmn_err(flags, + "only pagesize (%ld) or less will currently work.", + PAGE_SIZE); + return XFS_ERROR(ENOSYS); + } -#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ - if (unlikely( - (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX || - (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) { -#else /* Limited by UINT_MAX of sectors */ - if (unlikely( - (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX || - (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) { -#endif + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { xfs_fs_mount_cmn_err(flags, "file system too large to be mounted on this system."); return XFS_ERROR(E2BIG); @@ -311,31 +272,28 @@ xfs_mount_validate_sb( /* * Version 1 directory format has never worked on Linux. */ - if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) { + if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { xfs_fs_mount_cmn_err(flags, "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ - if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - xfs_fs_mount_cmn_err(flags, - "file system with blocksize %d bytes", - sbp->sb_blocksize); - xfs_fs_mount_cmn_err(flags, - "only pagesize (%ld) or less will currently work.", - PAGE_SIZE); - return XFS_ERROR(ENOSYS); - } - return 0; } +STATIC void +xfs_initialize_perag_icache( + xfs_perag_t *pag) +{ + if (!pag->pag_ici_init) { + rwlock_init(&pag->pag_ici_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + pag->pag_ici_init = 1; + } +} + xfs_agnumber_t xfs_initialize_perag( - bhv_vfs_t *vfs, xfs_mount_t *mp, xfs_agnumber_t agcount) { @@ -353,7 +311,7 @@ xfs_initialize_perag( /* Clear the mount flag if no inode can overflow 32 bits * on this filesystem, or if specifically requested.. */ - if ((vfs->vfs_flag & VFS_32BITINODES) && ino > max_inum) { + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { mp->m_flags |= XFS_MOUNT_32BITINODES; } else { mp->m_flags &= ~XFS_MOUNT_32BITINODES; @@ -387,48 +345,93 @@ xfs_initialize_perag( pag->pagi_inodeok = 1; if (index < max_metadata) pag->pagf_metadata = 1; + xfs_initialize_perag_icache(pag); } } else { /* Setup default behavior for smaller filesystems */ for (index = 0; index < agcount; index++) { pag = &mp->m_perag[index]; pag->pagi_inodeok = 1; + xfs_initialize_perag_icache(pag); } } return index; } +void +xfs_sb_from_disk( + xfs_sb_t *to, + xfs_dsb_t *from) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); +} + /* - * xfs_xlatesb + * Copy in core superblock to ondisk one. * - * data - on disk version of sb - * sb - a superblock - * dir - conversion direction: <0 - convert sb to buf - * >0 - convert buf to sb - * fields - which fields to copy (bitmask) + * The fields argument is mask of superblock fields to copy. */ void -xfs_xlatesb( - void *data, - xfs_sb_t *sb, - int dir, +xfs_sb_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, __int64_t fields) { - xfs_caddr_t buf_ptr; - xfs_caddr_t mem_ptr; + xfs_caddr_t to_ptr = (xfs_caddr_t)to; + xfs_caddr_t from_ptr = (xfs_caddr_t)from; xfs_sb_field_t f; int first; int size; - ASSERT(dir); ASSERT(fields); - if (!fields) return; - buf_ptr = (xfs_caddr_t)data; - mem_ptr = (xfs_caddr_t)sb; - while (fields) { f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); first = xfs_sb_info[f].offset; @@ -437,26 +440,20 @@ xfs_xlatesb( ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); if (size == 1 || xfs_sb_info[f].type == 1) { - if (dir > 0) { - memcpy(mem_ptr + first, buf_ptr + first, size); - } else { - memcpy(buf_ptr + first, mem_ptr + first, size); - } + memcpy(to_ptr + first, from_ptr + first, size); } else { switch (size) { case 2: - INT_XLATE(*(__uint16_t*)(buf_ptr+first), - *(__uint16_t*)(mem_ptr+first), - dir, ARCH_CONVERT); + *(__be16 *)(to_ptr + first) = + cpu_to_be16(*(__u16 *)(from_ptr + first)); break; case 4: - INT_XLATE(*(__uint32_t*)(buf_ptr+first), - *(__uint32_t*)(mem_ptr+first), - dir, ARCH_CONVERT); + *(__be32 *)(to_ptr + first) = + cpu_to_be32(*(__u32 *)(from_ptr + first)); break; case 8: - INT_XLATE(*(__uint64_t*)(buf_ptr+first), - *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT); + *(__be64 *)(to_ptr + first) = + cpu_to_be64(*(__u64 *)(from_ptr + first)); break; default: ASSERT(0); @@ -478,7 +475,6 @@ xfs_readsb(xfs_mount_t *mp, int flags) unsigned int sector_size; unsigned int extra_flags; xfs_buf_t *bp; - xfs_sb_t *sbp; int error; ASSERT(mp->m_sb_bp == NULL); @@ -506,8 +502,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) * Initialize the mount structure from the superblock. * But first do some basic consistency checking. */ - sbp = XFS_BUF_TO_SBP(bp); - xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS); + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); if (error) { @@ -545,9 +540,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); } - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + /* Initialize per-cpu counters */ + xfs_icsb_reinit_counters(mp); mp->m_sb_bp = bp; xfs_buf_relse(bp); @@ -576,7 +570,7 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) int i; mp->m_agfrotor = mp->m_agirotor = 0; - spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock"); + spin_lock_init(&mp->m_agirotor_lock); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; @@ -635,52 +629,72 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; } + /* - * xfs_mountfs + * xfs_initialize_perag_data * - * This function does the following on an initial mount of a file system: - * - reads the superblock from disk and init the mount struct - * - if we're a 32-bit kernel, do a size check on the superblock - * so we don't mount terabyte filesystems - * - init mount struct realtime fields - * - allocate inode hash table for fs - * - init directory manager - * - perform recovery and init the log manager + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. */ -int -xfs_mountfs( - bhv_vfs_t *vfsp, - xfs_mount_t *mp, - int mfsi_flags) +STATIC int +xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) { - xfs_buf_t *bp; - xfs_sb_t *sbp = &(mp->m_sb); - xfs_inode_t *rip; - bhv_vnode_t *rvp = NULL; - int readio_log, writeio_log; - xfs_daddr_t d; - __uint64_t ret64; - __int64_t update_flags; - uint quotamount, quotaflags; - int agno; - int uuid_mounted = 0; - int error = 0; + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; - if (mp->m_sb_bp == NULL) { - if ((error = xfs_readsb(mp, mfsi_flags))) { + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the inforamtion we need and populates the + * per-ag structures for us. + */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) return error; - } - } - xfs_mount_common(mp, sbp); + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = &mp->m_perag[index]; + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. + * Overwrite incore superblock counters with just-read data */ - update_flags = 0LL; - if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) { + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + /* Fixup the per-cpu counters as well. */ + xfs_icsb_reinit_counters(mp); + + return 0; +} + +/* + * Update alignment values based on mount options and sb values + */ +STATIC int +xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) +{ + xfs_sb_t *sbp = &(mp->m_sb); + + if (mp->m_dalign) { /* * If stripe unit and stripe width are not multiples * of the fs blocksize turn off alignment. @@ -690,8 +704,7 @@ xfs_mountfs( if (mp->m_flags & XFS_MOUNT_RETERR) { cmn_err(CE_WARN, "XFS: alignment check 1 failed"); - error = XFS_ERROR(EINVAL); - goto error1; + return XFS_ERROR(EINVAL); } mp->m_dalign = mp->m_swidth = 0; } else { @@ -701,8 +714,7 @@ xfs_mountfs( mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { if (mp->m_flags & XFS_MOUNT_RETERR) { - error = XFS_ERROR(EINVAL); - goto error1; + return XFS_ERROR(EINVAL); } xfs_fs_cmn_err(CE_WARN, mp, "stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", @@ -719,8 +731,7 @@ xfs_mountfs( "stripe alignment turned off: sunit(%d) less than bsize(%d)", mp->m_dalign, mp->m_blockmask +1); - error = XFS_ERROR(EINVAL); - goto error1; + return XFS_ERROR(EINVAL); } mp->m_swidth = 0; } @@ -730,67 +741,61 @@ xfs_mountfs( * Update superblock with new values * and log changes */ - if (XFS_SB_VERSION_HASDALIGN(sbp)) { + if (xfs_sb_version_hasdalign(sbp)) { if (sbp->sb_unit != mp->m_dalign) { sbp->sb_unit = mp->m_dalign; - update_flags |= XFS_SB_UNIT; + *update_flags |= XFS_SB_UNIT; } if (sbp->sb_width != mp->m_swidth) { sbp->sb_width = mp->m_swidth; - update_flags |= XFS_SB_WIDTH; + *update_flags |= XFS_SB_WIDTH; } } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && - XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) { + xfs_sb_version_hasdalign(&mp->m_sb)) { mp->m_dalign = sbp->sb_unit; mp->m_swidth = sbp->sb_width; } - xfs_alloc_compute_maxlevels(mp); - xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); - xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); - xfs_ialloc_compute_maxlevels(mp); + return 0; +} - if (sbp->sb_imax_pct) { - __uint64_t icount; +/* + * Set the maximum inode count for this filesystem + */ +STATIC void +xfs_set_maxicount(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + __uint64_t icount; - /* Make sure the maximum inode count is a multiple of the - * units we allocate inodes in. + if (sbp->sb_imax_pct) { + /* + * Make sure the maximum inode count is a multiple + * of the units we allocate inodes in. */ - icount = sbp->sb_dblocks * sbp->sb_imax_pct; do_div(icount, 100); do_div(icount, mp->m_ialloc_blks); mp->m_maxicount = (icount * mp->m_ialloc_blks) << sbp->sb_inopblog; - } else + } else { mp->m_maxicount = 0; - - mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); - - /* - * XFS uses the uuid from the superblock as the unique - * identifier for fsid. We can not use the uuid from the volume - * since a single partition filesystem is identical to a single - * partition volume/filesystem. - */ - if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && - (mp->m_flags & XFS_MOUNT_NOUUID) == 0) { - if (xfs_uuid_mount(mp)) { - error = XFS_ERROR(EINVAL); - goto error1; - } - uuid_mounted=1; - ret64 = uuid_hash64(&sbp->sb_uuid); - memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64)); } +} + +/* + * Set the default minimum read and write sizes unless + * already specified in a mount option. + * We use smaller I/O sizes when the file system + * is being used for NFS service (wsync mount option). + */ +STATIC void +xfs_set_rw_sizes(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + int readio_log, writeio_log; - /* - * Set the default minimum read and write sizes unless - * already specified in a mount option. - * We use smaller I/O sizes when the file system - * is being used for NFS service (wsync mount option). - */ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { if (mp->m_flags & XFS_MOUNT_WSYNC) { readio_log = XFS_WSYNC_READIO_LOG; @@ -804,16 +809,6 @@ xfs_mountfs( writeio_log = mp->m_writeio_log; } - /* - * Set the number of readahead buffers to use based on - * physical memory size. - */ - if (xfs_physmem <= 4096) /* <= 16MB */ - mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB; - else if (xfs_physmem <= 8192) /* <= 32MB */ - mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB; - else - mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32; if (sbp->sb_blocklog > readio_log) { mp->m_readio_log = sbp->sb_blocklog; } else { @@ -826,21 +821,15 @@ xfs_mountfs( mp->m_writeio_log = writeio_log; } mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); +} - /* - * Set the inode cluster size based on the physical memory - * size. This may still be overridden by the file system - * block size if it is larger than the chosen cluster size. - */ - if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */ - mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE; - } else { - mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; - } - /* - * Set whether we're using inode alignment. - */ - if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) && +/* + * Set whether we're using inode alignment. + */ +STATIC void +xfs_set_inoalignment(xfs_mount_t *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; @@ -855,14 +844,22 @@ xfs_mountfs( mp->m_sinoalign = mp->m_dalign; else mp->m_sinoalign = 0; - /* - * Check that the data (and log if separate) are an ok size. - */ +} + +/* + * Check that the data (and log if separate) are an ok size. + */ +STATIC int +xfs_check_sizes(xfs_mount_t *mp) +{ + xfs_buf_t *bp; + xfs_daddr_t d; + int error; + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { cmn_err(CE_WARN, "XFS: size check 1 failed"); - error = XFS_ERROR(E2BIG); - goto error1; + return XFS_ERROR(E2BIG); } error = xfs_read_buf(mp, mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), @@ -871,19 +868,16 @@ xfs_mountfs( xfs_buf_relse(bp); } else { cmn_err(CE_WARN, "XFS: size check 2 failed"); - if (error == ENOSPC) { + if (error == ENOSPC) error = XFS_ERROR(E2BIG); - } - goto error1; + return error; } - if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) && - mp->m_logdev_targp != mp->m_ddev_targp) { + if (mp->m_logdev_targp != mp->m_ddev_targp) { d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { cmn_err(CE_WARN, "XFS: size check 3 failed"); - error = XFS_ERROR(E2BIG); - goto error1; + return XFS_ERROR(E2BIG); } error = xfs_read_buf(mp, mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), @@ -892,26 +886,146 @@ xfs_mountfs( xfs_buf_relse(bp); } else { cmn_err(CE_WARN, "XFS: size check 3 failed"); - if (error == ENOSPC) { + if (error == ENOSPC) error = XFS_ERROR(E2BIG); - } - goto error1; + return error; } } + return 0; +} + +/* + * xfs_mountfs + * + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock + * so we don't mount terabyte filesystems + * - init mount struct realtime fields + * - allocate inode hash table for fs + * - init directory manager + * - perform recovery and init the log manager + */ +int +xfs_mountfs( + xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + xfs_inode_t *rip; + __uint64_t resblks; + __int64_t update_flags = 0LL; + uint quotamount, quotaflags; + int uuid_mounted = 0; + int error = 0; + + xfs_mount_common(mp, sbp); /* - * Initialize realtime fields in the mount structure + * Check for a mismatched features2 values. Older kernels + * read & wrote into the wrong sb offset for sb_features2 + * on some platforms due to xfs_sb_t not being 64bit size aligned + * when sb_features2 was added, which made older superblock + * reading/writing routines swap it as a 64-bit value. + * + * For backwards compatibility, we make both slots equal. + * + * If we detect a mismatched field, we OR the set bits into the + * existing features2 field in case it has already been modified; we + * don't want to lose any features. We then update the bad location + * with the ORed value so that older kernels will see any features2 + * flags, and mark the two fields as needing updates once the + * transaction subsystem is online. */ - if ((error = xfs_rtmount_init(mp))) { - cmn_err(CE_WARN, "XFS: RT mount failed"); + if (xfs_sb_has_mismatched_features2(sbp)) { + cmn_err(CE_WARN, + "XFS: correcting sb_features alignment problem"); + sbp->sb_features2 |= sbp->sb_bad_features2; + sbp->sb_bad_features2 = sbp->sb_features2; + update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; + + /* + * Re-check for ATTR2 in case it was found in bad_features2 + * slot. + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + } + + if (xfs_sb_version_hasattr2(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_sb_version_removeattr2(&mp->m_sb); + update_flags |= XFS_SB_FEATURES2; + + /* update sb_versionnum for the clearing of the morebits */ + if (!sbp->sb_features2) + update_flags |= XFS_SB_VERSIONNUM; + } + + /* + * Check if sb_agblocks is aligned at stripe boundary + * If sb_agblocks is NOT aligned turn off m_dalign since + * allocator alignment is within an ag, therefore ag has + * to be aligned at stripe boundary. + */ + error = xfs_update_alignment(mp, &update_flags); + if (error) goto error1; + + xfs_alloc_compute_maxlevels(mp); + xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); + xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); + xfs_ialloc_compute_maxlevels(mp); + + xfs_set_maxicount(mp); + + mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); + + /* + * XFS uses the uuid from the superblock as the unique + * identifier for fsid. We can not use the uuid from the volume + * since a single partition filesystem is identical to a single + * partition volume/filesystem. + */ + if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) { + if (xfs_uuid_mount(mp)) { + error = XFS_ERROR(EINVAL); + goto error1; + } + uuid_mounted=1; } /* - * For client case we are done now + * Set the minimum read and write sizes */ - if (mfsi_flags & XFS_MFSI_CLIENT) { - return 0; + xfs_set_rw_sizes(mp); + + /* + * Set the inode cluster size. + * This may still be overridden by the file system + * block size if it is larger than the chosen cluster size. + */ + mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + + /* + * Set inode alignment fields + */ + xfs_set_inoalignment(mp); + + /* + * Check that the data (and log if separate) are an ok size. + */ + error = xfs_check_sizes(mp); + if (error) + goto error1; + + /* + * Initialize realtime fields in the mount structure + */ + error = xfs_rtmount_init(mp); + if (error) { + cmn_err(CE_WARN, "XFS: RT mount failed"); + goto error1; } /* @@ -920,16 +1034,6 @@ xfs_mountfs( */ uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); - /* - * The vfs structure needs to have a file system independent - * way of checking for the invariant file system ID. Since it - * can't look at mount structures it has a pointer to the data - * in the mount structure. - * - * File systems that don't support user level file handles (i.e. - * all of them except for XFS) will leave vfs_altfsid as NULL. - */ - vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid; mp->m_dmevmask = 0; /* not persistent; set after each mount */ xfs_dir_mount(mp); @@ -945,20 +1049,15 @@ xfs_mountfs( xfs_trans_init(mp); /* - * Allocate and initialize the inode hash table for this - * file system. - */ - xfs_ihash_init(mp); - xfs_chash_init(mp); - - /* * Allocate and initialize the per-ag data. */ init_rwsem(&mp->m_peraglock); - mp->m_perag = - kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP); + mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), + KM_MAYFAIL); + if (!mp->m_perag) + goto error1; - mp->m_maxagi = xfs_initialize_perag(vfsp, mp, sbp->sb_agcount); + mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); /* * log's mount-time initialization. Perform 1st part recovery if needed @@ -979,6 +1078,34 @@ xfs_mountfs( } /* + * Now the log is mounted, we know if it was an unclean shutdown or + * not. If it was, with the first phase of recovery has completed, we + * have consistent AG blocks on disk. We have not recovered EFIs yet, + * but they are recovered transactionally in the second recovery phase + * later. + * + * Hence we can safely re-initialise incore superblock counters from + * the per-ag data. These may not be correct if the filesystem was not + * cleanly unmounted, so we need to wait for recovery to finish before + * doing this. + * + * If the filesystem was cleanly unmounted, then we can trust the + * values in the superblock to be correct and we don't need to do + * anything here. + * + * If we are currently making the filesystem, the initialisation will + * fail as the perag data is in an undefined state. + */ + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + !mp->m_sb.sb_inprogress) { + error = xfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) { + goto error2; + } + } + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. */ @@ -989,7 +1116,6 @@ xfs_mountfs( } ASSERT(rip != NULL); - rvp = XFS_ITOV(rip); if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { cmn_err(CE_WARN, "XFS: corrupted root inode"); @@ -1009,7 +1135,8 @@ xfs_mountfs( /* * Initialize realtime inode pointers in the mount structure */ - if ((error = xfs_rtmount_inodes(mp))) { + error = xfs_rtmount_inodes(mp); + if (error) { /* * Free up the root inode. */ @@ -1018,16 +1145,21 @@ xfs_mountfs( } /* - * If fs is not mounted readonly, then update the superblock - * unit and width changes. + * If fs is not mounted readonly, then update the superblock changes. */ - if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY)) - xfs_mount_log_sbunit(mp, update_flags); + if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_mount_log_sb(mp, update_flags); + if (error) { + cmn_err(CE_WARN, "XFS: failed to write sb changes"); + goto error4; + } + } /* * Initialise the XFS quota management subsystem for this mount */ - if ((error = XFS_QM_INIT(mp, "amount, "aflags))) + error = XFS_QM_INIT(mp, "amount, "aflags); + if (error) goto error4; /* @@ -1035,7 +1167,7 @@ xfs_mountfs( * delayed until after the root and real-time bitmap inodes * were consistently read in. */ - error = xfs_log_mount_finish(mp, mfsi_flags); + error = xfs_log_mount_finish(mp); if (error) { cmn_err(CE_WARN, "XFS: log mount finish failed"); goto error4; @@ -1044,53 +1176,78 @@ xfs_mountfs( /* * Complete the quota initialisation, post-log-replay component. */ - if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags))) + error = XFS_QM_MOUNT(mp, quotamount, quotaflags); + if (error) goto error4; + /* + * Now we are mounted, reserve a small amount of unused space for + * privileged transactions. This is needed so that transaction + * space required for critical operations can dip into this pool + * when at ENOSPC. This is needed for operations like create with + * attr, unwritten extent conversion at ENOSPC, etc. Data allocations + * are not allowed to use this reserved space. + * + * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. + */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 1024); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " + "Continuing without a reserve pool."); + return 0; error4: /* * Free up the root inode. */ - VN_RELE(rvp); + IRELE(rip); error3: xfs_log_unmount_dealloc(mp); error2: - xfs_ihash_free(mp); - xfs_chash_free(mp); - for (agno = 0; agno < sbp->sb_agcount; agno++) - if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list, - sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS); - kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t)); - mp->m_perag = NULL; - /* FALLTHROUGH */ + xfs_free_perag(mp); error1: if (uuid_mounted) - xfs_uuid_unmount(mp); - xfs_freesb(mp); + uuid_table_remove(&mp->m_sb.sb_uuid); return error; } /* - * xfs_unmountfs - * * This flushes out the inodes,dquots and the superblock, unmounts the * log and makes sure that incore structures are freed. */ -int -xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) +void +xfs_unmountfs( + struct xfs_mount *mp) { - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); -#if defined(DEBUG) || defined(INDUCE_IO_ERROR) - int64_t fsid; -#endif + __uint64_t resblks; + int error; + + IRELE(mp->m_rootip); + /* + * We can potentially deadlock here if we have an inode cluster + * that has been freed has it's buffer still pinned in memory because + * the transaction is still sitting in a iclog. The stale inodes + * on that buffer will have their flush locks held until the + * transaction hits the disk and the callbacks run. the inode + * flush takes the flush lock unconditionally and with nothing to + * push out the iclog we will never get that unlocked. hence we + * need to force the log first. + */ + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); xfs_iflush_all(mp); XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); + if (mp->m_quotainfo) + XFS_QM_DONE(mp); + /* * Flush out the log synchronously so that we know for sure * that nothing is pinned. This is important because bflush() @@ -1103,43 +1260,46 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) xfs_binval(mp->m_rtdev_targp); } + /* + * Unreserve any blocks we have so that when we unmount we don't account + * the reserved free space as used. This is really only necessary for + * lazy superblock counting because it trusts the incore superblock + * counters to be aboslutely correct on clean unmount. + * + * We don't bother correcting this elsewhere for lazy superblock + * counting because on mount of an unclean filesystem we reconstruct the + * correct counter value and this is irrelevant. + * + * For non-lazy counter filesystems, this doesn't matter at all because + * we only every apply deltas to the superblock and hence the incore + * value does not matter.... + */ + resblks = 0; + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " + "Freespace may not be correct on next mount."); + + error = xfs_log_sbcount(mp, 1); + if (error) + cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " + "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); - xfs_unmountfs_wait(mp); /* wait for async bufs */ - xfs_log_unmount(mp); /* Done! No more fs ops. */ - xfs_freesb(mp); - /* * All inodes from this mount point should be freed. */ ASSERT(mp->m_inodes == NULL); - xfs_unmountfs_close(mp, cr); if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) - xfs_uuid_unmount(mp); + uuid_table_remove(&mp->m_sb.sb_uuid); -#if defined(DEBUG) || defined(INDUCE_IO_ERROR) - /* - * clear all error tags on this filesystem - */ - memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t)); - xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0); +#if defined(DEBUG) + xfs_errortag_clearall(mp, 0); #endif - XFS_IODONE(vfsp); - xfs_mount_free(mp, 1); - return 0; -} - -void -xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_free_buftarg(mp->m_logdev_targp, 1); - if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp, 1); - xfs_free_buftarg(mp->m_ddev_targp, 0); + xfs_free_perag(mp); } STATIC void @@ -1153,34 +1313,97 @@ xfs_unmountfs_wait(xfs_mount_t *mp) } int +xfs_fs_writable(xfs_mount_t *mp) +{ + return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + (mp->m_flags & XFS_MOUNT_RDONLY)); +} + +/* + * xfs_log_sbcount + * + * Called either periodically to keep the on disk superblock values + * roughly up to date or from unmount to make sure the values are + * correct on a clean unmount. + * + * Note this code can be called during the process of freezing, so + * we may need to use the transaction allocator which does not not + * block when the transaction subsystem is in its frozen state. + */ +int +xfs_log_sbcount( + xfs_mount_t *mp, + uint sync) +{ + xfs_trans_t *tp; + int error; + + if (!xfs_fs_writable(mp)) + return 0; + + xfs_icsb_sync_counters(mp, 0); + + /* + * we don't need to do this if we are updating the superblock + * counters on every modification. + */ + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); + if (sync) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return error; +} + +STATIC void +xfs_mark_shared_ro( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + xfs_dsb_t *sb = XFS_BUF_TO_SBP(bp); + __uint16_t version; + + if (!(sb->sb_flags & XFS_SBF_READONLY)) + sb->sb_flags |= XFS_SBF_READONLY; + + version = be16_to_cpu(sb->sb_versionnum); + if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 || + !(version & XFS_SB_VERSION_SHAREDBIT)) + version |= XFS_SB_VERSION_SHAREDBIT; + sb->sb_versionnum = cpu_to_be16(version); +} + +int xfs_unmountfs_writesb(xfs_mount_t *mp) { xfs_buf_t *sbp; - xfs_sb_t *sb; int error = 0; /* * skip superblock write if fs is read-only, or * if we are doing a forced umount. */ - sbp = xfs_getsb(mp, 0); - if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY || + if (!((mp->m_flags & XFS_MOUNT_RDONLY) || XFS_FORCED_SHUTDOWN(mp))) { - xfs_icsb_sync_counters(mp); + sbp = xfs_getsb(mp, 0); /* * mark shared-readonly if desired */ - sb = XFS_BUF_TO_SBP(sbp); - if (mp->m_mk_sharedro) { - if (!(sb->sb_flags & XFS_SBF_READONLY)) - sb->sb_flags |= XFS_SBF_READONLY; - if (!XFS_SB_VERSION_HASSHARED(sb)) - XFS_SB_VERSION_ADDSHARED(sb); - xfs_fs_cmn_err(CE_NOTE, mp, - "Unmounting, marking shared read-only"); - } + if (mp->m_mk_sharedro) + xfs_mark_shared_ro(mp, sbp); + XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); XFS_BUF_UNDELAYWRITE(sbp); @@ -1188,15 +1411,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNASYNC(sbp); ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); xfsbdstrat(mp, sbp); - /* Nevermind errors we might get here. */ error = xfs_iowait(sbp); if (error) xfs_ioerror_alert("xfs_unmountfs_writesb", mp, sbp, XFS_BUF_ADDR(sbp)); if (error && mp->m_mk_sharedro) xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); + xfs_buf_relse(sbp); } - xfs_buf_relse(sbp); return error; } @@ -1214,7 +1436,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) int first; int last; xfs_mount_t *mp; - xfs_sb_t *sbp; xfs_sb_field_t f; ASSERT(fields); @@ -1222,13 +1443,12 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) return; mp = tp->t_mountp; bp = xfs_trans_getsb(tp, mp, 0); - sbp = XFS_BUF_TO_SBP(bp); first = sizeof(xfs_sb_t); last = 0; /* translate/copy */ - xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); /* find modified range */ @@ -1251,11 +1471,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) * Fields are not allowed to dip below zero, so if the delta would * do this do not apply it and return EINVAL. * - * The SB_LOCK must be held when this routine is called. + * The m_sb_lock must be held when this routine is called. */ int -xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, - int delta, int rsvd) +xfs_mod_incore_sb_unlocked( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) { int scounter; /* short counter for 32 bit fields */ long long lcounter; /* long counter for 64 bit fields */ @@ -1287,7 +1510,6 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, mp->m_sb.sb_ifree = lcounter; return 0; case XFS_SBS_FDBLOCKS: - lcounter = (long long) mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); @@ -1414,13 +1636,16 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, /* * xfs_mod_incore_sb() is used to change a field in the in-core * superblock structure by the specified delta. This modification - * is protected by the SB_LOCK. Just use the xfs_mod_incore_sb_unlocked() + * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() * routine to do the work. */ int -xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd) +xfs_mod_incore_sb( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) { - unsigned long s; int status; /* check for per-cpu counters */ @@ -1437,9 +1662,9 @@ xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd) /* FALLTHROUGH */ #endif default: - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); break; } @@ -1460,7 +1685,6 @@ xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd) int xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) { - unsigned long s; int status=0; xfs_mod_sb_t *msbp; @@ -1468,10 +1692,10 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) * Loop through the array of mod structures and apply each * individually. If any fail, then back out all those * which have already been applied. Do all of this within - * the scope of the SB_LOCK so that all of the changes will + * the scope of the m_sb_lock so that all of the changes will * be atomic. */ - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); msbp = &msb[0]; for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { /* @@ -1485,9 +1709,11 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) case XFS_SBS_IFREE: case XFS_SBS_FDBLOCKS: if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = xfs_icsb_modify_counters_locked(mp, + spin_unlock(&mp->m_sb_lock); + status = xfs_icsb_modify_counters(mp, msbp->msb_field, msbp->msb_delta, rsvd); + spin_lock(&mp->m_sb_lock); break; } /* FALLTHROUGH */ @@ -1521,11 +1747,12 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) case XFS_SBS_IFREE: case XFS_SBS_FDBLOCKS: if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = - xfs_icsb_modify_counters_locked(mp, + spin_unlock(&mp->m_sb_lock); + status = xfs_icsb_modify_counters(mp, msbp->msb_field, -(msbp->msb_delta), rsvd); + spin_lock(&mp->m_sb_lock); break; } /* FALLTHROUGH */ @@ -1541,7 +1768,7 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) msbp--; } } - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); return status; } @@ -1618,36 +1845,32 @@ xfs_uuid_mount( } /* - * Remove filesystem from the UUID table. - */ -STATIC void -xfs_uuid_unmount( - xfs_mount_t *mp) -{ - uuid_table_remove(&mp->m_sb.sb_uuid); -} - -/* * Used to log changes to the superblock unit and width fields which could - * be altered by the mount options. Only the first superblock is updated. + * be altered by the mount options, as well as any potential sb_features2 + * fixup. Only the first superblock is updated. */ -STATIC void -xfs_mount_log_sbunit( +STATIC int +xfs_mount_log_sb( xfs_mount_t *mp, __int64_t fields) { xfs_trans_t *tp; + int error; - ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID)); + ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | + XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | + XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT)) { + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } xfs_mod_sb(tp, fields); - xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); + return error; } @@ -1689,12 +1912,12 @@ xfs_mount_log_sbunit( * * Locking rules: * - * 1. XFS_SB_LOCK() before picking up per-cpu locks + * 1. m_sb_lock before picking up per-cpu locks * 2. per-cpu locks always picked up via for_each_online_cpu() order - * 3. accurate counter sync requires XFS_SB_LOCK + per cpu locks + * 3. accurate counter sync requires m_sb_lock + per cpu locks * 4. modifying per-cpu counters requires holding per-cpu lock - * 5. modifying global counters requires holding XFS_SB_LOCK - * 6. enabling or disabling a counter requires holding the XFS_SB_LOCK + * 5. modifying global counters requires holding m_sb_lock + * 6. enabling or disabling a counter requires holding the m_sb_lock * and _none_ of the per-cpu locks. * * Disabled counters are only ever re-enabled by a balance operation @@ -1721,27 +1944,32 @@ xfs_icsb_cpu_notify( { xfs_icsb_cnts_t *cntp; xfs_mount_t *mp; - int s; mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); cntp = (xfs_icsb_cnts_t *) per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: /* Easy Case - initialize the area and locks, and * then rebalance when online does everything else for us. */ memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + xfs_icsb_lock(mp); xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* Disable all the counters, then fold the dead cpu's * count into the total on the global superblock and * re-enable the counters. */ - s = XFS_SB_LOCK(mp); + xfs_icsb_lock(mp); + spin_lock(&mp->m_sb_lock); xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); @@ -1752,10 +1980,11 @@ xfs_icsb_cpu_notify( memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, XFS_ICSB_SB_LOCKED); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, XFS_ICSB_SB_LOCKED); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, XFS_ICSB_SB_LOCKED); - XFS_SB_UNLOCK(mp, s); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); + spin_unlock(&mp->m_sb_lock); + xfs_icsb_unlock(mp); break; } @@ -1784,6 +2013,9 @@ xfs_icsb_init_counters( cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); } + + mutex_init(&mp->m_icsb_mutex); + /* * start with all counters disabled so that the * initial balance kicks us off correctly @@ -1792,7 +2024,23 @@ xfs_icsb_init_counters( return 0; } -STATIC void +void +xfs_icsb_reinit_counters( + xfs_mount_t *mp) +{ + xfs_icsb_lock(mp); + /* + * start with all counters disabled so that the + * initial balance kicks us off correctly + */ + mp->m_icsb_counters = -1; + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); +} + +void xfs_icsb_destroy_counters( xfs_mount_t *mp) { @@ -1800,9 +2048,10 @@ xfs_icsb_destroy_counters( unregister_hotcpu_notifier(&mp->m_icsb_notifier); free_percpu(mp->m_sb_cnts); } + mutex_destroy(&mp->m_icsb_mutex); } -STATIC inline void +STATIC_INLINE void xfs_icsb_lock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -1811,7 +2060,7 @@ xfs_icsb_lock_cntr( } } -STATIC inline void +STATIC_INLINE void xfs_icsb_unlock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -1819,7 +2068,7 @@ xfs_icsb_unlock_cntr( } -STATIC inline void +STATIC_INLINE void xfs_icsb_lock_all_counters( xfs_mount_t *mp) { @@ -1832,7 +2081,7 @@ xfs_icsb_lock_all_counters( } } -STATIC inline void +STATIC_INLINE void xfs_icsb_unlock_all_counters( xfs_mount_t *mp) { @@ -1879,7 +2128,7 @@ xfs_icsb_counter_disabled( return test_bit(field, &mp->m_icsb_counters); } -STATIC int +STATIC void xfs_icsb_disable_counter( xfs_mount_t *mp, xfs_sb_field_t field) @@ -1888,11 +2137,22 @@ xfs_icsb_disable_counter( ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + /* + * If we are already disabled, then there is nothing to do + * here. We check before locking all the counters to avoid + * the expensive lock operation when being called in the + * slow path and the counter is already disabled. This is + * safe because the only time we set or clear this state is under + * the m_icsb_mutex. + */ + if (xfs_icsb_counter_disabled(mp, field)) + return; + xfs_icsb_lock_all_counters(mp); if (!test_and_set_bit(field, &mp->m_icsb_counters)) { /* drain back to superblock */ - xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT); + xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); switch(field) { case XFS_SBS_ICOUNT: mp->m_sb.sb_icount = cnt.icsb_icount; @@ -1909,8 +2169,6 @@ xfs_icsb_disable_counter( } xfs_icsb_unlock_all_counters(mp); - - return 0; } STATIC void @@ -1948,76 +2206,64 @@ xfs_icsb_enable_counter( xfs_icsb_unlock_all_counters(mp); } -STATIC void -xfs_icsb_sync_counters_int( +void +xfs_icsb_sync_counters_locked( xfs_mount_t *mp, int flags) { xfs_icsb_cnts_t cnt; - int s; - - /* Pass 1: lock all counters */ - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - s = XFS_SB_LOCK(mp); xfs_icsb_count(mp, &cnt, flags); - /* Step 3: update mp->m_sb fields */ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) mp->m_sb.sb_icount = cnt.icsb_icount; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) mp->m_sb.sb_ifree = cnt.icsb_ifree; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - XFS_SB_UNLOCK(mp, s); } /* * Accurate update of per-cpu counters to incore superblock */ -STATIC void -xfs_icsb_sync_counters( - xfs_mount_t *mp) -{ - xfs_icsb_sync_counters_int(mp, 0); -} - -/* - * lazy addition used for things like df, background sb syncs, etc - */ void -xfs_icsb_sync_counters_lazy( - xfs_mount_t *mp) +xfs_icsb_sync_counters( + xfs_mount_t *mp, + int flags) { - xfs_icsb_sync_counters_int(mp, XFS_ICSB_LAZY_COUNT); + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, flags); + spin_unlock(&mp->m_sb_lock); } /* * Balance and enable/disable counters as necessary. * - * Thresholds for re-enabling counters are somewhat magic. - * inode counts are chosen to be the same number as single - * on disk allocation chunk per CPU, and free blocks is - * something far enough zero that we aren't going thrash - * when we get near ENOSPC. + * Thresholds for re-enabling counters are somewhat magic. inode counts are + * chosen to be the same number as single on disk allocation chunk per CPU, and + * free blocks is something far enough zero that we aren't going thrash when we + * get near ENOSPC. We also need to supply a minimum we require per cpu to + * prevent looping endlessly when xfs_alloc_space asks for more than will + * be distributed to a single CPU but each CPU has enough blocks to be + * reenabled. + * + * Note that we can be called when counters are already disabled. + * xfs_icsb_disable_counter() optimises the counter locking in this case to + * prevent locking every per-cpu counter needlessly. */ -#define XFS_ICSB_INO_CNTR_REENABLE 64 + +#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (512 + XFS_ALLOC_SET_ASIDE(mp)) + (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) STATIC void -xfs_icsb_balance_counter( +xfs_icsb_balance_counter_locked( xfs_mount_t *mp, xfs_sb_field_t field, - int flags) + int min_per_cpu) { uint64_t count, resid; int weight = num_online_cpus(); - int s; - - if (!(flags & XFS_ICSB_SB_LOCKED)) - s = XFS_SB_LOCK(mp); + uint64_t min = (uint64_t)min_per_cpu; /* disable counter and sync counter */ xfs_icsb_disable_counter(mp, field); @@ -2027,20 +2273,20 @@ xfs_icsb_balance_counter( case XFS_SBS_ICOUNT: count = mp->m_sb.sb_icount; resid = do_div(count, weight); - if (count < XFS_ICSB_INO_CNTR_REENABLE) - goto out; + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; break; case XFS_SBS_IFREE: count = mp->m_sb.sb_ifree; resid = do_div(count, weight); - if (count < XFS_ICSB_INO_CNTR_REENABLE) - goto out; + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; break; case XFS_SBS_FDBLOCKS: count = mp->m_sb.sb_fdblocks; resid = do_div(count, weight); - if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp)) - goto out; + if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) + return; break; default: BUG(); @@ -2049,37 +2295,52 @@ xfs_icsb_balance_counter( } xfs_icsb_enable_counter(mp, field, count, resid); -out: - if (!(flags & XFS_ICSB_SB_LOCKED)) - XFS_SB_UNLOCK(mp, s); +} + +STATIC void +xfs_icsb_balance_counter( + xfs_mount_t *mp, + xfs_sb_field_t fields, + int min_per_cpu) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); + spin_unlock(&mp->m_sb_lock); } STATIC int -xfs_icsb_modify_counters_int( +xfs_icsb_modify_counters( xfs_mount_t *mp, xfs_sb_field_t field, - int delta, - int rsvd, - int flags) + int64_t delta, + int rsvd) { xfs_icsb_cnts_t *icsbp; long long lcounter; /* long counter for 64 bit fields */ - int cpu, s, locked = 0; - int ret = 0, balance_done = 0; + int cpu, ret = 0; + might_sleep(); again: cpu = get_cpu(); - icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu), - xfs_icsb_lock_cntr(icsbp); + icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu); + + /* + * if the counter is disabled, go to slow path + */ if (unlikely(xfs_icsb_counter_disabled(mp, field))) goto slow_path; + xfs_icsb_lock_cntr(icsbp); + if (unlikely(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock_cntr(icsbp); + goto slow_path; + } switch (field) { case XFS_SBS_ICOUNT: lcounter = icsbp->icsb_icount; lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_icount = lcounter; break; @@ -2087,7 +2348,7 @@ again: lcounter = icsbp->icsb_ifree; lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_ifree = lcounter; break; @@ -2097,7 +2358,7 @@ again: lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); break; default: @@ -2106,72 +2367,78 @@ again: } xfs_icsb_unlock_cntr(icsbp); put_cpu(); - if (locked) - XFS_SB_UNLOCK(mp, s); return 0; - /* - * The slow path needs to be run with the SBLOCK - * held so that we prevent other threads from - * attempting to run this path at the same time. - * this provides exclusion for the balancing code, - * and exclusive fallback if the balance does not - * provide enough resources to continue in an unlocked - * manner. - */ slow_path: - xfs_icsb_unlock_cntr(icsbp); put_cpu(); - /* need to hold superblock incase we need - * to disable a counter */ - if (!(flags & XFS_ICSB_SB_LOCKED)) { - s = XFS_SB_LOCK(mp); - locked = 1; - flags |= XFS_ICSB_SB_LOCKED; - } - if (!balance_done) { - xfs_icsb_balance_counter(mp, field, flags); - balance_done = 1; + /* + * serialise with a mutex so we don't burn lots of cpu on + * the superblock lock. We still need to hold the superblock + * lock, however, when we modify the global structures. + */ + xfs_icsb_lock(mp); + + /* + * Now running atomically. + * + * If the counter is enabled, someone has beaten us to rebalancing. + * Drop the lock and try again in the fast path.... + */ + if (!(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock(mp); goto again; - } else { - /* - * we might not have enough on this local - * cpu to allocate for a bulk request. - * We need to drain this field from all CPUs - * and disable the counter fastpath - */ - xfs_icsb_disable_counter(mp, field); } + /* + * The counter is currently disabled. Because we are + * running atomically here, we know a rebalance cannot + * be in progress. Hence we can go straight to operating + * on the global superblock. We do not call xfs_mod_incore_sb() + * here even though we need to get the m_sb_lock. Doing so + * will cause us to re-enter this function and deadlock. + * Hence we get the m_sb_lock ourselves and then call + * xfs_mod_incore_sb_unlocked() as the unlocked path operates + * directly on the global counters. + */ + spin_lock(&mp->m_sb_lock); ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); - if (locked) - XFS_SB_UNLOCK(mp, s); + /* + * Now that we've modified the global superblock, we + * may be able to re-enable the distributed counters + * (e.g. lots of space just got freed). After that + * we are done. + */ + if (ret != ENOSPC) + xfs_icsb_balance_counter(mp, field, 0); + xfs_icsb_unlock(mp); return ret; -} -STATIC int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int delta, - int rsvd) -{ - return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0); -} +balance_counter: + xfs_icsb_unlock_cntr(icsbp); + put_cpu(); -/* - * Called when superblock is already locked - */ -STATIC int -xfs_icsb_modify_counters_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int delta, - int rsvd) -{ - return xfs_icsb_modify_counters_int(mp, field, delta, - rsvd, XFS_ICSB_SB_LOCKED); + /* + * We may have multiple threads here if multiple per-cpu + * counters run dry at the same time. This will mean we can + * do more balances than strictly necessary but it is not + * the common slowpath case. + */ + xfs_icsb_lock(mp); + + /* + * running atomically. + * + * This will leave the counter in the correct state for future + * accesses. After the rebalance, we simply try again and our retry + * will either succeed through the fast path or slow path without + * another balance operation being required. + */ + xfs_icsb_balance_counter(mp, field, delta); + xfs_icsb_unlock(mp); + goto again; } + #endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b2bd4be..f3c1024 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,6 +18,7 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ + typedef struct xfs_trans_reservations { uint tr_write; /* extent alloc trans */ uint tr_itruncate; /* truncate trans */ @@ -53,42 +54,30 @@ typedef struct xfs_trans_reservations { #else struct cred; struct log; -struct bhv_vfs; -struct bhv_vnode; struct xfs_mount_args; -struct xfs_ihash; -struct xfs_chash; struct xfs_inode; -struct xfs_perag; -struct xfs_iocore; struct xfs_bmbt_irec; struct xfs_bmap_free; struct xfs_extdelta; struct xfs_swapext; - -extern struct bhv_vfsops xfs_vfsops; -extern struct bhv_vnodeops xfs_vnodeops; - -#define AIL_LOCK_T lock_t -#define AIL_LOCKINIT(x,y) spinlock_init(x,y) -#define AIL_LOCK_DESTROY(x) spinlock_destroy(x) -#define AIL_LOCK(mp,s) s=mutex_spinlock(&(mp)->m_ail_lock) -#define AIL_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_ail_lock, s) - +struct xfs_mru_cache; +struct xfs_nameops; /* * Prototypes and functions for the Data Migration subsystem. */ -typedef int (*xfs_send_data_t)(int, struct bhv_vnode *, - xfs_off_t, size_t, int, bhv_vrwlock_t *); +typedef int (*xfs_send_data_t)(int, struct xfs_inode *, + xfs_off_t, size_t, int, int *); typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); -typedef int (*xfs_send_destroy_t)(struct bhv_vnode *, dm_right_t); -typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct bhv_vfs *, - struct bhv_vnode *, - dm_right_t, struct bhv_vnode *, dm_right_t, - char *, char *, mode_t, int, int); -typedef void (*xfs_send_unmount_t)(struct bhv_vfs *, struct bhv_vnode *, +typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t); +typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, + struct xfs_inode *, dm_right_t, + struct xfs_inode *, dm_right_t, + const char *, const char *, mode_t, int, int); +typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, + char *, char *); +typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, dm_right_t, mode_t, int, int); typedef struct xfs_dmops { @@ -96,21 +85,24 @@ typedef struct xfs_dmops { xfs_send_mmap_t xfs_send_mmap; xfs_send_destroy_t xfs_send_destroy; xfs_send_namesp_t xfs_send_namesp; + xfs_send_mount_t xfs_send_mount; xfs_send_unmount_t xfs_send_unmount; } xfs_dmops_t; -#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \ - (*(mp)->m_dm_ops.xfs_send_data)(ev,vp,off,len,fl,lock) +#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \ + (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock) #define XFS_SEND_MMAP(mp, vma,fl) \ - (*(mp)->m_dm_ops.xfs_send_mmap)(vma,fl) -#define XFS_SEND_DESTROY(mp, vp,right) \ - (*(mp)->m_dm_ops.xfs_send_destroy)(vp,right) + (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl) +#define XFS_SEND_DESTROY(mp, ip,right) \ + (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right) #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops.xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_PREUNMOUNT(mp, vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops.xfs_send_namesp)(DM_EVENT_PREUNMOUNT,vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_UNMOUNT(mp, vfsp,vp,right,mode,rval,fl) \ - (*(mp)->m_dm_ops.xfs_send_unmount)(vfsp,vp,right,mode,rval,fl) + (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) +#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ + (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) +#define XFS_SEND_MOUNT(mp,right,path,name) \ + (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) +#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \ + (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl) /* @@ -122,7 +114,7 @@ struct xfs_dqtrxops; struct xfs_quotainfo; typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); -typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int); +typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); typedef int (*xfs_qmunmount_t)(struct xfs_mount *); typedef void (*xfs_qmdone_t)(struct xfs_mount *); typedef void (*xfs_dqrele_t)(struct xfs_dquot *); @@ -140,6 +132,9 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)( struct xfs_dquot **, struct xfs_dquot *); typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, uint); +typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *); +typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); +typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); typedef struct xfs_qmops { xfs_qminit_t xfs_qminit; @@ -155,134 +150,44 @@ typedef struct xfs_qmops { xfs_dqvoprename_t xfs_dqvoprename; xfs_dqvopchown_t xfs_dqvopchown; xfs_dqvopchownresv_t xfs_dqvopchownresv; + xfs_dqstatvfs_t xfs_dqstatvfs; + xfs_dqsync_t xfs_dqsync; + xfs_quotactl_t xfs_quotactl; struct xfs_dqtrxops *xfs_dqtrxops; } xfs_qmops_t; #define XFS_QM_INIT(mp, mnt, fl) \ - (*(mp)->m_qm_ops.xfs_qminit)(mp, mnt, fl) -#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \ - (*(mp)->m_qm_ops.xfs_qmmount)(mp, mnt, fl, mfsi_flags) + (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl) +#define XFS_QM_MOUNT(mp, mnt, fl) \ + (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl) #define XFS_QM_UNMOUNT(mp) \ - (*(mp)->m_qm_ops.xfs_qmunmount)(mp) + (*(mp)->m_qm_ops->xfs_qmunmount)(mp) #define XFS_QM_DONE(mp) \ - (*(mp)->m_qm_ops.xfs_qmdone)(mp) + (*(mp)->m_qm_ops->xfs_qmdone)(mp) #define XFS_QM_DQRELE(mp, dq) \ - (*(mp)->m_qm_ops.xfs_dqrele)(dq) + (*(mp)->m_qm_ops->xfs_dqrele)(dq) #define XFS_QM_DQATTACH(mp, ip, fl) \ - (*(mp)->m_qm_ops.xfs_dqattach)(ip, fl) + (*(mp)->m_qm_ops->xfs_dqattach)(ip, fl) #define XFS_QM_DQDETACH(mp, ip) \ - (*(mp)->m_qm_ops.xfs_dqdetach)(ip) + (*(mp)->m_qm_ops->xfs_dqdetach)(ip) #define XFS_QM_DQPURGEALL(mp, fl) \ - (*(mp)->m_qm_ops.xfs_dqpurgeall)(mp, fl) + (*(mp)->m_qm_ops->xfs_dqpurgeall)(mp, fl) #define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \ - (*(mp)->m_qm_ops.xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2) + (*(mp)->m_qm_ops->xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2) #define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \ - (*(mp)->m_qm_ops.xfs_dqvopcreate)(tp, ip, dq1, dq2) + (*(mp)->m_qm_ops->xfs_dqvopcreate)(tp, ip, dq1, dq2) #define XFS_QM_DQVOPRENAME(mp, ip) \ - (*(mp)->m_qm_ops.xfs_dqvoprename)(ip) + (*(mp)->m_qm_ops->xfs_dqvoprename)(ip) #define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \ - (*(mp)->m_qm_ops.xfs_dqvopchown)(tp, ip, dqp, dq) + (*(mp)->m_qm_ops->xfs_dqvopchown)(tp, ip, dqp, dq) #define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \ - (*(mp)->m_qm_ops.xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl) - - -/* - * Prototypes and functions for I/O core modularization. - */ - -typedef int (*xfs_ioinit_t)(struct bhv_vfs *, - struct xfs_mount_args *, int); -typedef int (*xfs_bmapi_t)(struct xfs_trans *, void *, - xfs_fileoff_t, xfs_filblks_t, int, - xfs_fsblock_t *, xfs_extlen_t, - struct xfs_bmbt_irec *, int *, - struct xfs_bmap_free *, struct xfs_extdelta *); -typedef int (*xfs_bunmapi_t)(struct xfs_trans *, - void *, xfs_fileoff_t, - xfs_filblks_t, int, xfs_extnum_t, - xfs_fsblock_t *, struct xfs_bmap_free *, - struct xfs_extdelta *, int *); -typedef int (*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *); -typedef int (*xfs_iomap_write_direct_t)( - void *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *, int); -typedef int (*xfs_iomap_write_delay_t)( - void *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -typedef int (*xfs_iomap_write_allocate_t)( - void *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); -typedef int (*xfs_iomap_write_unwritten_t)( - void *, xfs_off_t, size_t); -typedef uint (*xfs_lck_map_shared_t)(void *); -typedef void (*xfs_lock_t)(void *, uint); -typedef void (*xfs_lock_demote_t)(void *, uint); -typedef int (*xfs_lock_nowait_t)(void *, uint); -typedef void (*xfs_unlk_t)(void *, unsigned int); -typedef xfs_fsize_t (*xfs_size_t)(void *); -typedef xfs_fsize_t (*xfs_iodone_t)(struct bhv_vfs *); -typedef int (*xfs_swap_extents_t)(void *, void *, - struct xfs_swapext*); - -typedef struct xfs_ioops { - xfs_ioinit_t xfs_ioinit; - xfs_bmapi_t xfs_bmapi_func; - xfs_bunmapi_t xfs_bunmapi_func; - xfs_bmap_eof_t xfs_bmap_eof_func; - xfs_iomap_write_direct_t xfs_iomap_write_direct; - xfs_iomap_write_delay_t xfs_iomap_write_delay; - xfs_iomap_write_allocate_t xfs_iomap_write_allocate; - xfs_iomap_write_unwritten_t xfs_iomap_write_unwritten; - xfs_lock_t xfs_ilock; - xfs_lck_map_shared_t xfs_lck_map_shared; - xfs_lock_demote_t xfs_ilock_demote; - xfs_lock_nowait_t xfs_ilock_nowait; - xfs_unlk_t xfs_unlock; - xfs_size_t xfs_size_func; - xfs_iodone_t xfs_iodone; - xfs_swap_extents_t xfs_swap_extents_func; -} xfs_ioops_t; - -#define XFS_IOINIT(vfsp, args, flags) \ - (*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags) -#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist,delta) \ - (*(mp)->m_io_ops.xfs_bmapi_func) \ - (trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist,delta) -#define XFS_BUNMAPI(mp, trans,io,bno,len,f,nexts,first,flist,delta,done) \ - (*(mp)->m_io_ops.xfs_bunmapi_func) \ - (trans,(io)->io_obj,bno,len,f,nexts,first,flist,delta,done) -#define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \ - (*(mp)->m_io_ops.xfs_bmap_eof_func) \ - ((io)->io_obj, endoff, whichfork, eof) -#define XFS_IOMAP_WRITE_DIRECT(mp, io, offset, count, flags, mval, nmap, found)\ - (*(mp)->m_io_ops.xfs_iomap_write_direct) \ - ((io)->io_obj, offset, count, flags, mval, nmap, found) -#define XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, flags, mval, nmap) \ - (*(mp)->m_io_ops.xfs_iomap_write_delay) \ - ((io)->io_obj, offset, count, flags, mval, nmap) -#define XFS_IOMAP_WRITE_ALLOCATE(mp, io, offset, count, mval, nmap) \ - (*(mp)->m_io_ops.xfs_iomap_write_allocate) \ - ((io)->io_obj, offset, count, mval, nmap) -#define XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count) \ - (*(mp)->m_io_ops.xfs_iomap_write_unwritten) \ - ((io)->io_obj, offset, count) -#define XFS_LCK_MAP_SHARED(mp, io) \ - (*(mp)->m_io_ops.xfs_lck_map_shared)((io)->io_obj) -#define XFS_ILOCK(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_ilock)((io)->io_obj, mode) -#define XFS_ILOCK_NOWAIT(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_ilock_nowait)((io)->io_obj, mode) -#define XFS_IUNLOCK(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_unlock)((io)->io_obj, mode) -#define XFS_ILOCK_DEMOTE(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_ilock_demote)((io)->io_obj, mode) -#define XFS_SIZE(mp, io) \ - (*(mp)->m_io_ops.xfs_size_func)((io)->io_obj) -#define XFS_IODONE(vfsp) \ - (*(mp)->m_io_ops.xfs_iodone)(vfsp) -#define XFS_SWAP_EXTENTS(mp, io, tio, sxp) \ - (*(mp)->m_io_ops.xfs_swap_extents_func) \ - ((io)->io_obj, (tio)->io_obj, sxp) + (*(mp)->m_qm_ops->xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl) +#define XFS_QM_DQSTATVFS(ip, statp) \ + (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp) +#define XFS_QM_DQSYNC(mp, flags) \ + (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags) +#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \ + (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr) #ifdef HAVE_PERCPU_SB @@ -302,25 +207,36 @@ typedef struct xfs_icsb_cnts { #define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ -#define XFS_ICSB_SB_LOCKED (1 << 0) /* sb already locked */ #define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters_lazy(struct xfs_mount *); +extern void xfs_icsb_reinit_counters(struct xfs_mount *); +extern void xfs_icsb_destroy_counters(struct xfs_mount *); +extern void xfs_icsb_sync_counters(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); #else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_sync_counters_lazy(mp) do { } while (0) +#define xfs_icsb_init_counters(mp) (0) +#define xfs_icsb_destroy_counters(mp) do { } while (0) +#define xfs_icsb_reinit_counters(mp) do { } while (0) +#define xfs_icsb_sync_counters(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) #endif +typedef struct xfs_ail { + struct list_head xa_ail; + uint xa_gen; + struct task_struct *xa_task; + xfs_lsn_t xa_target; +} xfs_ail_t; + typedef struct xfs_mount { - bhv_desc_t m_bhv; /* vfs xfs behavior */ + struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ - AIL_LOCK_T m_ail_lock; /* fs AIL mutex */ - xfs_ail_entry_t m_ail; /* fs active log item list */ - uint m_ail_gen; /* fs AIL generation count */ + spinlock_t m_ail_lock; /* fs AIL mutex */ + xfs_ail_t m_ail; /* fs active log item list */ xfs_sb_t m_sb; /* copy of fs superblock */ - lock_t m_sb_lock; /* sb counter mutex */ + spinlock_t m_sb_lock; /* sb counter lock */ struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ int m_fsname_len; /* strlen of fs name */ @@ -329,10 +245,8 @@ typedef struct xfs_mount { int m_bsize; /* fs logical block size */ xfs_agnumber_t m_agfrotor; /* last ag where space found */ xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ - lock_t m_agirotor_lock;/* .. and lock protecting it */ + spinlock_t m_agirotor_lock;/* .. and lock protecting it */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_ihsize; /* size of next field */ - struct xfs_ihash *m_ihash; /* fs private inode hash table*/ struct xfs_inode *m_inodes; /* active inode list */ struct list_head m_del_inodes; /* inodes to reclaim */ mutex_t m_ilock; /* inode list mutex */ @@ -358,7 +272,6 @@ typedef struct xfs_mount { __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ __uint8_t m_agino_log; /* #bits for agino in inum */ - __uint8_t m_nreadaheads; /* #readahead buffers */ __uint16_t m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ @@ -374,7 +287,7 @@ typedef struct xfs_mount { uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ struct xfs_perag *m_perag; /* per-ag accounting info */ struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ - sema_t m_growlock; /* growfs mutex */ + struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ @@ -403,6 +316,7 @@ typedef struct xfs_mount { __uint8_t m_inode_quiesce;/* call quiesce on new inodes. field governed by m_ilock */ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ int m_dirblksize; /* directory block sz--bytes */ int m_dirblkfsbs; /* directory block sz--fsbs */ xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ @@ -411,26 +325,33 @@ typedef struct xfs_mount { uint m_chsize; /* size of next field */ struct xfs_chash *m_chash; /* fs private inode per-cluster * hash table */ - struct xfs_dmops m_dm_ops; /* vector of DMI ops */ - struct xfs_qmops m_qm_ops; /* vector of XQM ops */ - struct xfs_ioops m_io_ops; /* vector of I/O ops */ + struct xfs_dmops *m_dm_ops; /* vector of DMI ops */ + struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ unsigned long m_icsb_counters; /* disabled per-cpu counters */ struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ + struct mutex m_icsb_mutex; /* balancer sync lock */ #endif + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ + struct task_struct *m_sync_task; /* generalised sync thread */ + bhv_vfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ + struct list_head m_sync_list; /* sync thread work item list */ + spinlock_t m_sync_lock; /* work item list lock */ + int m_sync_seq; /* sync thread generation no. */ + wait_queue_head_t m_wait_single_sync_task; } xfs_mount_t; /* * Flags for m_flags. */ -#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops +#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_INO64 (1ULL << 1) - /* (1ULL << 2) -- currently unused */ - /* (1ULL << 3) -- currently unused */ +#define XFS_MOUNT_INO64 (1ULL << 1) +#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ +#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for disk errors in metadata */ @@ -439,7 +360,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment allocations */ #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ - /* (1ULL << 9) -- currently unused */ +#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ @@ -447,18 +368,21 @@ typedef struct xfs_mount { /* osyncisdsync is now default*/ #define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above * 32 bits in size */ - /* (1ULL << 15) -- currently unused */ +#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ #define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ #define XFS_MOUNT_BARRIER (1ULL << 17) -#define XFS_MOUNT_IDELETE (1ULL << 18) /* delete empty inode clusters*/ +#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ #define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width * allocation */ -#define XFS_MOUNT_IHASHSIZE (1ULL << 20) /* inode hash table size */ +#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ #define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred * I/O size in stat() */ #define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock counters */ +#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams + allocator */ +#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ /* @@ -483,7 +407,7 @@ typedef struct xfs_mount { /* * Allow large block sizes to be reported to userspace programs if the - * "largeio" mount option is used. + * "largeio" mount option is used. * * If compatibility mode is specified, simply return the basic unit of caching * so that we don't get inefficient read/modify/write I/O from user apps. @@ -507,43 +431,19 @@ xfs_preferred_iosize(xfs_mount_t *mp) #define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) +#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ + ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, + int lnnum); #define xfs_force_shutdown(m,f) \ - bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__) + xfs_do_force_shutdown(m, f, __FILE__, __LINE__) /* * Flags for xfs_mountfs */ -#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ -#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ -/* XFS_MFSI_RRINODES */ -#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ - /* log recovery */ -#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ -/* XFS_MFSI_CONVERT_SUNIT */ #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ -/* - * Macros for getting from mount to vfs and back. - */ -#define XFS_MTOVFS(mp) xfs_mtovfs(mp) -static inline struct bhv_vfs *xfs_mtovfs(xfs_mount_t *mp) -{ - return bhvtovfs(&mp->m_bhv); -} - -#define XFS_BHVTOM(bdp) xfs_bhvtom(bdp) -static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp) -{ - return (xfs_mount_t *)BHV_PDATA(bdp); -} - -#define XFS_VFSTOM(vfs) xfs_vfstom(vfs) -static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs) -{ - return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops)); -} - #define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) @@ -562,49 +462,82 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* + * perag get/put wrappers for eventual ref counting + */ +static inline xfs_perag_t * +xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino) +{ + return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)]; +} + +static inline void +xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag) +{ + /* nothing to see here, move along */ +} + +/* + * Per-cpu superblock locking functions + */ +#ifdef HAVE_PERCPU_SB +STATIC_INLINE void +xfs_icsb_lock(xfs_mount_t *mp) +{ + mutex_lock(&mp->m_icsb_mutex); +} + +STATIC_INLINE void +xfs_icsb_unlock(xfs_mount_t *mp) +{ + mutex_unlock(&mp->m_icsb_mutex); +} +#else +#define xfs_icsb_lock(mp) +#define xfs_icsb_unlock(mp) +#endif + +/* * This structure is for use by the xfs_mod_incore_sb_batch() routine. + * xfs_growfs can specify a few fields which are more than int limit */ typedef struct xfs_mod_sb { xfs_sb_field_t msb_field; /* Field to modify, see below */ - int msb_delta; /* Change to make to specified field */ + int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; #define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) #define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) -#define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock) -#define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s)) -extern xfs_mount_t *xfs_mount_init(void); extern void xfs_mod_sb(xfs_trans_t *, __int64_t); -extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv); -extern int xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int); +extern int xfs_log_sbcount(xfs_mount_t *, uint); +extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); -extern int xfs_unmountfs(xfs_mount_t *, struct cred *); -extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); +extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_unmount_flush(xfs_mount_t *, int); -extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int); +extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, - int, int); + int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); -extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); -extern int xfs_syncsub(xfs_mount_t *, int, int, int *); -extern int xfs_sync_inodes(xfs_mount_t *, int, int, int *); -extern xfs_agnumber_t xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *, - xfs_agnumber_t); -extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t); - -extern struct xfs_dmops xfs_dmcore_stub; -extern struct xfs_qmops xfs_qmcore_stub; -extern struct xfs_ioops xfs_iocore_xfs; - -extern int xfs_init(void); -extern void xfs_cleanup(void); +extern int xfs_fs_writable(xfs_mount_t *); +extern int xfs_syncsub(xfs_mount_t *, int, int *); +extern int xfs_sync_inodes(xfs_mount_t *, int, int *); +extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); + +extern int xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *); +extern void xfs_dmops_put(struct xfs_mount *); +extern int xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *); +extern void xfs_qmops_put(struct xfs_mount *); + +extern struct xfs_dmops xfs_dmcore_xfs; #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c new file mode 100644 index 0000000..afee7eb --- /dev/null +++ b/fs/xfs/xfs_mru_cache.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_mru_cache.h" + +/* + * The MRU Cache data structure consists of a data store, an array of lists and + * a lock to protect its internal state. At initialisation time, the client + * supplies an element lifetime in milliseconds and a group count, as well as a + * function pointer to call when deleting elements. A data structure for + * queueing up work in the form of timed callbacks is also included. + * + * The group count controls how many lists are created, and thereby how finely + * the elements are grouped in time. When reaping occurs, all the elements in + * all the lists whose time has expired are deleted. + * + * To give an example of how this works in practice, consider a client that + * initialises an MRU Cache with a lifetime of ten seconds and a group count of + * five. Five internal lists will be created, each representing a two second + * period in time. When the first element is added, time zero for the data + * structure is initialised to the current time. + * + * All the elements added in the first two seconds are appended to the first + * list. Elements added in the third second go into the second list, and so on. + * If an element is accessed at any point, it is removed from its list and + * inserted at the head of the current most-recently-used list. + * + * The reaper function will have nothing to do until at least twelve seconds + * have elapsed since the first element was added. The reason for this is that + * if it were called at t=11s, there could be elements in the first list that + * have only been inactive for nine seconds, so it still does nothing. If it is + * called anywhere between t=12 and t=14 seconds, it will delete all the + * elements that remain in the first list. It's therefore possible for elements + * to remain in the data store even after they've been inactive for up to + * (t + t/g) seconds, where t is the inactive element lifetime and g is the + * number of groups. + * + * The above example assumes that the reaper function gets called at least once + * every (t/g) seconds. If it is called less frequently, unused elements will + * accumulate in the reap list until the reaper function is eventually called. + * The current implementation uses work queue callbacks to carefully time the + * reaper function calls, so this should happen rarely, if at all. + * + * From a design perspective, the primary reason for the choice of a list array + * representing discrete time intervals is that it's only practical to reap + * expired elements in groups of some appreciable size. This automatically + * introduces a granularity to element lifetimes, so there's no point storing an + * individual timeout with each element that specifies a more precise reap time. + * The bonus is a saving of sizeof(long) bytes of memory per element stored. + * + * The elements could have been stored in just one list, but an array of + * counters or pointers would need to be maintained to allow them to be divided + * up into discrete time groups. More critically, the process of touching or + * removing an element would involve walking large portions of the entire list, + * which would have a detrimental effect on performance. The additional memory + * requirement for the array of list heads is minimal. + * + * When an element is touched or deleted, it needs to be removed from its + * current list. Doubly linked lists are used to make the list maintenance + * portion of these operations O(1). Since reaper timing can be imprecise, + * inserts and lookups can occur when there are no free lists available. When + * this happens, all the elements on the LRU list need to be migrated to the end + * of the reap list. To keep the list maintenance portion of these operations + * O(1) also, list tails need to be accessible without walking the entire list. + * This is the reason why doubly linked list heads are used. + */ + +/* + * An MRU Cache is a dynamic data structure that stores its elements in a way + * that allows efficient lookups, but also groups them into discrete time + * intervals based on insertion time. This allows elements to be efficiently + * and automatically reaped after a fixed period of inactivity. + * + * When a client data pointer is stored in the MRU Cache it needs to be added to + * both the data store and to one of the lists. It must also be possible to + * access each of these entries via the other, i.e. to: + * + * a) Walk a list, removing the corresponding data store entry for each item. + * b) Look up a data store entry, then access its list entry directly. + * + * To achieve both of these goals, each entry must contain both a list entry and + * a key, in addition to the user's data pointer. Note that it's not a good + * idea to have the client embed one of these structures at the top of their own + * data structure, because inserting the same item more than once would most + * likely result in a loop in one of the lists. That's a sure-fire recipe for + * an infinite loop in the code. + */ +typedef struct xfs_mru_cache_elem +{ + struct list_head list_node; + unsigned long key; + void *value; +} xfs_mru_cache_elem_t; + +static kmem_zone_t *xfs_mru_elem_zone; +static struct workqueue_struct *xfs_mru_reap_wq; + +/* + * When inserting, destroying or reaping, it's first necessary to update the + * lists relative to a particular time. In the case of destroying, that time + * will be well in the future to ensure that all items are moved to the reap + * list. In all other cases though, the time will be the current time. + * + * This function enters a loop, moving the contents of the LRU list to the reap + * list again and again until either a) the lists are all empty, or b) time zero + * has been advanced sufficiently to be within the immediate element lifetime. + * + * Case a) above is detected by counting how many groups are migrated and + * stopping when they've all been moved. Case b) is detected by monitoring the + * time_zero field, which is updated as each group is migrated. + * + * The return value is the earliest time that more migration could be needed, or + * zero if there's no need to schedule more work because the lists are empty. + */ +STATIC unsigned long +_xfs_mru_cache_migrate( + xfs_mru_cache_t *mru, + unsigned long now) +{ + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; + + /* Nothing to do if the data store is empty. */ + if (!mru->time_zero) + return 0; + + /* While time zero is older than the time spanned by all the lists. */ + while (mru->time_zero <= now - mru->grp_count * mru->grp_time) { + + /* + * If the LRU list isn't empty, migrate its elements to the tail + * of the reap list. + */ + lru_list = mru->lists + mru->lru_grp; + if (!list_empty(lru_list)) + list_splice_init(lru_list, mru->reap_list.prev); + + /* + * Advance the LRU group number, freeing the old LRU list to + * become the new MRU list; advance time zero accordingly. + */ + mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count; + mru->time_zero += mru->grp_time; + + /* + * If reaping is so far behind that all the elements on all the + * lists have been migrated to the reap list, it's now empty. + */ + if (++migrated == mru->grp_count) { + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; + } + } + + /* Find the first non-empty list from the LRU end. */ + for (grp = 0; grp < mru->grp_count; grp++) { + + /* Check the grp'th list from the LRU end. */ + lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count); + if (!list_empty(lru_list)) + return mru->time_zero + + (mru->grp_count + grp) * mru->grp_time; + } + + /* All the lists must be empty. */ + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; +} + +/* + * When inserting or doing a lookup, an element needs to be inserted into the + * MRU list. The lists must be migrated first to ensure that they're + * up-to-date, otherwise the new element could be given a shorter lifetime in + * the cache than it should. + */ +STATIC void +_xfs_mru_cache_list_insert( + xfs_mru_cache_t *mru, + xfs_mru_cache_elem_t *elem) +{ + unsigned int grp = 0; + unsigned long now = jiffies; + + /* + * If the data store is empty, initialise time zero, leave grp set to + * zero and start the work queue timer if necessary. Otherwise, set grp + * to the number of group times that have elapsed since time zero. + */ + if (!_xfs_mru_cache_migrate(mru, now)) { + mru->time_zero = now; + if (!mru->queued) { + mru->queued = 1; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, + mru->grp_count * mru->grp_time); + } + } else { + grp = (now - mru->time_zero) / mru->grp_time; + grp = (mru->lru_grp + grp) % mru->grp_count; + } + + /* Insert the element at the tail of the corresponding list. */ + list_add_tail(&elem->list_node, mru->lists + grp); +} + +/* + * When destroying or reaping, all the elements that were migrated to the reap + * list need to be deleted. For each element this involves removing it from the + * data store, removing it from the reap list, calling the client's free + * function and deleting the element from the element zone. + * + * We get called holding the mru->lock, which we drop and then reacquire. + * Sparse need special help with this to tell it we know what we are doing. + */ +STATIC void +_xfs_mru_cache_clear_reap_list( + xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock) + +{ + xfs_mru_cache_elem_t *elem, *next; + struct list_head tmp; + + INIT_LIST_HEAD(&tmp); + list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { + + /* Remove the element from the data store. */ + radix_tree_delete(&mru->store, elem->key); + + /* + * remove to temp list so it can be freed without + * needing to hold the lock + */ + list_move(&elem->list_node, &tmp); + } + spin_unlock(&mru->lock); + + list_for_each_entry_safe(elem, next, &tmp, list_node) { + + /* Remove the element from the reap list. */ + list_del_init(&elem->list_node); + + /* Call the client's free function with the key and value pointer. */ + mru->free_func(elem->key, elem->value); + + /* Free the element structure. */ + kmem_zone_free(xfs_mru_elem_zone, elem); + } + + spin_lock(&mru->lock); +} + +/* + * We fire the reap timer every group expiry interval so + * we always have a reaper ready to run. This makes shutdown + * and flushing of the reaper easy to do. Hence we need to + * keep when the next reap must occur so we can determine + * at each interval whether there is anything we need to do. + */ +STATIC void +_xfs_mru_cache_reap( + struct work_struct *work) +{ + xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); + unsigned long now, next; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + next = _xfs_mru_cache_migrate(mru, jiffies); + _xfs_mru_cache_clear_reap_list(mru); + + mru->queued = next; + if ((mru->queued > 0)) { + now = jiffies; + if (next <= now) + next = 0; + else + next -= now; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, next); + } + + spin_unlock(&mru->lock); +} + +int +xfs_mru_cache_init(void) +{ + xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), + "xfs_mru_cache_elem"); + if (!xfs_mru_elem_zone) + goto out; + + xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); + if (!xfs_mru_reap_wq) + goto out_destroy_mru_elem_zone; + + return 0; + + out_destroy_mru_elem_zone: + kmem_zone_destroy(xfs_mru_elem_zone); + out: + return -ENOMEM; +} + +void +xfs_mru_cache_uninit(void) +{ + destroy_workqueue(xfs_mru_reap_wq); + kmem_zone_destroy(xfs_mru_elem_zone); +} + +/* + * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() + * with the address of the pointer, a lifetime value in milliseconds, a group + * count and a free function to use when deleting elements. This function + * returns 0 if the initialisation was successful. + */ +int +xfs_mru_cache_create( + xfs_mru_cache_t **mrup, + unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func) +{ + xfs_mru_cache_t *mru = NULL; + int err = 0, grp; + unsigned int grp_time; + + if (mrup) + *mrup = NULL; + + if (!mrup || !grp_count || !lifetime_ms || !free_func) + return EINVAL; + + if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) + return EINVAL; + + if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + return ENOMEM; + + /* An extra list is needed to avoid reaping up to a grp_time early. */ + mru->grp_count = grp_count + 1; + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + + if (!mru->lists) { + err = ENOMEM; + goto exit; + } + + for (grp = 0; grp < mru->grp_count; grp++) + INIT_LIST_HEAD(mru->lists + grp); + + /* + * We use GFP_KERNEL radix tree preload and do inserts under a + * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. + */ + INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); + INIT_LIST_HEAD(&mru->reap_list); + spin_lock_init(&mru->lock); + INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); + + mru->grp_time = grp_time; + mru->free_func = free_func; + + *mrup = mru; + +exit: + if (err && mru && mru->lists) + kmem_free(mru->lists); + if (err && mru) + kmem_free(mru); + + return err; +} + +/* + * Call xfs_mru_cache_flush() to flush out all cached entries, calling their + * free functions as they're deleted. When this function returns, the caller is + * guaranteed that all the free functions for all the elements have finished + * executing and the reaper is not running. + */ +void +xfs_mru_cache_flush( + xfs_mru_cache_t *mru) +{ + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + if (mru->queued) { + spin_unlock(&mru->lock); + cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); + spin_lock(&mru->lock); + } + + _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time); + _xfs_mru_cache_clear_reap_list(mru); + + spin_unlock(&mru->lock); +} + +void +xfs_mru_cache_destroy( + xfs_mru_cache_t *mru) +{ + if (!mru || !mru->lists) + return; + + xfs_mru_cache_flush(mru); + + kmem_free(mru->lists); + kmem_free(mru); +} + +/* + * To insert an element, call xfs_mru_cache_insert() with the data store, the + * element's key and the client data pointer. This function returns 0 on + * success or ENOMEM if memory for the data element couldn't be allocated. + */ +int +xfs_mru_cache_insert( + xfs_mru_cache_t *mru, + unsigned long key, + void *value) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return EINVAL; + + elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); + if (!elem) + return ENOMEM; + + if (radix_tree_preload(GFP_KERNEL)) { + kmem_zone_free(xfs_mru_elem_zone, elem); + return ENOMEM; + } + + INIT_LIST_HEAD(&elem->list_node); + elem->key = key; + elem->value = value; + + spin_lock(&mru->lock); + + radix_tree_insert(&mru->store, key, elem); + radix_tree_preload_end(); + _xfs_mru_cache_list_insert(mru, elem); + + spin_unlock(&mru->lock); + + return 0; +} + +/* + * To remove an element without calling the free function, call + * xfs_mru_cache_remove() with the data store and the element's key. On success + * the client data pointer for the removed element is returned, otherwise this + * function will return a NULL pointer. + */ +void * +xfs_mru_cache_remove( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + void *value = NULL; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_delete(&mru->store, key); + if (elem) { + value = elem->value; + list_del(&elem->list_node); + } + + spin_unlock(&mru->lock); + + if (elem) + kmem_zone_free(xfs_mru_elem_zone, elem); + + return value; +} + +/* + * To remove and element and call the free function, call xfs_mru_cache_delete() + * with the data store and the element's key. + */ +void +xfs_mru_cache_delete( + xfs_mru_cache_t *mru, + unsigned long key) +{ + void *value = xfs_mru_cache_remove(mru, key); + + if (value) + mru->free_func(key, value); +} + +/* + * To look up an element using its key, call xfs_mru_cache_lookup() with the + * data store and the element's key. If found, the element will be moved to the + * head of the MRU list to indicate that it's been touched. + * + * The internal data structures are protected by a spinlock that is STILL HELD + * when this function returns. Call xfs_mru_cache_done() to release it. Note + * that it is not safe to call any function that might sleep in the interim. + * + * The implementation could have used reference counting to avoid this + * restriction, but since most clients simply want to get, set or test a member + * of the returned data structure, the extra per-element memory isn't warranted. + * + * If the element isn't found, this function returns NULL and the spinlock is + * released. xfs_mru_cache_done() should NOT be called when this occurs. + * + * Because sparse isn't smart enough to know about conditional lock return + * status, we need to help it get it right by annotating the path that does + * not release the lock. + */ +void * +xfs_mru_cache_lookup( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (elem) { + list_del(&elem->list_node); + _xfs_mru_cache_list_insert(mru, elem); + __release(mru_lock); /* help sparse not be stupid */ + } else + spin_unlock(&mru->lock); + + return elem ? elem->value : NULL; +} + +/* + * To look up an element using its key, but leave its location in the internal + * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this + * function returns NULL. + * + * See the comments above the declaration of the xfs_mru_cache_lookup() function + * for important locking information pertaining to this call. + */ +void * +xfs_mru_cache_peek( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (!elem) + spin_unlock(&mru->lock); + else + __release(mru_lock); /* help sparse not be stupid */ + + return elem ? elem->value : NULL; +} + +/* + * To release the internal data structure spinlock after having performed an + * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() + * with the data store pointer. + */ +void +xfs_mru_cache_done( + xfs_mru_cache_t *mru) __releases(mru->lock) +{ + spin_unlock(&mru->lock); +} diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h new file mode 100644 index 0000000..dd58ea1 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MRU_CACHE_H__ +#define __XFS_MRU_CACHE_H__ + + +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); + +typedef struct xfs_mru_cache +{ + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +} xfs_mru_cache_t; + +int xfs_mru_cache_init(void); +void xfs_mru_cache_uninit(void); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); +void xfs_mru_cache_flush(xfs_mru_cache_t *mru); +void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); +int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, + void *value); +void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); +void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_done(struct xfs_mru_cache *mru); + +#endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c index 320d63f..a294e58 100644 --- a/fs/xfs/xfs_qmops.c +++ b/fs/xfs/xfs_qmops.c @@ -28,6 +28,8 @@ #include "xfs_mount.h" #include "xfs_quota.h" #include "xfs_error.h" +#include "xfs_clnt.h" + STATIC struct xfs_dquot * xfs_dqvopchown_default( @@ -47,24 +49,23 @@ xfs_mount_reset_sbqflags(xfs_mount_t *mp) { int error; xfs_trans_t *tp; - unsigned long s; mp->m_qflags = 0; /* * It is OK to look at sb_qflags here in mount path, - * without SB_LOCK. + * without m_sb_lock. */ if (mp->m_sb.sb_qflags == 0) return 0; - s = XFS_SB_LOCK(mp); + spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = 0; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); /* * if the fs is readonly, let the incore superblock run * with quotas off but don't flush the update out to disk */ - if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; #ifdef QUOTADEBUG xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); @@ -78,7 +79,7 @@ xfs_mount_reset_sbqflags(xfs_mount_t *mp) return error; } xfs_mod_sb(tp, XFS_SB_QFLAGS); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); return error; } @@ -110,7 +111,7 @@ xfs_noquota_init( return error; } -xfs_qmops_t xfs_qmcore_stub = { +static struct xfs_qmops xfs_qmcore_stub = { .xfs_qminit = (xfs_qminit_t) xfs_noquota_init, .xfs_qmdone = (xfs_qmdone_t) fs_noerr, .xfs_qmmount = (xfs_qmmount_t) fs_noerr, @@ -124,4 +125,30 @@ xfs_qmops_t xfs_qmcore_stub = { .xfs_dqvoprename = (xfs_dqvoprename_t) fs_noerr, .xfs_dqvopchown = xfs_dqvopchown_default, .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, + .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval, + .xfs_dqsync = (xfs_dqsync_t) fs_noerr, + .xfs_quotactl = (xfs_quotactl_t) fs_nosys, }; + +int +xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) +{ + if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) { +#ifdef CONFIG_XFS_QUOTA + mp->m_qm_ops = &xfs_qmcore_xfs; +#else + cmn_err(CE_WARN, + "XFS: qouta support not available in this kernel."); + return EINVAL; +#endif + } else { + mp->m_qm_ops = &xfs_qmcore_stub; + } + + return 0; +} + +void +xfs_qmops_put(struct xfs_mount *mp) +{ +} diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index acb853b..12c4ec7 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -154,10 +154,11 @@ typedef struct xfs_qoff_logformat { #define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) #define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_QUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD) #define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) #define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) #define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) /* * Incore only flags for quotaoff - these bits get cleared when quota(s) @@ -281,8 +282,6 @@ typedef struct xfs_qoff_logformat { XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ XFS_GQUOTA_ACCT) -#define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) /* @@ -331,12 +330,12 @@ typedef struct xfs_dqtrxops { } xfs_dqtrxops_t; #define XFS_DQTRXOP(mp, tp, op, args...) \ - ((mp)->m_qm_ops.xfs_dqtrxops ? \ - ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : 0) + ((mp)->m_qm_ops->xfs_dqtrxops ? \ + ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : 0) #define XFS_DQTRXOP_VOID(mp, tp, op, args...) \ - ((mp)->m_qm_ops.xfs_dqtrxops ? \ - ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : (void)0) + ((mp)->m_qm_ops->xfs_dqtrxops ? \ + ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : (void)0) #define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \ XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp) @@ -365,7 +364,7 @@ typedef struct xfs_dqtrxops { extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); extern int xfs_mount_reset_sbqflags(struct xfs_mount *); -extern struct bhv_module_vfsops xfs_qmops; +extern struct xfs_qmops xfs_qmcore_xfs; #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index d98171d..c903130 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -22,6 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -35,9 +36,9 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_refcache.h" #include "xfs_utils.h" #include "xfs_trans_space.h" +#include "xfs_vnodeops.h" /* @@ -54,95 +55,32 @@ xfs_rename_unlock4( xfs_iunlock(i_tab[0], lock_mode); for (i = 1; i < 4; i++) { - if (i_tab[i] == NULL) { + if (i_tab[i] == NULL) break; - } + /* * Watch out for duplicate entries in the table. */ - if (i_tab[i] != i_tab[i-1]) { + if (i_tab[i] != i_tab[i-1]) xfs_iunlock(i_tab[i], lock_mode); - } } } -#ifdef DEBUG -int xfs_rename_skip, xfs_rename_nskip; -#endif - /* - * The following routine will acquire the locks required for a rename - * operation. The code understands the semantics of renames and will - * validate that name1 exists under dp1 & that name2 may or may not - * exist under dp2. - * - * We are renaming dp1/name1 to dp2/name2. - * - * Return ENOENT if dp1 does not exist, other lookup errors, or 0 for success. + * Enter all inodes for a rename transaction into a sorted array. */ -STATIC int -xfs_lock_for_rename( - xfs_inode_t *dp1, /* old (source) directory inode */ - xfs_inode_t *dp2, /* new (target) directory inode */ - bhv_vname_t *vname1,/* old entry name */ - bhv_vname_t *vname2,/* new entry name */ - xfs_inode_t **ipp1, /* inode of old entry */ - xfs_inode_t **ipp2, /* inode of new entry, if it +STATIC void +xfs_sort_for_rename( + xfs_inode_t *dp1, /* in: old (source) directory inode */ + xfs_inode_t *dp2, /* in: new (target) directory inode */ + xfs_inode_t *ip1, /* in: inode of old entry */ + xfs_inode_t *ip2, /* in: inode of new entry, if it already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* array of inode returned, sorted */ - int *num_inodes) /* number of inodes in array */ + xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ + int *num_inodes) /* out: number of inodes in array */ { - xfs_inode_t *ip1, *ip2, *temp; - xfs_ino_t inum1, inum2; - int error; + xfs_inode_t *temp; int i, j; - uint lock_mode; - int diff_dirs = (dp1 != dp2); - - ip2 = NULL; - - /* - * First, find out the current inums of the entries so that we - * can determine the initial locking order. We'll have to - * sanity check stuff after all the locks have been acquired - * to see if we still have the right inodes, directories, etc. - */ - lock_mode = xfs_ilock_map_shared(dp1); - error = xfs_get_dir_entry(vname1, &ip1); - if (error) { - xfs_iunlock_map_shared(dp1, lock_mode); - return error; - } - - inum1 = ip1->i_ino; - - ASSERT(ip1); - ITRACE(ip1); - - /* - * Unlock dp1 and lock dp2 if they are different. - */ - - if (diff_dirs) { - xfs_iunlock_map_shared(dp1, lock_mode); - lock_mode = xfs_ilock_map_shared(dp2); - } - - error = xfs_dir_lookup_int(XFS_ITOBHV(dp2), lock_mode, - vname2, &inum2, &ip2); - if (error == ENOENT) { /* target does not need to exist. */ - inum2 = 0; - } else if (error) { - /* - * If dp2 and dp1 are the same, the next line unlocks dp1. - * Got it? - */ - xfs_iunlock_map_shared(dp2, lock_mode); - IRELE (ip1); - return error; - } else { - ITRACE(ip2); - } /* * i_tab contains a list of pointers to inodes. We initialize @@ -154,20 +92,20 @@ xfs_lock_for_rename( i_tab[0] = dp1; i_tab[1] = dp2; i_tab[2] = ip1; - if (inum2 == 0) { - *num_inodes = 3; - i_tab[3] = NULL; - } else { + if (ip2) { *num_inodes = 4; i_tab[3] = ip2; + } else { + *num_inodes = 3; + i_tab[3] = NULL; } /* * Sort the elements via bubble sort. (Remember, there are at * most 4 elements to sort, so this is adequate.) */ - for (i=0; i < *num_inodes; i++) { - for (j=1; j < *num_inodes; j++) { + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { temp = i_tab[j]; i_tab[j] = i_tab[j-1]; @@ -175,45 +113,6 @@ xfs_lock_for_rename( } } } - - /* - * We have dp2 locked. If it isn't first, unlock it. - * If it is first, tell xfs_lock_inodes so it can skip it - * when locking. if dp1 == dp2, xfs_lock_inodes will skip both - * since they are equal. xfs_lock_inodes needs all these inodes - * so that it can unlock and retry if there might be a dead-lock - * potential with the log. - */ - - if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) { -#ifdef DEBUG - xfs_rename_skip++; -#endif - xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED); - } else { -#ifdef DEBUG - xfs_rename_nskip++; -#endif - xfs_iunlock_map_shared(dp2, lock_mode); - xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); - } - - /* - * Set the return value. Null out any unused entries in i_tab. - */ - *ipp1 = *ipp2 = NULL; - for (i=0; i < *num_inodes; i++) { - if (i_tab[i]->i_ino == inum1) { - *ipp1 = i_tab[i]; - } - if (i_tab[i]->i_ino == inum2) { - *ipp2 = i_tab[i]; - } - } - for (;i < 4; i++) { - i_tab[i] = NULL; - } - return 0; } /* @@ -221,15 +120,15 @@ xfs_lock_for_rename( */ int xfs_rename( - bhv_desc_t *src_dir_bdp, - bhv_vname_t *src_vname, - bhv_vnode_t *target_dir_vp, - bhv_vname_t *target_vname, - cred_t *credp) + xfs_inode_t *src_dp, + struct xfs_name *src_name, + xfs_inode_t *src_ip, + xfs_inode_t *target_dp, + struct xfs_name *target_name, + xfs_inode_t *target_ip) { - xfs_trans_t *tp; - xfs_inode_t *src_dp, *target_dp, *src_ip, *target_ip; - xfs_mount_t *mp; + xfs_trans_t *tp = NULL; + xfs_mount_t *mp = src_dp->i_mount; int new_parent; /* moving to a new dir */ int src_is_directory; /* src_name is a directory */ int error; @@ -238,105 +137,45 @@ xfs_rename( int cancel_flags; int committed; xfs_inode_t *inodes[4]; - int target_ip_dropped = 0; /* dropped target_ip link? */ - bhv_vnode_t *src_dir_vp; int spaceres; - int target_link_zero = 0; int num_inodes; - char *src_name = VNAME(src_vname); - char *target_name = VNAME(target_vname); - int src_namelen = VNAMELEN(src_vname); - int target_namelen = VNAMELEN(target_vname); - src_dir_vp = BHV_TO_VNODE(src_dir_bdp); - vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address); - vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address); + xfs_itrace_entry(src_dp); + xfs_itrace_entry(target_dp); - /* - * Find the XFS behavior descriptor for the target directory - * vnode since it was not handed to us. - */ - target_dp = xfs_vtoi(target_dir_vp); - if (target_dp == NULL) { - return XFS_ERROR(EXDEV); - } - - src_dp = XFS_BHVTOI(src_dir_bdp); - mp = src_dp->i_mount; - - if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) || - DM_EVENT_ENABLED(target_dir_vp->v_vfsp, - target_dp, DM_EVENT_RENAME)) { + if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) || + DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, - src_dir_vp, DM_RIGHT_NULL, - target_dir_vp, DM_RIGHT_NULL, - src_name, target_name, + src_dp, DM_RIGHT_NULL, + target_dp, DM_RIGHT_NULL, + src_name->name, target_name->name, 0, 0, 0); - if (error) { + if (error) return error; - } } /* Return through std_return after this point. */ - /* - * Lock all the participating inodes. Depending upon whether - * the target_name exists in the target directory, and - * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. - * xfs_lock_for_rename() will return ENOENT if src_name - * does not exist in the source directory. - */ - tp = NULL; - error = xfs_lock_for_rename(src_dp, target_dp, src_vname, - target_vname, &src_ip, &target_ip, inodes, - &num_inodes); - - if (error) { - /* - * We have nothing locked, no inode references, and - * no transaction, so just get out. - */ - goto std_return; - } - - ASSERT(src_ip != NULL); + new_parent = (src_dp != target_dp); + src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); - if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (src_is_directory) { /* * Check for link count overflow on target_dp */ - if (target_ip == NULL && (src_dp != target_dp) && + if (target_ip == NULL && new_parent && target_dp->i_d.di_nlink >= XFS_MAXLINK) { error = XFS_ERROR(EMLINK); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; + goto std_return; } } - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { - error = XFS_ERROR(EXDEV); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; - } - - new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); - - /* - * Drop the locks on our inodes so that we can start the transaction. - */ - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); XFS_BMAP_INIT(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen); + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); if (error == ENOSPC) { @@ -346,7 +185,7 @@ xfs_rename( } if (error) { xfs_trans_cancel(tp, 0); - goto rele_return; + goto std_return; } /* @@ -354,13 +193,29 @@ xfs_rename( */ if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) { xfs_trans_cancel(tp, cancel_flags); - goto rele_return; + goto std_return; } /* - * Reacquire the inode locks we dropped above. + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. */ - xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL); + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { + error = XFS_ERROR(EXDEV); + xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + } /* * Join all the inodes to the transaction. From this point on, @@ -370,19 +225,19 @@ xfs_rename( * them when they unlock the inodes. Also, we need to be careful * not to add an inode to the transaction more than once. */ - VN_HOLD(src_dir_vp); + IHOLD(src_dp); xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) { - VN_HOLD(target_dir_vp); + IHOLD(target_dp); xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); } - if ((src_ip != src_dp) && (src_ip != target_dp)) { - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - } - if ((target_ip != NULL) && - (target_ip != src_ip) && - (target_ip != src_dp) && - (target_ip != target_dp)) { + + IHOLD(src_ip); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + + if (target_ip) { + IHOLD(target_ip); xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); } @@ -394,9 +249,8 @@ xfs_rename( * If there's no space reservation, check the entry will * fit before actually inserting it. */ - if (spaceres == 0 && - (error = xfs_dir_canenter(tp, target_dp, target_name, - target_namelen))) + error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); + if (error) goto error_return; /* * If target does not exist and the rename crosses @@ -404,8 +258,8 @@ xfs_rename( * to account for the ".." reference from the new entry. */ error = xfs_dir_createname(tp, target_dp, target_name, - target_namelen, src_ip->i_ino, - &first_block, &free_list, spaceres); + src_ip->i_ino, &first_block, + &free_list, spaceres); if (error == ENOSPC) goto error_return; if (error) @@ -444,7 +298,7 @@ xfs_rename( * name at the destination directory, remove it first. */ error = xfs_dir_replace(tp, target_dp, target_name, - target_namelen, src_ip->i_ino, + src_ip->i_ino, &first_block, &free_list, spaceres); if (error) goto abort_return; @@ -457,7 +311,6 @@ xfs_rename( error = xfs_droplink(tp, target_ip); if (error) goto abort_return; - target_ip_dropped = 1; if (src_is_directory) { /* @@ -467,10 +320,6 @@ xfs_rename( if (error) goto abort_return; } - - /* Do this test while we still hold the locks */ - target_link_zero = (target_ip)->i_d.di_nlink==0; - } /* target_ip != NULL */ /* @@ -481,27 +330,24 @@ xfs_rename( * Rewrite the ".." entry to point to the new * directory. */ - error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino, + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, &first_block, &free_list, spaceres); ASSERT(error != EEXIST); if (error) goto abort_return; - xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - } else { - /* - * We always want to hit the ctime on the source inode. - * We do it in the if clause above for the 'new_parent && - * src_is_directory' case, and here we get all the other - * cases. This isn't strictly required by the standards - * since the source inode isn't really being changed, - * but old unix file systems did it and some incremental - * backup programs won't work without it. - */ - xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); } /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); + + /* * Adjust the link count on src_dp. This is necessary when * renaming a directory, either within one parent when * the target existed, or across two parent directories. @@ -517,8 +363,8 @@ xfs_rename( goto abort_return; } - error = xfs_dir_removename(tp, src_dp, src_name, src_namelen, - src_ip->i_ino, &first_block, &free_list, spaceres); + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); if (error) goto abort_return; xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -536,15 +382,6 @@ xfs_rename( } /* - * If there was a target inode, take an extra reference on - * it here so that it doesn't go to xfs_inactive() from - * within the commit. - */ - if (target_ip != NULL) { - IHOLD(target_ip); - } - - /* * If this is a synchronous mount, make sure that the * rename transaction goes to disk before returning to * the user. @@ -553,30 +390,11 @@ xfs_rename( xfs_trans_set_sync(tp); } - /* - * Take refs. for vop_link_removed calls below. No need to worry - * about directory refs. because the caller holds them. - * - * Do holds before the xfs_bmap_finish since it might rele them down - * to zero. - */ - - if (target_ip_dropped) - IHOLD(target_ip); - IHOLD(src_ip); - - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); - if (target_ip != NULL) { - IRELE(target_ip); - } - if (target_ip_dropped) { - IRELE(target_ip); - } - IRELE(src_ip); goto std_return; } @@ -584,32 +402,17 @@ xfs_rename( * trans_commit will unlock src_ip, target_ip & decrement * the vnode references. */ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (target_ip != NULL) { - xfs_refcache_purge_ip(target_ip); - IRELE(target_ip); - } - /* - * Let interposed file systems know about removed links. - */ - if (target_ip_dropped) { - bhv_vop_link_removed(XFS_ITOV(target_ip), target_dir_vp, - target_link_zero); - IRELE(target_ip); - } - - IRELE(src_ip); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); /* Fall through to std_return with error = 0 or errno from * xfs_trans_commit */ std_return: - if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) || - DM_EVENT_ENABLED(target_dir_vp->v_vfsp, - target_dp, DM_EVENT_POSTRENAME)) { + if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) || + DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) { (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, - src_dir_vp, DM_RIGHT_NULL, - target_dir_vp, DM_RIGHT_NULL, - src_name, target_name, + src_dp, DM_RIGHT_NULL, + target_dp, DM_RIGHT_NULL, + src_name->name, target_name->name, 0, error, 0); } return error; @@ -621,11 +424,4 @@ std_return: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, cancel_flags); goto std_return; - - rele_return: - IRELE(src_ip); - if (target_ip != NULL) { - IRELE(target_ip); - } - goto std_return; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5a0b678..e2f68de 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -44,6 +44,7 @@ #include "xfs_rw.h" #include "xfs_inode_item.h" #include "xfs_trans_space.h" +#include "xfs_utils.h" /* @@ -73,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, */ /* - * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set. - */ -STATIC int -xfs_lowbit32( - __uint32_t v) -{ - if (v) - return ffs(v) - 1; - return -1; -} - -/* * Allocate space to the bitmap or summary file, and zero it, for growfs. */ STATIC int /* error */ @@ -123,14 +112,14 @@ xfs_growfs_rt_alloc( XFS_GROWRTALLOC_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_PERM_LOG_COUNT))) - goto error_exit; + goto error_cancel; cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* * Lock the inode. */ if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + goto error_cancel; XFS_BMAP_INIT(&flist, &firstblock); /* * Allocate blocks to the bitmap file. @@ -143,14 +132,16 @@ xfs_growfs_rt_alloc( if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) - goto error_exit; + goto error_cancel; /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, firstblock, &committed); + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) - goto error_exit; - xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + goto error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. @@ -165,13 +156,13 @@ xfs_growfs_rt_alloc( */ if ((error = xfs_trans_reserve(tp, 0, XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) - goto error_exit; + goto error_cancel; /* * Lock the bitmap inode. */ if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + goto error_cancel; /* * Get a buffer for the block. */ @@ -180,14 +171,16 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0); if (bp == NULL) { error = XFS_ERROR(EIO); - goto error_exit; + goto error_cancel; } memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* * Commit the transaction. */ - xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); + if (error) + goto error; } /* * Go on to the next extent, if any. @@ -195,8 +188,9 @@ xfs_growfs_rt_alloc( oblocks = map.br_startoff + map.br_blockcount; } return 0; -error_exit: +error_cancel: xfs_trans_cancel(tp, cancelflags); +error: return error; } @@ -444,6 +438,7 @@ xfs_rtallocate_extent_near( } bbno = XFS_BITTOBLOCK(mp, bno); i = 0; + ASSERT(minlen != 0); log2len = xfs_highbit32(minlen); /* * Loop over all bitmap blocks (bbno + i is current block). @@ -612,6 +607,8 @@ xfs_rtallocate_extent_size( xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(maxlen != 0); + /* * Loop over all the levels starting with maxlen. * At each level, look at all the bitmap blocks, to see if there @@ -669,6 +666,9 @@ xfs_rtallocate_extent_size( *rtblock = NULLRTBLOCK; return 0; } + ASSERT(minlen != 0); + ASSERT(maxlen != 0); + /* * Loop over sizes, from maxlen down to minlen. * This time, when we do the allocations, allow smaller ones @@ -913,57 +913,6 @@ xfs_rtcheck_alloc_range( } #endif -#ifdef DEBUG -/* - * Check whether the given block in the bitmap has the given value. - */ -STATIC int /* 1 for matches, 0 for not */ -xfs_rtcheck_bit( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* bit (block) to check */ - int val) /* 1 for free, 0 for allocated */ -{ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* pointer into the buffer */ - /* REFERENCED */ - int error; /* error value */ - xfs_rtword_t wdiff; /* difference between bit & expected */ - int word; /* word number in the buffer */ - xfs_rtword_t wval; /* word value from buffer */ - - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = XFS_BITTOWORD(mp, start); - bit = (int)(start & (XFS_NBWORD - 1)); - wval = bufp[word]; - xfs_trans_brelse(tp, bp); - wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit); - return !wdiff; -} -#endif /* DEBUG */ - -#if 0 -/* - * Check that the given extent (block range) is free already. - */ -STATIC int /* error */ -xfs_rtcheck_free_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* out: 1 for free, 0 for not */ -{ - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - - return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat); -} -#endif - /* * Check that the given range is either all allocated (val = 0) or * all free (val = 1). @@ -1926,6 +1875,7 @@ xfs_growfs_rt( xfs_trans_t *tp; /* transaction pointer */ sbp = &mp->m_sb; + cancelflags = 0; /* * Initial error checking. */ @@ -1933,11 +1883,13 @@ xfs_growfs_rt( (nrblocks = in->newblocks) <= sbp->sb_rblocks || (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + return error; /* * Read in the last block of the device, make sure it exists. */ error = xfs_read_buf(mp, mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), + XFS_FSB_TO_BB(mp, nrblocks - 1), XFS_FSB_TO_BB(mp, 1), 0, &bp); if (error) return error; @@ -1948,7 +1900,7 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - nrbmblocks = roundup_64(nrextents, NBBY * sbp->sb_blocksize); + nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); nrextslog = xfs_highbit32(nrextents); nrsumlevels = nrextslog + 1; nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; @@ -1976,7 +1928,10 @@ xfs_growfs_rt( if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_sb.sb_rsumino))) return error; - nmp = NULL; + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. @@ -1987,10 +1942,6 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { - /* - * Allocate a new (fake) mount/sb. - */ - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); *nmp = *mp; nsbp = &nmp->m_sb; /* @@ -2004,6 +1955,7 @@ xfs_growfs_rt( nsbp->sb_blocksize * nsbp->sb_rextsize); nsbp->sb_rextents = nsbp->sb_rblocks; do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + ASSERT(nsbp->sb_rextents != 0); nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumsize = @@ -2018,13 +1970,13 @@ xfs_growfs_rt( cancelflags = 0; if ((error = xfs_trans_reserve(tp, 0, XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) - goto error_exit; + break; /* * Lock out other callers by grabbing the bitmap inode lock. */ if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + break; ASSERT(ip == mp->m_rbmip); /* * Update the bitmap inode's size. @@ -2038,7 +1990,7 @@ xfs_growfs_rt( */ if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + break; ASSERT(ip == mp->m_rsumip); /* * Update the summary inode's size. @@ -2053,7 +2005,7 @@ xfs_growfs_rt( mp->m_rsumlevels != nmp->m_rsumlevels) { error = xfs_rtcopy_summary(mp, nmp, tp); if (error) - goto error_exit; + break; } /* * Update superblock fields. @@ -2080,36 +2032,33 @@ xfs_growfs_rt( error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); if (error) - goto error_exit; + break; /* * Mark more blocks free in the superblock. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, nsbp->sb_rextents - sbp->sb_rextents); /* - * Free the fake mp structure. - */ - kmem_free(nmp, sizeof(*nmp)); - nmp = NULL; - /* * Update mp values into the real mp structure. */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - /* - * Commit the transaction. - */ - xfs_trans_commit(tp, 0, NULL); + + error = xfs_trans_commit(tp, 0); + if (error) { + tp = NULL; + break; + } } - return 0; + + if (error && tp) + xfs_trans_cancel(tp, cancelflags); /* - * Error paths come here. + * Free the fake mp structure. */ -error_exit: - if (nmp) - kmem_free(nmp, sizeof(*nmp)); - xfs_trans_cancel(tp, cancelflags); + kmem_free(nmp); + return error; } @@ -2333,7 +2282,7 @@ xfs_rtmount_inodes( ASSERT(sbp->sb_rsumino != NULLFSINO); error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); if (error) { - VN_RELE(XFS_ITOV(mp->m_rbmip)); + IRELE(mp->m_rbmip); return error; } ASSERT(mp->m_rsumip != NULL); @@ -2388,60 +2337,3 @@ xfs_rtpick_extent( *pick = b; return 0; } - -#ifdef DEBUG -/* - * Debug code: print out the value of a range in the bitmap. - */ -void -xfs_rtprint_range( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to print */ - xfs_extlen_t len) /* length to print */ -{ - xfs_extlen_t i; /* block number in the extent */ - - cmn_err(CE_DEBUG, "%Ld: ", (long long)start); - for (i = 0; i < len; i++) - cmn_err(CE_DEBUG, "%d", xfs_rtcheck_bit(mp, tp, start + i, 1)); - cmn_err(CE_DEBUG, "\n"); -} - -/* - * Debug code: print the summary file. - */ -void -xfs_rtprint_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp) /* transaction pointer */ -{ - xfs_suminfo_t c; /* summary data */ - xfs_rtblock_t i; /* bitmap block number */ - int l; /* summary information level */ - int p; /* flag for printed anything */ - xfs_fsblock_t sb; /* summary block number */ - xfs_buf_t *sumbp; /* summary block buffer */ - - sumbp = NULL; - for (l = 0; l < mp->m_rsumlevels; l++) { - for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - (void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c); - if (c) { - if (!p) { - cmn_err(CE_DEBUG, "%Ld-%Ld:", 1LL << l, - XFS_RTMIN((1LL << l) + - ((1LL << l) - 1LL), - mp->m_sb.sb_rextents)); - p = 1; - } - cmn_err(CE_DEBUG, " %Ld:%d", (long long)i, c); - } - } - if (p) - cmn_err(CE_DEBUG, "\n"); - } - if (sumbp) - xfs_trans_brelse(tp, sumbp); -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 0e0b4d2..8d8dcd2 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -21,8 +21,6 @@ struct xfs_mount; struct xfs_trans; -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) - /* Min and max rt extent sizes, specified in bytes */ #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */ @@ -134,24 +132,6 @@ xfs_rtpick_extent( xfs_rtblock_t *pick); /* result rt extent */ /* - * Debug code: print out the value of a range in the bitmap. - */ -void -xfs_rtprint_range( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to print */ - xfs_extlen_t len); /* length to print */ - -/* - * Debug code: print the summary file. - */ -void -xfs_rtprint_summary( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp); /* transaction pointer */ - -/* * Grow the realtime area of the filesystem. */ int diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index defb2fe..3a82576 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -42,7 +42,6 @@ #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_acl.h" -#include "xfs_mac.h" #include "xfs_error.h" #include "xfs_buf_item.h" #include "xfs_rw.h" @@ -84,7 +83,7 @@ xfs_write_clear_setuid( } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } @@ -127,11 +126,11 @@ xfs_write_sync_logforce( * when we return. */ if (iip && iip->ili_last_lsn) { - xfs_log_force(mp, iip->ili_last_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC); + error = _xfs_log_force(mp, iip->ili_last_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); } else if (xfs_ipincount(ip) > 0) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE | XFS_LOG_SYNC); + error = _xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); } } else { @@ -165,7 +164,7 @@ xfs_write_sync_logforce( xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); } } @@ -179,18 +178,15 @@ xfs_write_sync_logforce( * the shop, make sure that absolutely nothing persistent happens to * this filesystem after this point. */ - void xfs_do_force_shutdown( - bhv_desc_t *bdp, + xfs_mount_t *mp, int flags, char *fname, int lnnum) { int logerror; - xfs_mount_t *mp; - mp = XFS_BHVTOM(bdp); logerror = flags & SHUTDOWN_LOG_IO_ERROR; if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { @@ -318,7 +314,7 @@ xfs_bioerror_relse( * ASYNC buffers. */ XFS_BUF_ERROR(bp, EIO); - XFS_BUF_V_IODONESEMA(bp); + XFS_BUF_FINISH_IOWAIT(bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 188b296..f87db53 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -23,32 +23,6 @@ struct xfs_inode; struct xfs_mount; /* - * Maximum count of bmaps used by read and write paths. - */ -#define XFS_MAX_RW_NBMAPS 4 - -/* - * Counts of readahead buffers to use based on physical memory size. - * None of these should be more than XFS_MAX_RW_NBMAPS. - */ -#define XFS_RW_NREADAHEAD_16MB 2 -#define XFS_RW_NREADAHEAD_32MB 3 -#define XFS_RW_NREADAHEAD_K32 4 -#define XFS_RW_NREADAHEAD_K64 4 - -/* - * Maximum size of a buffer that we\'ll map. Making this - * too big will degrade performance due to the number of - * pages which need to be gathered. Making it too small - * will prevent us from doing large I/O\'s to hardware that - * needs it. - * - * This is currently set to 512 KB. - */ -#define XFS_MAX_BMAP_LEN_BB 1024 -#define XFS_MAX_BMAP_LEN_BYTES 524288 - -/* * Convert the given file system block to a disk block. * We have to treat it differently based on whether the * file is a real time file or not, because the bmap code @@ -58,17 +32,37 @@ struct xfs_mount; static inline xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { - return (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) ? \ + return (XFS_IS_REALTIME_INODE(ip) ? \ (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); } -#define XFS_FSB_TO_DB_IO(io,fsb) xfs_fsb_to_db_io(io,fsb) -static inline xfs_daddr_t -xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb) + +/* + * Flags for xfs_free_eofblocks + */ +#define XFS_FREE_EOF_LOCK (1<<0) +#define XFS_FREE_EOF_NOLOCK (1<<1) + + +/* + * helper function to extract extent size hint from inode + */ +STATIC_INLINE xfs_extlen_t +xfs_get_extsz_hint( + xfs_inode_t *ip) { - return (((io)->io_flags & XFS_IOCORE_RT) ? \ - XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((io)->io_mount, (fsb))); + xfs_extlen_t extsz; + + if (unlikely(XFS_IS_REALTIME_INODE(ip))) { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize + : ip->i_mount->m_sb.sb_rextsize; + ASSERT(extsz); + } else { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize : 0; + } + return extsz; } /* @@ -88,13 +82,7 @@ extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, /* * Prototypes for functions in xfs_vnodeops.c. */ -extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); -extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); -extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags, - cred_t *credp); -extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf, - xfs_off_t offset, cred_t *credp, int flags); -extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state, - cred_t *credp); +extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, + int flags); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index bf168a9..3f8cf15 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -46,10 +46,12 @@ struct xfs_mount; #define XFS_SB_VERSION_SECTORBIT 0x0800 #define XFS_SB_VERSION_EXTFLGBIT 0x1000 #define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 #define XFS_SB_VERSION_OKSASHFBITS \ (XFS_SB_VERSION_EXTFLGBIT | \ - XFS_SB_VERSION_DIRV2BIT) + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_BORGBIT) #define XFS_SB_VERSION_OKREALFBITS \ (XFS_SB_VERSION_ATTRBIT | \ XFS_SB_VERSION_NLINKBIT | \ @@ -60,10 +62,6 @@ struct xfs_mount; XFS_SB_VERSION_LOGV2BIT | \ XFS_SB_VERSION_SECTORBIT | \ XFS_SB_VERSION_MOREBITSBIT) -#define XFS_SB_VERSION_OKSASHBITS \ - (XFS_SB_VERSION_NUMBITS | \ - XFS_SB_VERSION_REALFBITS | \ - XFS_SB_VERSION_OKSASHFBITS) #define XFS_SB_VERSION_OKREALBITS \ (XFS_SB_VERSION_NUMBITS | \ XFS_SB_VERSION_OKREALFBITS | \ @@ -78,23 +76,24 @@ struct xfs_mount; */ #define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ #define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 -#define XFS_SB_VERSION2_RESERVED2BIT 0x00000002 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ -#define XFS_SB_VERSION2_SASHFBITS 0xff000000 /* Mask: features that - require changing - PROM and SASH */ #define XFS_SB_VERSION2_OKREALFBITS \ - (XFS_SB_VERSION2_ATTR2BIT) + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT) #define XFS_SB_VERSION2_OKSASHFBITS \ (0) #define XFS_SB_VERSION2_OKREALBITS \ (XFS_SB_VERSION2_OKREALFBITS | \ XFS_SB_VERSION2_OKSASHFBITS ) -typedef struct xfs_sb -{ +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_sb { __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ __uint32_t sb_blocksize; /* logical block size, bytes */ xfs_drfsbno_t sb_dblocks; /* number of data blocks */ @@ -149,9 +148,89 @@ typedef struct xfs_sb __uint16_t sb_logsectsize; /* sector size for the log, bytes */ __uint32_t sb_logsunit; /* stripe unit size for the log */ __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __uint32_t sb_bad_features2; + + /* must be padded to 64 bit alignment */ } xfs_sb_t; /* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + +/* * Sequence number values for the fields. */ typedef enum { @@ -167,7 +246,7 @@ typedef enum { XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FIELDCOUNT } xfs_sb_field_t; @@ -188,14 +267,19 @@ typedef enum { #define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) #define XFS_SB_UNIT XFS_SB_MVAL(UNIT) #define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) +#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) +#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) #define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) +#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) #define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) #define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) #define XFS_SB_MOD_BITS \ (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_FEATURES2) + XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ + XFS_SB_BAD_FEATURES2) /* @@ -212,7 +296,6 @@ typedef enum { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -#define XFS_SB_GOOD_VERSION(sbp) xfs_sb_good_version(sbp) #ifdef __KERNEL__ static inline int xfs_sb_good_version(xfs_sb_t *sbp) { @@ -238,13 +321,15 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp) } #endif /* __KERNEL__ */ -#define XFS_SB_GOOD_SASH_VERSION(sbp) \ - ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \ - ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS))) +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp) +{ + return (sbp->sb_bad_features2 != sbp->sb_features2); +} -#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v) static inline unsigned xfs_sb_version_tonew(unsigned v) { return ((((v) == XFS_SB_VERSION_1) ? \ @@ -255,7 +340,6 @@ static inline unsigned xfs_sb_version_tonew(unsigned v) XFS_SB_VERSION_4); } -#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v) static inline unsigned xfs_sb_version_toold(unsigned v) { return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ @@ -267,7 +351,6 @@ static inline unsigned xfs_sb_version_toold(unsigned v) XFS_SB_VERSION_1))); } -#define XFS_SB_VERSION_HASATTR(sbp) xfs_sb_version_hasattr(sbp) static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) { return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ @@ -276,7 +359,6 @@ static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); } -#define XFS_SB_VERSION_ADDATTR(sbp) xfs_sb_version_addattr(sbp) static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) { (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ @@ -286,7 +368,6 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))); } -#define XFS_SB_VERSION_HASNLINK(sbp) xfs_sb_version_hasnlink(sbp) static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) { return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ @@ -294,7 +375,6 @@ static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); } -#define XFS_SB_VERSION_ADDNLINK(sbp) xfs_sb_version_addnlink(sbp) static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) { (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ @@ -302,115 +382,69 @@ static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)); } -#define XFS_SB_VERSION_HASQUOTA(sbp) xfs_sb_version_hasquota(sbp) static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } -#define XFS_SB_VERSION_ADDQUOTA(sbp) xfs_sb_version_addquota(sbp) static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) { (sbp)->sb_versionnum = \ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ - (XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \ + (xfs_sb_version_tonew((sbp)->sb_versionnum) | \ XFS_SB_VERSION_QUOTABIT)); } -#define XFS_SB_VERSION_HASALIGN(sbp) xfs_sb_version_hasalign(sbp) static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); } -#define XFS_SB_VERSION_SUBALIGN(sbp) xfs_sb_version_subalign(sbp) -static inline void xfs_sb_version_subalign(xfs_sb_t *sbp) -{ - (sbp)->sb_versionnum = \ - XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT); -} - -#define XFS_SB_VERSION_HASDALIGN(sbp) xfs_sb_version_hasdalign(sbp) static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } -#define XFS_SB_VERSION_ADDDALIGN(sbp) xfs_sb_version_adddalign(sbp) -static inline int xfs_sb_version_adddalign(xfs_sb_t *sbp) -{ - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT); -} - -#define XFS_SB_VERSION_HASSHARED(sbp) xfs_sb_version_hasshared(sbp) static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); } -#define XFS_SB_VERSION_ADDSHARED(sbp) xfs_sb_version_addshared(sbp) -static inline int xfs_sb_version_addshared(xfs_sb_t *sbp) -{ - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT); -} - -#define XFS_SB_VERSION_SUBSHARED(sbp) xfs_sb_version_subshared(sbp) -static inline int xfs_sb_version_subshared(xfs_sb_t *sbp) -{ - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT); -} - -#define XFS_SB_VERSION_HASDIRV2(sbp) xfs_sb_version_hasdirv2(sbp) static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); } -#define XFS_SB_VERSION_HASLOGV2(sbp) xfs_sb_version_haslogv2(sbp) static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) xfs_sb_version_hasextflgbit(sbp) static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); } -#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) xfs_sb_version_addextflgbit(sbp) -static inline int xfs_sb_version_addextflgbit(xfs_sb_t *sbp) -{ - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT); -} - -#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) xfs_sb_version_subextflgbit(sbp) -static inline int xfs_sb_version_subextflgbit(xfs_sb_t *sbp) +static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) { - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } -#define XFS_SB_VERSION_HASSECTOR(sbp) xfs_sb_version_hassector(sbp) -static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) +static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); + (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } -#define XFS_SB_VERSION_HASMOREBITS(sbp) xfs_sb_version_hasmorebits(sbp) static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ @@ -423,18 +457,22 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro: * * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp) - * ((XFS_SB_VERSION_HASMOREBITS(sbp) && + * ((xfs_sb_version_hasmorebits(sbp) && * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) */ -#define XFS_SB_VERSION_HASATTR2(sbp) xfs_sb_version_hasattr2(sbp) +static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) +{ + return (xfs_sb_version_hasmorebits(sbp) && \ + ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); +} + static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_HASMOREBITS(sbp)) && \ + return (xfs_sb_version_hasmorebits(sbp)) && \ ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); } -#define XFS_SB_VERSION_ADDATTR2(sbp) xfs_sb_version_addattr2(sbp) static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) { ((sbp)->sb_versionnum = \ @@ -443,13 +481,20 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT))); } +static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + /* * end of superblock version macros */ #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_sb_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ @@ -461,15 +506,6 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) * File system sector to basic block conversions. */ #define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) -#define XFS_BB_TO_FSS(mp,bb) \ - (((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log) -#define XFS_BB_TO_FSST(mp,bb) ((bb) >> (mp)->m_sectbb_log) - -/* - * File system sector to byte conversions. - */ -#define XFS_FSS_TO_B(mp,sectno) ((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog) -#define XFS_B_TO_FSST(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog) /* * File system block to basic block conversions. diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ee2721e..4e1c22a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -43,6 +43,7 @@ #include "xfs_quota.h" #include "xfs_trans_priv.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); @@ -234,7 +235,7 @@ xfs_trans_alloc( xfs_mount_t *mp, uint type) { - vfs_wait_for_freeze(XFS_MTOVFS(mp), SB_FREEZE_TRANS); + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); return _xfs_trans_alloc(mp, type); } @@ -253,7 +254,7 @@ _xfs_trans_alloc( tp->t_mountp = mp; tp->t_items_free = XFS_LIC_NUM_SLOTS; tp->t_busy_free = XFS_LBC_NUM_SLOTS; - XFS_LIC_INIT(&(tp->t_items)); + xfs_lic_init(&(tp->t_items)); XFS_LBC_INIT(&(tp->t_busy)); return tp; } @@ -282,7 +283,7 @@ xfs_trans_dup( ntp->t_mountp = tp->t_mountp; ntp->t_items_free = XFS_LIC_NUM_SLOTS; ntp->t_busy_free = XFS_LBC_NUM_SLOTS; - XFS_LIC_INIT(&(ntp->t_items)); + xfs_lic_init(&(ntp->t_items)); XFS_LBC_INIT(&(ntp->t_busy)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -339,7 +340,7 @@ xfs_trans_reserve( */ if (blocks > 0) { error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, - -blocks, rsvd); + -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); return (XFS_ERROR(ENOSPC)); @@ -380,7 +381,7 @@ xfs_trans_reserve( */ if (rtextents > 0) { error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, - -rtextents, rsvd); + -((int64_t)rtextents), rsvd); if (error) { error = XFS_ERROR(ENOSPC); goto undo_log; @@ -410,7 +411,7 @@ undo_log: undo_blocks: if (blocks > 0) { (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, - blocks, rsvd); + (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -427,20 +428,34 @@ undo_blocks: * * Mark the transaction structure to indicate that the superblock * needs to be updated before committing. + * + * Because we may not be keeping track of allocated/free inodes and + * used filesystem blocks in the superblock, we do not mark the + * superblock dirty in this transaction if we modify these fields. + * We still need to update the transaction deltas so that they get + * applied to the incore superblock, but we don't want them to + * cause the superblock to get locked and logged if these are the + * only fields in the superblock that the transaction modifies. */ void xfs_trans_mod_sb( xfs_trans_t *tp, uint field, - long delta) + int64_t delta) { + uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY); + xfs_mount_t *mp = tp->t_mountp; switch (field) { case XFS_TRANS_SB_ICOUNT: tp->t_icount_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_IFREE: tp->t_ifree_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FDBLOCKS: /* @@ -453,6 +468,8 @@ xfs_trans_mod_sb( ASSERT(tp->t_blk_res_used <= tp->t_blk_res); } tp->t_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_RES_FDBLOCKS: /* @@ -462,6 +479,8 @@ xfs_trans_mod_sb( */ ASSERT(delta < 0); tp->t_res_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FREXTENTS: /* @@ -515,7 +534,7 @@ xfs_trans_mod_sb( return; } - tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY); + tp->t_flags |= flags; } /* @@ -530,7 +549,7 @@ STATIC void xfs_trans_apply_sb_deltas( xfs_trans_t *tp) { - xfs_sb_t *sbp; + xfs_dsb_t *sbp; xfs_buf_t *bp; int whole = 0; @@ -544,56 +563,55 @@ xfs_trans_apply_sb_deltas( (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + tp->t_ag_btree_delta)); - if (tp->t_icount_delta != 0) { - INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta); - } - if (tp->t_ifree_delta != 0) { - INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta); + /* + * Only update the superblock counters if we are logging them + */ + if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (tp->t_icount_delta) + be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); + if (tp->t_ifree_delta) + be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta); + if (tp->t_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta); + if (tp->t_res_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); } - if (tp->t_fdblocks_delta != 0) { - INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta); - } - if (tp->t_res_fdblocks_delta != 0) { - INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta); - } + if (tp->t_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); + if (tp->t_res_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); - if (tp->t_frextents_delta != 0) { - INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_frextents_delta); - } - if (tp->t_res_frextents_delta != 0) { - INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_res_frextents_delta); - } - if (tp->t_dblocks_delta != 0) { - INT_MOD(sbp->sb_dblocks, ARCH_CONVERT, tp->t_dblocks_delta); + if (tp->t_dblocks_delta) { + be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); whole = 1; } - if (tp->t_agcount_delta != 0) { - INT_MOD(sbp->sb_agcount, ARCH_CONVERT, tp->t_agcount_delta); + if (tp->t_agcount_delta) { + be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta); whole = 1; } - if (tp->t_imaxpct_delta != 0) { - INT_MOD(sbp->sb_imax_pct, ARCH_CONVERT, tp->t_imaxpct_delta); + if (tp->t_imaxpct_delta) { + sbp->sb_imax_pct += tp->t_imaxpct_delta; whole = 1; } - if (tp->t_rextsize_delta != 0) { - INT_MOD(sbp->sb_rextsize, ARCH_CONVERT, tp->t_rextsize_delta); + if (tp->t_rextsize_delta) { + be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta); whole = 1; } - if (tp->t_rbmblocks_delta != 0) { - INT_MOD(sbp->sb_rbmblocks, ARCH_CONVERT, tp->t_rbmblocks_delta); + if (tp->t_rbmblocks_delta) { + be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta); whole = 1; } - if (tp->t_rblocks_delta != 0) { - INT_MOD(sbp->sb_rblocks, ARCH_CONVERT, tp->t_rblocks_delta); + if (tp->t_rblocks_delta) { + be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); whole = 1; } - if (tp->t_rextents_delta != 0) { - INT_MOD(sbp->sb_rextents, ARCH_CONVERT, tp->t_rextents_delta); + if (tp->t_rextents_delta) { + be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta); whole = 1; } - if (tp->t_rextslog_delta != 0) { - INT_MOD(sbp->sb_rextslog, ARCH_CONVERT, tp->t_rextslog_delta); + if (tp->t_rextslog_delta) { + sbp->sb_rextslog += tp->t_rextslog_delta; whole = 1; } @@ -601,25 +619,37 @@ xfs_trans_apply_sb_deltas( /* * Log the whole thing, the fields are noncontiguous. */ - xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1); + xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1); else /* * Since all the modifiable fields are contiguous, we * can get away with this. */ - xfs_trans_log_buf(tp, bp, offsetof(xfs_sb_t, sb_icount), - offsetof(xfs_sb_t, sb_frextents) + + xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), + offsetof(xfs_dsb_t, sb_frextents) + sizeof(sbp->sb_frextents) - 1); - XFS_MTOVFS(tp->t_mountp)->vfs_super->s_dirt = 1; + tp->t_mountp->m_super->s_dirt = 1; } /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused - * reservations and apply superblock counter changes to the in-core - * superblock. + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations + * and apply superblock counter changes to the in-core superblock. The + * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT + * applied to the in-core superblock. The idea is that that has already been + * done. * * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). + * However, we have to ensure that we only modify each superblock field only + * once because the application of the delta values may not be atomic. That can + * lead to ENOSPC races occurring if we have two separate modifcations of the + * free space counter to put back the entire reservation and then take away + * what we used. + * + * If we are not logging superblock counters, then the inode allocated/free and + * used block counts are not updated in the on disk superblock. In this case, + * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we + * still need to update the incore superblock with the changes. */ STATIC void xfs_trans_unreserve_and_mod_sb( @@ -627,98 +657,100 @@ xfs_trans_unreserve_and_mod_sb( { xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ xfs_mod_sb_t *msbp; + xfs_mount_t *mp = tp->t_mountp; /* REFERENCED */ int error; int rsvd; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; msbp = msb; rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - /* - * Release any reserved blocks. Any that were allocated - * will be taken back again by fdblocks_delta below. - */ - if (tp->t_blk_res > 0) { + /* calculate free blocks delta */ + if (tp->t_blk_res > 0) + blkdelta = tp->t_blk_res; + + if ((tp->t_fdblocks_delta != 0) && + (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY))) + blkdelta += tp->t_fdblocks_delta; + + if (blkdelta != 0) { msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = tp->t_blk_res; + msbp->msb_delta = blkdelta; msbp++; } - /* - * Release any reserved real time extents . Any that were - * allocated will be taken back again by frextents_delta below. - */ - if (tp->t_rtx_res > 0) { + /* calculate free realtime extents delta */ + if (tp->t_rtx_res > 0) + rtxdelta = tp->t_rtx_res; + + if ((tp->t_frextents_delta != 0) && + (tp->t_flags & XFS_TRANS_SB_DIRTY)) + rtxdelta += tp->t_frextents_delta; + + if (rtxdelta != 0) { msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = tp->t_rtx_res; + msbp->msb_delta = rtxdelta; msbp++; } - /* - * Apply any superblock modifications to the in-core version. - * The t_res_fdblocks_delta and t_res_frextents_delta fields are - * explicitly NOT applied to the in-core superblock. - * The idea is that that has already been done. - */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + /* apply remaining deltas */ + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY)) { if (tp->t_icount_delta != 0) { msbp->msb_field = XFS_SBS_ICOUNT; - msbp->msb_delta = (int)tp->t_icount_delta; + msbp->msb_delta = tp->t_icount_delta; msbp++; } if (tp->t_ifree_delta != 0) { msbp->msb_field = XFS_SBS_IFREE; - msbp->msb_delta = (int)tp->t_ifree_delta; - msbp++; - } - if (tp->t_fdblocks_delta != 0) { - msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = (int)tp->t_fdblocks_delta; - msbp++; - } - if (tp->t_frextents_delta != 0) { - msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = (int)tp->t_frextents_delta; + msbp->msb_delta = tp->t_ifree_delta; msbp++; } + } + + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { if (tp->t_dblocks_delta != 0) { msbp->msb_field = XFS_SBS_DBLOCKS; - msbp->msb_delta = (int)tp->t_dblocks_delta; + msbp->msb_delta = tp->t_dblocks_delta; msbp++; } if (tp->t_agcount_delta != 0) { msbp->msb_field = XFS_SBS_AGCOUNT; - msbp->msb_delta = (int)tp->t_agcount_delta; + msbp->msb_delta = tp->t_agcount_delta; msbp++; } if (tp->t_imaxpct_delta != 0) { msbp->msb_field = XFS_SBS_IMAX_PCT; - msbp->msb_delta = (int)tp->t_imaxpct_delta; + msbp->msb_delta = tp->t_imaxpct_delta; msbp++; } if (tp->t_rextsize_delta != 0) { msbp->msb_field = XFS_SBS_REXTSIZE; - msbp->msb_delta = (int)tp->t_rextsize_delta; + msbp->msb_delta = tp->t_rextsize_delta; msbp++; } if (tp->t_rbmblocks_delta != 0) { msbp->msb_field = XFS_SBS_RBMBLOCKS; - msbp->msb_delta = (int)tp->t_rbmblocks_delta; + msbp->msb_delta = tp->t_rbmblocks_delta; msbp++; } if (tp->t_rblocks_delta != 0) { msbp->msb_field = XFS_SBS_RBLOCKS; - msbp->msb_delta = (int)tp->t_rblocks_delta; + msbp->msb_delta = tp->t_rblocks_delta; msbp++; } if (tp->t_rextents_delta != 0) { msbp->msb_field = XFS_SBS_REXTENTS; - msbp->msb_delta = (int)tp->t_rextents_delta; + msbp->msb_delta = tp->t_rextents_delta; msbp++; } if (tp->t_rextslog_delta != 0) { msbp->msb_field = XFS_SBS_REXTSLOG; - msbp->msb_delta = (int)tp->t_rextslog_delta; + msbp->msb_delta = tp->t_rextslog_delta; msbp++; } } @@ -753,7 +785,6 @@ int _xfs_trans_commit( xfs_trans_t *tp, uint flags, - xfs_lsn_t *commit_lsn_p, int *log_flushed) { xfs_log_iovec_t *log_vector; @@ -812,8 +843,6 @@ shut_us_down: xfs_trans_free_busy(tp); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); - if (commit_lsn_p) - *commit_lsn_p = commit_lsn; return (shutdown); } ASSERT(tp->t_ticket != NULL); @@ -861,12 +890,9 @@ shut_us_down: tp->t_commit_lsn = commit_lsn; if (nvec > XFS_TRANS_LOGVEC_COUNT) { - kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t)); + kmem_free(log_vector); } - if (commit_lsn_p) - *commit_lsn_p = commit_lsn; - /* * If we got a log write error. Unpin the logitems that we * had pinned, clean up, free trans structure, and return error. @@ -1144,7 +1170,7 @@ xfs_trans_cancel( while (licp != NULL) { lidp = licp->lic_descs; for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } @@ -1191,6 +1217,68 @@ xfs_trans_free( kmem_zone_free(xfs_trans_zone, tp); } +/* + * Roll from one trans in the sequence of PERMANENT transactions to + * the next: permanent transactions are only flushed out when + * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * as possible to let chunks of it go to the log. So we commit the + * chunk we've been working on and get a new transaction to continue. + */ +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + struct xfs_trans *trans; + unsigned int logres, count; + int error; + + /* + * Ensure that the inode is always logged. + */ + trans = *tpp; + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + + /* + * Copy the critical parameters from one trans to the next. + */ + logres = trans->t_log_res; + count = trans->t_log_count; + *tpp = xfs_trans_dup(trans); + + /* + * Commit the current transaction. + * If this commit failed, then it'd just unlock those items that + * are not marked ihold. That also means that a filesystem shutdown + * is in progress. The caller takes the responsibility to cancel + * the duplicate transaction that gets returned. + */ + error = xfs_trans_commit(trans, 0); + if (error) + return (error); + + trans = *tpp; + + /* + * Reserve space in the log for th next transaction. + * This also pushes items in the "AIL", the list of logged items, + * out to disk if they are taking up space at the tail of the log + * that we want to use. This requires that either nothing be locked + * across this call, or that anything that is locked be logged in + * the prior and the next transactions. + */ + error = xfs_trans_reserve(trans, 0, logres, 0, + XFS_TRANS_PERM_LOG_RES, count); + /* + * Ensure that the inode is in the new transaction and locked. + */ + if (error) + return error; + + xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(trans, dp); + return 0; +} /* * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). @@ -1228,7 +1316,7 @@ xfs_trans_committed( * Special case the chunk embedded in the transaction. */ licp = &(tp->t_items); - if (!(XFS_LIC_ARE_ALL_FREE(licp))) { + if (!(xfs_lic_are_all_free(licp))) { xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); } @@ -1237,10 +1325,10 @@ xfs_trans_committed( */ licp = licp->lic_next; while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); next_licp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); licp = next_licp; } @@ -1297,11 +1385,10 @@ xfs_trans_chunk_committed( xfs_lsn_t item_lsn; struct xfs_mount *mp; int i; - SPLDECL(s); lidp = licp->lic_descs; for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } @@ -1338,7 +1425,7 @@ xfs_trans_chunk_committed( * the test below. */ mp = lip->li_mountp; - AIL_LOCK(mp,s); + spin_lock(&mp->m_ail_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { /* * This will set the item's lsn to item_lsn @@ -1347,9 +1434,9 @@ xfs_trans_chunk_committed( * * xfs_trans_update_ail() drops the AIL lock. */ - xfs_trans_update_ail(mp, lip, item_lsn, s); + xfs_trans_update_ail(mp, lip, item_lsn); } else { - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9dc88b3..74c80bd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -39,13 +39,9 @@ typedef struct xfs_trans_header { /* * Log item types. */ -#define XFS_LI_5_3_BUF 0x1234 /* v1 bufs, 1-block inode buffers */ -#define XFS_LI_5_3_INODE 0x1235 /* 1-block inode buffers */ #define XFS_LI_EFI 0x1236 #define XFS_LI_EFD 0x1237 #define XFS_LI_IUNLINK 0x1238 -#define XFS_LI_6_1_INODE 0x1239 /* 4K non-aligned inode bufs */ -#define XFS_LI_6_1_BUF 0x123a /* v1, 4K inode buffers */ #define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ #define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ #define XFS_LI_DQUOT 0x123d @@ -98,7 +94,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFSRT_ZERO 38 #define XFS_TRANS_GROWFSRT_FREE 39 #define XFS_TRANS_SWAPEXT 40 -#define XFS_TRANS_TYPE_MAX 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_TYPE_MAX 41 /* new transaction types need to be reflected in xfs_logprint(8) */ @@ -116,13 +113,8 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot_acct; -typedef struct xfs_ail_entry { - struct xfs_log_item *ail_forw; /* AIL forw pointer */ - struct xfs_log_item *ail_back; /* AIL back pointer */ -} xfs_ail_entry_t; - typedef struct xfs_log_item { - xfs_ail_entry_t li_ail; /* AIL pointers */ + struct list_head li_ail; /* AIL pointers */ xfs_lsn_t li_lsn; /* last on-disk lsn */ struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ struct xfs_mount *li_mountp; /* ptr to fs mount */ @@ -149,7 +141,6 @@ typedef struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_push)(xfs_log_item_t *); - void (*iop_abort)(xfs_log_item_t *); void (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); } xfs_item_ops_t; @@ -163,7 +154,6 @@ typedef struct xfs_item_ops { #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) #define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) #define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) -#define IOP_ABORT(ip) (*(ip)->li_ops->iop_abort)(ip) #define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) #define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) @@ -220,62 +210,52 @@ typedef struct xfs_log_item_chunk { * lic_unused to the right value (0 matches all free). The * lic_descs.lid_index values are set up as each desc is allocated. */ -#define XFS_LIC_INIT(cp) xfs_lic_init(cp) static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) { cp->lic_free = XFS_LIC_FREEMASK; } -#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot) static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) { cp->lic_descs[slot].lid_index = (unsigned char)(slot); } -#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp) static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) { return cp->lic_free & XFS_LIC_FREEMASK; } -#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp) static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) { cp->lic_free = XFS_LIC_FREEMASK; } -#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp) static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) { return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); } -#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot) static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) { return (cp->lic_free & (1 << slot)); } -#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot) static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) { cp->lic_free &= ~(1 << slot); } -#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot) static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) { cp->lic_free |= 1 << slot; } -#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot) static inline xfs_log_item_desc_t * xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) { return &(cp->lic_descs[slot]); } -#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp) static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) { return (uint)dp->lid_index; @@ -288,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) * All of this yields the address of the chunk, which is * cast to a chunk pointer. */ -#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp) static inline xfs_log_item_chunk_t * xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) { @@ -346,7 +325,6 @@ typedef struct xfs_trans { unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ xfs_log_ticket_t t_ticket; /* log mgr ticket */ - sema_t t_sema; /* sema for commit completion */ xfs_lsn_t t_lsn; /* log seq num of start of * transaction. */ xfs_lsn_t t_commit_lsn; /* log seq num of end of @@ -356,25 +334,25 @@ typedef struct xfs_trans { xfs_trans_callback_t t_callback; /* transaction callback */ void *t_callarg; /* callback arg */ unsigned int t_flags; /* misc flags */ - long t_icount_delta; /* superblock icount change */ - long t_ifree_delta; /* superblock ifree change */ - long t_fdblocks_delta; /* superblock fdblocks chg */ - long t_res_fdblocks_delta; /* on-disk only chg */ - long t_frextents_delta;/* superblock freextents chg*/ - long t_res_frextents_delta; /* on-disk only chg */ + int64_t t_icount_delta; /* superblock icount change */ + int64_t t_ifree_delta; /* superblock ifree change */ + int64_t t_fdblocks_delta; /* superblock fdblocks chg */ + int64_t t_res_fdblocks_delta; /* on-disk only chg */ + int64_t t_frextents_delta;/* superblock freextents chg*/ + int64_t t_res_frextents_delta; /* on-disk only chg */ #ifdef DEBUG - long t_ag_freeblks_delta; /* debugging counter */ - long t_ag_flist_delta; /* debugging counter */ - long t_ag_btree_delta; /* debugging counter */ + int64_t t_ag_freeblks_delta; /* debugging counter */ + int64_t t_ag_flist_delta; /* debugging counter */ + int64_t t_ag_btree_delta; /* debugging counter */ #endif - long t_dblocks_delta;/* superblock dblocks change */ - long t_agcount_delta;/* superblock agcount change */ - long t_imaxpct_delta;/* superblock imaxpct change */ - long t_rextsize_delta;/* superblock rextsize chg */ - long t_rbmblocks_delta;/* superblock rbmblocks chg */ - long t_rblocks_delta;/* superblock rblocks change */ - long t_rextents_delta;/* superblocks rextents chg */ - long t_rextslog_delta;/* superblocks rextslog chg */ + int64_t t_dblocks_delta;/* superblock dblocks change */ + int64_t t_agcount_delta;/* superblock agcount change */ + int64_t t_imaxpct_delta;/* superblock imaxpct change */ + int64_t t_rextsize_delta;/* superblock rextsize chg */ + int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ + int64_t t_rblocks_delta;/* superblock rblocks change */ + int64_t t_rextents_delta;/* superblocks rextents chg */ + int64_t t_rextslog_delta;/* superblocks rextslog chg */ unsigned int t_items_free; /* log item descs free */ xfs_log_item_chunk_t t_items; /* first log item desc chunk */ xfs_trans_header_t t_header; /* header for in-log trans */ @@ -938,9 +916,9 @@ typedef struct xfs_trans { #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) #ifdef DEBUG -#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d) -#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (long)d) -#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (long)d) +#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) +#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) +#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) #else #define xfs_trans_agblocks_delta(tp, d) #define xfs_trans_agflist_delta(tp, d) @@ -956,7 +934,7 @@ xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); -void xfs_trans_mod_sb(xfs_trans_t *, uint, long); +void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, int, uint); int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, @@ -994,13 +972,13 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, xfs_extlen_t); int _xfs_trans_commit(xfs_trans_t *, uint flags, - xfs_lsn_t *, int *); -#define xfs_trans_commit(tp, flags, lsn) \ - _xfs_trans_commit(tp, flags, lsn, NULL) +#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) void xfs_trans_cancel(xfs_trans_t *, int); -void xfs_trans_ail_init(struct xfs_mount *); -xfs_lsn_t xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); +int xfs_trans_ail_init(struct xfs_mount *); +void xfs_trans_ail_destroy(struct xfs_mount *); +void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *); void xfs_trans_unlocked_item(struct xfs_mount *, xfs_log_item_t *); @@ -1008,6 +986,8 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx); +extern kmem_zone_t *xfs_trans_zone; + #endif /* __KERNEL__ */ #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 558c87f..1f77c00 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -22,20 +22,21 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *); -STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *); +STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); +STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); #ifdef DEBUG -STATIC void xfs_ail_check(xfs_ail_entry_t *); +STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); #else -#define xfs_ail_check(a) +#define xfs_ail_check(a,l) #endif /* DEBUG */ @@ -54,16 +55,15 @@ xfs_trans_tail_ail( { xfs_lsn_t lsn; xfs_log_item_t *lip; - SPLDECL(s); - AIL_LOCK(mp,s); - lip = xfs_ail_min(&(mp->m_ail)); + spin_lock(&mp->m_ail_lock); + lip = xfs_ail_min(&mp->m_ail); if (lip == NULL) { lsn = (xfs_lsn_t)0; } else { lsn = lip->li_lsn; } - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); return lsn; } @@ -71,120 +71,187 @@ xfs_trans_tail_ail( /* * xfs_trans_push_ail * - * This routine is called to move the tail of the AIL - * forward. It does this by trying to flush items in the AIL - * whose lsns are below the given threshold_lsn. + * This routine is called to move the tail of the AIL forward. It does this by + * trying to flush items in the AIL whose lsns are below the given + * threshold_lsn. * - * The routine returns the lsn of the tail of the log. + * the push is run asynchronously in a separate thread, so we return the tail + * of the log right now instead of the tail after the push. This means we will + * either continue right away, or we will sleep waiting on the async thread to + * do it's work. + * + * We do this unlocked - we only need to know whether there is anything in the + * AIL at the time we are called. We don't need to access the contents of + * any of the objects, so the lock is not needed. */ -xfs_lsn_t +void xfs_trans_push_ail( xfs_mount_t *mp, xfs_lsn_t threshold_lsn) { - xfs_lsn_t lsn; xfs_log_item_t *lip; - int gen; - int restarts; - int lock_result; - int flush_log; - SPLDECL(s); + + lip = xfs_ail_min(&mp->m_ail); + if (lip && !XFS_FORCED_SHUTDOWN(mp)) { + if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) + xfsaild_wakeup(mp, threshold_lsn); + } +} + +/* + * Return the item in the AIL with the current lsn. + * Return the current tree generation number for use + * in calls to xfs_trans_next_ail(). + */ +STATIC xfs_log_item_t * +xfs_trans_first_push_ail( + xfs_mount_t *mp, + int *gen, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + lip = xfs_ail_min(&mp->m_ail); + *gen = (int)mp->m_ail.xa_gen; + if (lsn == 0) + return lip; + + list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) + return lip; + } + + return NULL; +} + +/* + * Function that does the work of pushing on the AIL + */ +long +xfsaild_push( + xfs_mount_t *mp, + xfs_lsn_t *last_lsn) +{ + long tout = 1000; /* milliseconds */ + xfs_lsn_t last_pushed_lsn = *last_lsn; + xfs_lsn_t target = mp->m_ail.xa_target; + xfs_lsn_t lsn; + xfs_log_item_t *lip; + int gen; + int restarts; + int flush_log, count, stuck; #define XFS_TRANS_PUSH_AIL_RESTARTS 10 - AIL_LOCK(mp,s); - lip = xfs_trans_first_ail(mp, &gen); - if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) { + spin_lock(&mp->m_ail_lock); + lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn); + if (!lip || XFS_FORCED_SHUTDOWN(mp)) { /* - * Just return if the AIL is empty. + * AIL is empty or our push has reached the end. */ - AIL_UNLOCK(mp, s); - return (xfs_lsn_t)0; + spin_unlock(&mp->m_ail_lock); + last_pushed_lsn = 0; + goto out; } XFS_STATS_INC(xs_push_ail); /* * While the item we are looking at is below the given threshold - * try to flush it out. Make sure to limit the number of times - * we allow xfs_trans_next_ail() to restart scanning from the - * beginning of the list. We'd like not to stop until we've at least + * try to flush it out. We'd like not to stop until we've at least * tried to push on everything in the AIL with an LSN less than - * the given threshold. However, we may give up before that if - * we realize that we've been holding the AIL_LOCK for 'too long', - * blocking interrupts. Currently, too long is < 500us roughly. + * the given threshold. + * + * However, we will stop after a certain number of pushes and wait + * for a reduced timeout to fire before pushing further. This + * prevents use from spinning when we can't do anything or there is + * lots of contention on the AIL lists. */ - flush_log = 0; - restarts = 0; - while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) && - (XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) { + tout = 10; + lsn = lip->li_lsn; + flush_log = stuck = count = restarts = 0; + while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { + int lock_result; /* - * If we can lock the item without sleeping, unlock - * the AIL lock and flush the item. Then re-grab the - * AIL lock so we can look for the next item on the - * AIL. Since we unlock the AIL while we flush the - * item, the next routine may start over again at the - * the beginning of the list if anything has changed. - * That is what the generation count is for. + * If we can lock the item without sleeping, unlock the AIL + * lock and flush the item. Then re-grab the AIL lock so we + * can look for the next item on the AIL. List changes are + * handled by the AIL lookup functions internally * - * If we can't lock the item, either its holder will flush - * it or it is already being flushed or it is being relogged. - * In any of these case it is being taken care of and we - * can just skip to the next item in the list. + * If we can't lock the item, either its holder will flush it + * or it is already being flushed or it is being relogged. In + * any of these case it is being taken care of and we can just + * skip to the next item in the list. */ lock_result = IOP_TRYLOCK(lip); + spin_unlock(&mp->m_ail_lock); switch (lock_result) { - case XFS_ITEM_SUCCESS: - AIL_UNLOCK(mp, s); + case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); IOP_PUSH(lip); - AIL_LOCK(mp,s); + last_pushed_lsn = lsn; break; - case XFS_ITEM_PUSHBUF: - AIL_UNLOCK(mp, s); + case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); -#ifdef XFSRACEDEBUG - delay_for_intr(); - delay(300); -#endif - ASSERT(lip->li_ops->iop_pushbuf); - ASSERT(lip); IOP_PUSHBUF(lip); - AIL_LOCK(mp,s); + last_pushed_lsn = lsn; break; - case XFS_ITEM_PINNED: + case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); + stuck++; flush_log = 1; break; - case XFS_ITEM_LOCKED: + case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); + last_pushed_lsn = lsn; + stuck++; break; - case XFS_ITEM_FLUSHING: + case XFS_ITEM_FLUSHING: XFS_STATS_INC(xs_push_ail_flushing); + last_pushed_lsn = lsn; + stuck++; break; - default: + default: ASSERT(0); break; } - lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); - if (lip == NULL) { + spin_lock(&mp->m_ail_lock); + /* should we bother continuing? */ + if (XFS_FORCED_SHUTDOWN(mp)) + break; + ASSERT(mp->m_log); + + count++; + + /* + * Are there too many items we can't do anything with? + * If we we are skipping too many items because we can't flush + * them or they are already being flushed, we back off and + * given them time to complete whatever operation is being + * done. i.e. remove pressure from the AIL while we can't make + * progress so traversals don't slow down further inserts and + * removals to/from the AIL. + * + * The value of 100 is an arbitrary magic number based on + * observation. + */ + if (stuck > 100) break; - } - if (XFS_FORCED_SHUTDOWN(mp)) { - /* - * Just return if we shut down during the last try. - */ - AIL_UNLOCK(mp, s); - return (xfs_lsn_t)0; - } + lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); + if (lip == NULL) + break; + if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS) + break; + lsn = lip->li_lsn; } + spin_unlock(&mp->m_ail_lock); if (flush_log) { /* @@ -192,22 +259,38 @@ xfs_trans_push_ail( * push out the log so it will become unpinned and * move forward in the AIL. */ - AIL_UNLOCK(mp, s); XFS_STATS_INC(xs_push_ail_flush); xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - AIL_LOCK(mp, s); } - lip = xfs_ail_min(&(mp->m_ail)); - if (lip == NULL) { - lsn = (xfs_lsn_t)0; - } else { - lsn = lip->li_lsn; + if (!count) { + /* We're past our target or empty, so idle */ + tout = 1000; + } else if (XFS_LSN_CMP(lsn, target) >= 0) { + /* + * We reached the target so wait a bit longer for I/O to + * complete and remove pushed items from the AIL before we + * start the next scan from the start of the AIL. + */ + tout += 20; + last_pushed_lsn = 0; + } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) || + ((stuck * 100) / count > 90)) { + /* + * Either there is a lot of contention on the AIL or we + * are stuck due to operations in progress. "Stuck" in this + * case is defined as >90% of the items we tried to push + * were stuck. + * + * Backoff a bit more to allow some I/O to complete before + * continuing from where we were. + */ + tout += 10; } - - AIL_UNLOCK(mp, s); - return lsn; -} /* xfs_trans_push_ail */ +out: + *last_lsn = last_pushed_lsn; + return tout; +} /* xfsaild_push */ /* @@ -268,25 +351,21 @@ xfs_trans_unlocked_item( * has changed. * * This function must be called with the AIL lock held. The lock - * is dropped before returning, so the caller must pass in the - * cookie returned by AIL_LOCK. + * is dropped before returning. */ void xfs_trans_update_ail( xfs_mount_t *mp, xfs_log_item_t *lip, - xfs_lsn_t lsn, - unsigned long s) + xfs_lsn_t lsn) __releases(mp->m_ail_lock) { - xfs_ail_entry_t *ailp; xfs_log_item_t *dlip=NULL; xfs_log_item_t *mlip; /* ptr to minimum lip */ - ailp = &(mp->m_ail); - mlip = xfs_ail_min(ailp); + mlip = xfs_ail_min(&mp->m_ail); if (lip->li_flags & XFS_LI_IN_AIL) { - dlip = xfs_ail_delete(ailp, lip); + dlip = xfs_ail_delete(&mp->m_ail, lip); ASSERT(dlip == lip); } else { lip->li_flags |= XFS_LI_IN_AIL; @@ -294,15 +373,15 @@ xfs_trans_update_ail( lip->li_lsn = lsn; - xfs_ail_insert(ailp, lip); - mp->m_ail_gen++; + xfs_ail_insert(&mp->m_ail, lip); + mp->m_ail.xa_gen++; if (mlip == dlip) { - mlip = xfs_ail_min(&(mp->m_ail)); - AIL_UNLOCK(mp, s); + mlip = xfs_ail_min(&mp->m_ail); + spin_unlock(&mp->m_ail_lock); xfs_log_move_tail(mp, mlip->li_lsn); } else { - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); } @@ -321,36 +400,32 @@ xfs_trans_update_ail( * has changed. * * This function must be called with the AIL lock held. The lock - * is dropped before returning, so the caller must pass in the - * cookie returned by AIL_LOCK. + * is dropped before returning. */ void xfs_trans_delete_ail( xfs_mount_t *mp, - xfs_log_item_t *lip, - unsigned long s) + xfs_log_item_t *lip) __releases(mp->m_ail_lock) { - xfs_ail_entry_t *ailp; xfs_log_item_t *dlip; xfs_log_item_t *mlip; if (lip->li_flags & XFS_LI_IN_AIL) { - ailp = &(mp->m_ail); - mlip = xfs_ail_min(ailp); - dlip = xfs_ail_delete(ailp, lip); + mlip = xfs_ail_min(&mp->m_ail); + dlip = xfs_ail_delete(&mp->m_ail, lip); ASSERT(dlip == lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; - mp->m_ail_gen++; + mp->m_ail.xa_gen++; if (mlip == dlip) { - mlip = xfs_ail_min(&(mp->m_ail)); - AIL_UNLOCK(mp, s); + mlip = xfs_ail_min(&mp->m_ail); + spin_unlock(&mp->m_ail_lock); xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); } else { - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); } } else { @@ -359,12 +434,12 @@ xfs_trans_delete_ail( * serious trouble if we get to this stage. */ if (XFS_FORCED_SHUTDOWN(mp)) - AIL_UNLOCK(mp, s); + spin_unlock(&mp->m_ail_lock); else { xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, "%s: attempting to delete a log item that is not in the AIL", - __FUNCTION__); - AIL_UNLOCK(mp, s); + __func__); + spin_unlock(&mp->m_ail_lock); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } } @@ -384,10 +459,10 @@ xfs_trans_first_ail( { xfs_log_item_t *lip; - lip = xfs_ail_min(&(mp->m_ail)); - *gen = (int)mp->m_ail_gen; + lip = xfs_ail_min(&mp->m_ail); + *gen = (int)mp->m_ail.xa_gen; - return (lip); + return lip; } /* @@ -407,11 +482,11 @@ xfs_trans_next_ail( xfs_log_item_t *nlip; ASSERT(mp && lip && gen); - if (mp->m_ail_gen == *gen) { - nlip = xfs_ail_next(&(mp->m_ail), lip); + if (mp->m_ail.xa_gen == *gen) { + nlip = xfs_ail_next(&mp->m_ail, lip); } else { - nlip = xfs_ail_min(&(mp->m_ail)); - *gen = (int)mp->m_ail_gen; + nlip = xfs_ail_min(&mp->m_ail); + *gen = (int)mp->m_ail.xa_gen; if (restarts != NULL) { XFS_STATS_INC(xs_push_ail_restarts); (*restarts)++; @@ -436,12 +511,19 @@ xfs_trans_next_ail( /* * Initialize the doubly linked list to point only to itself. */ -void +int xfs_trans_ail_init( xfs_mount_t *mp) { - mp->m_ail.ail_forw = (xfs_log_item_t*)&(mp->m_ail); - mp->m_ail.ail_back = (xfs_log_item_t*)&(mp->m_ail); + INIT_LIST_HEAD(&mp->m_ail.xa_ail); + return xfsaild_start(mp); +} + +void +xfs_trans_ail_destroy( + xfs_mount_t *mp) +{ + xfsaild_stop(mp); } /* @@ -452,7 +534,7 @@ xfs_trans_ail_init( */ STATIC void xfs_ail_insert( - xfs_ail_entry_t *base, + xfs_ail_t *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { @@ -461,27 +543,22 @@ xfs_ail_insert( /* * If the list is empty, just insert the item. */ - if (base->ail_back == (xfs_log_item_t*)base) { - base->ail_forw = lip; - base->ail_back = lip; - lip->li_ail.ail_forw = (xfs_log_item_t*)base; - lip->li_ail.ail_back = (xfs_log_item_t*)base; + if (list_empty(&ailp->xa_ail)) { + list_add(&lip->li_ail, &ailp->xa_ail); return; } - next_lip = base->ail_back; - while ((next_lip != (xfs_log_item_t*)base) && - (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) { - next_lip = next_lip->li_ail.ail_back; + list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) + break; } - ASSERT((next_lip == (xfs_log_item_t*)base) || + + ASSERT((&next_lip->li_ail == &ailp->xa_ail) || (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); - lip->li_ail.ail_forw = next_lip->li_ail.ail_forw; - lip->li_ail.ail_back = next_lip; - next_lip->li_ail.ail_forw = lip; - lip->li_ail.ail_forw->li_ail.ail_back = lip; - xfs_ail_check(base); + list_add(&lip->li_ail, &next_lip->li_ail); + + xfs_ail_check(ailp, lip); return; } @@ -491,16 +568,14 @@ xfs_ail_insert( /*ARGSUSED*/ STATIC xfs_log_item_t * xfs_ail_delete( - xfs_ail_entry_t *base, + xfs_ail_t *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { - lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back; - lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw; - lip->li_ail.ail_forw = NULL; - lip->li_ail.ail_back = NULL; + xfs_ail_check(ailp, lip); + + list_del(&lip->li_ail); - xfs_ail_check(base); return lip; } @@ -510,14 +585,13 @@ xfs_ail_delete( */ STATIC xfs_log_item_t * xfs_ail_min( - xfs_ail_entry_t *base) + xfs_ail_t *ailp) /* ARGSUSED */ { - register xfs_log_item_t *forw = base->ail_forw; - if (forw == (xfs_log_item_t*)base) { + if (list_empty(&ailp->xa_ail)) return NULL; - } - return forw; + + return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); } /* @@ -527,15 +601,14 @@ xfs_ail_min( */ STATIC xfs_log_item_t * xfs_ail_next( - xfs_ail_entry_t *base, + xfs_ail_t *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { - if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) { + if (lip->li_ail.next == &ailp->xa_ail) return NULL; - } - return lip->li_ail.ail_forw; + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); } #ifdef DEBUG @@ -544,38 +617,40 @@ xfs_ail_next( */ STATIC void xfs_ail_check( - xfs_ail_entry_t *base) + xfs_ail_t *ailp, + xfs_log_item_t *lip) { - xfs_log_item_t *lip; xfs_log_item_t *prev_lip; - lip = base->ail_forw; - if (lip == (xfs_log_item_t*)base) { - /* - * Make sure the pointers are correct when the list - * is empty. - */ - ASSERT(base->ail_back == (xfs_log_item_t*)base); + if (list_empty(&ailp->xa_ail)) return; - } /* - * Walk the list checking forward and backward pointers, - * lsn ordering, and that every entry has the XFS_LI_IN_AIL - * flag set. + * Check the next and previous entries are valid. */ - prev_lip = (xfs_log_item_t*)base; - while (lip != (xfs_log_item_t*)base) { - if (prev_lip != (xfs_log_item_t*)base) { - ASSERT(prev_lip->li_ail.ail_forw == lip); + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + + +#ifdef XFS_TRANS_DEBUG + /* + * Walk the list checking lsn ordering, and that every entry has the + * XFS_LI_IN_AIL flag set. This is really expensive, so only do it + * when specifically debugging the transaction subsystem. + */ + prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (&prev_lip->li_ail != &ailp->xa_ail) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - } - ASSERT(lip->li_ail.ail_back == prev_lip); ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); prev_lip = lip; - lip = lip->li_ail.ail_forw; } - ASSERT(lip == (xfs_log_item_t*)base); - ASSERT(base->ail_back == prev_lip); +#endif /* XFS_TRANS_DEBUG */ } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 60b6b89..4e855b5 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -304,7 +304,8 @@ xfs_trans_read_buf( if (tp == NULL) { bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); if (!bp) - return XFS_ERROR(ENOMEM); + return (flags & XFS_BUF_TRYLOCK) ? + EAGAIN : XFS_ERROR(ENOMEM); if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { xfs_ioerror_alert("xfs_trans_read_buf", mp, @@ -353,17 +354,15 @@ xfs_trans_read_buf( ASSERT(!XFS_BUF_ISASYNC(bp)); XFS_BUF_READ(bp); xfsbdstrat(tp->t_mountp, bp); - xfs_iowait(bp); - if (XFS_BUF_GETERROR(bp) != 0) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); /* - * We can gracefully recover from most - * read errors. Ones we can't are those - * that happen after the transaction's - * already dirty. + * We can gracefully recover from most read + * errors. Ones we can't are those that happen + * after the transaction's already dirty. */ if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, @@ -1022,16 +1021,16 @@ xfs_trans_buf_item_match( bp = NULL; len = BBTOB(len); licp = &tp->t_items; - if (!XFS_LIC_ARE_ALL_FREE(licp)) { + if (!xfs_lic_are_all_free(licp)) { for (i = 0; i < licp->lic_unused; i++) { /* * Skip unoccupied slots. */ - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); blip = (xfs_buf_log_item_t *)lidp->lid_item; if (blip->bli_item.li_type != XFS_LI_BUF) { continue; @@ -1075,7 +1074,7 @@ xfs_trans_buf_item_match_all( bp = NULL; len = BBTOB(len); for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { ASSERT(licp == &tp->t_items); ASSERT(licp->lic_next == NULL); return NULL; @@ -1084,11 +1083,11 @@ xfs_trans_buf_item_match_all( /* * Skip unoccupied slots. */ - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); blip = (xfs_buf_log_item_t *)lidp->lid_item; if (blip->bli_item.li_type != XFS_LI_BUF) { continue; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index b290270..27cce2a 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -22,6 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index b8db1d5..2a1c0f0 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -111,13 +111,13 @@ xfs_trans_iget( */ ASSERT(ip->i_itemp != NULL); ASSERT(lock_flags & XFS_ILOCK_EXCL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - ismrlocked(&ip->i_iolock, MR_UPDATE)); + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS))); + xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); @@ -185,7 +185,7 @@ xfs_trans_ijoin( xfs_inode_log_item_t *iip; ASSERT(ip->i_transp == NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(lock_flags & XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); @@ -232,7 +232,7 @@ xfs_trans_ihold( { ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ip->i_itemp->ili_flags |= XFS_ILI_HOLD; } @@ -257,7 +257,7 @@ xfs_trans_log_inode( ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp)); ASSERT(lidp != NULL); @@ -291,7 +291,7 @@ xfs_trans_inode_broot_debug( iip = ip->i_itemp; if (iip->ili_root_size != 0) { ASSERT(iip->ili_orig_root != NULL); - kmem_free(iip->ili_orig_root, iip->ili_root_size); + kmem_free(iip->ili_orig_root); iip->ili_root_size = 0; iip->ili_orig_root = NULL; } diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index 2912aac..3c666e8 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -21,6 +21,7 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, int, int, xfs_lsn_t); @@ -52,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) * Initialize the chunk, and then * claim the first slot in the newly allocated chunk. */ - XFS_LIC_INIT(licp); - XFS_LIC_CLAIM(licp, 0); + xfs_lic_init(licp); + xfs_lic_claim(licp, 0); licp->lic_unused = 1; - XFS_LIC_INIT_SLOT(licp, 0); - lidp = XFS_LIC_SLOT(licp, 0); + xfs_lic_init_slot(licp, 0); + lidp = xfs_lic_slot(licp, 0); /* * Link in the new chunk and update the free count. @@ -87,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) */ licp = &tp->t_items; while (licp != NULL) { - if (XFS_LIC_VACANCY(licp)) { + if (xfs_lic_vacancy(licp)) { if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { i = licp->lic_unused; - ASSERT(XFS_LIC_ISFREE(licp, i)); + ASSERT(xfs_lic_isfree(licp, i)); break; } for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { - if (XFS_LIC_ISFREE(licp, i)) + if (xfs_lic_isfree(licp, i)) break; } ASSERT(i <= XFS_LIC_MAX_SLOT); @@ -107,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) * If we find a free descriptor, claim it, * initialize it, and return it. */ - XFS_LIC_CLAIM(licp, i); + xfs_lic_claim(licp, i); if (licp->lic_unused <= i) { licp->lic_unused = i + 1; - XFS_LIC_INIT_SLOT(licp, i); + xfs_lic_init_slot(licp, i); } - lidp = XFS_LIC_SLOT(licp, i); + lidp = xfs_lic_slot(licp, i); tp->t_items_free--; lidp->lid_item = lip; lidp->lid_flags = 0; @@ -135,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) xfs_log_item_chunk_t *licp; xfs_log_item_chunk_t **licpp; - slot = XFS_LIC_DESC_TO_SLOT(lidp); - licp = XFS_LIC_DESC_TO_CHUNK(lidp); - XFS_LIC_RELSE(licp, slot); + slot = xfs_lic_desc_to_slot(lidp); + licp = xfs_lic_desc_to_chunk(lidp); + xfs_lic_relse(licp, slot); lidp->lid_item->li_desc = NULL; tp->t_items_free++; @@ -153,14 +154,14 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) * Also decrement the transaction structure's count of free items * by the number in a chunk since we are freeing an empty chunk. */ - if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) { + if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) { licpp = &(tp->t_items.lic_next); while (*licpp != licp) { ASSERT(*licpp != NULL); licpp = &((*licpp)->lic_next); } *licpp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); tp->t_items_free -= XFS_LIC_NUM_SLOTS; } } @@ -206,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp) /* * If it's not in the first chunk, skip to the second. */ - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { licp = licp->lic_next; } /* * Return the first non-free descriptor in the chunk. */ - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); for (i = 0; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - return XFS_LIC_SLOT(licp, i); + return xfs_lic_slot(licp, i); } cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); return NULL; @@ -241,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) xfs_log_item_chunk_t *licp; int i; - licp = XFS_LIC_DESC_TO_CHUNK(lidp); + licp = xfs_lic_desc_to_chunk(lidp); /* * First search the rest of the chunk. The for loop keeps us * from referencing things beyond the end of the chunk. */ - for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { + for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) { + if (xfs_lic_isfree(licp, i)) { continue; } - return XFS_LIC_SLOT(licp, i); + return xfs_lic_slot(licp, i); } /* @@ -265,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) } licp = licp->lic_next; - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); for (i = 0; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } - return XFS_LIC_SLOT(licp, i); + return xfs_lic_slot(licp, i); } ASSERT(0); /* NOTREACHED */ @@ -299,9 +300,9 @@ xfs_trans_free_items( /* * Special case the embedded chunk so we don't free it below. */ - if (!XFS_LIC_ARE_ALL_FREE(licp)) { + if (!xfs_lic_are_all_free(licp)) { (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); - XFS_LIC_ALL_FREE(licp); + xfs_lic_all_free(licp); licp->lic_unused = 0; } licp = licp->lic_next; @@ -310,10 +311,10 @@ xfs_trans_free_items( * Unlock each item in each chunk and free the chunks. */ while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); next_licp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); licp = next_licp; } @@ -346,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) /* * Special case the embedded chunk so we don't free. */ - if (!XFS_LIC_ARE_ALL_FREE(licp)) { + if (!xfs_lic_are_all_free(licp)) { freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); } licpp = &(tp->t_items.lic_next); @@ -357,12 +358,12 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) * and free empty chunks. */ while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + ASSERT(!xfs_lic_are_all_free(licp)); freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); next_licp = licp->lic_next; - if (XFS_LIC_ARE_ALL_FREE(licp)) { + if (xfs_lic_are_all_free(licp)) { *licpp = next_licp; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + kmem_free(licp); freed -= XFS_LIC_NUM_SLOTS; } else { licpp = &(licp->lic_next); @@ -401,7 +402,7 @@ xfs_trans_unlock_chunk( freed = 0; lidp = licp->lic_descs; for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { + if (xfs_lic_isfree(licp, i)) { continue; } lip = lidp->lid_item; @@ -420,7 +421,7 @@ xfs_trans_unlock_chunk( */ if (!(freeing_chunk) && (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { - XFS_LIC_RELSE(licp, i); + xfs_lic_relse(licp, i); freed++; } } @@ -529,7 +530,7 @@ xfs_trans_free_busy(xfs_trans_t *tp) lbcp = tp->t_busy.lbc_next; while (lbcp != NULL) { lbcq = lbcp->lbc_next; - kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t)); + kmem_free(lbcp); lbcp = lbcq; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 13edab8..3c748c4 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -46,14 +46,23 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, /* * From xfs_trans_ail.c */ -void xfs_trans_update_ail(struct xfs_mount *, - struct xfs_log_item *, xfs_lsn_t, - unsigned long); -void xfs_trans_delete_ail(struct xfs_mount *, - struct xfs_log_item *, unsigned long); +void xfs_trans_update_ail(struct xfs_mount *mp, + struct xfs_log_item *lip, xfs_lsn_t lsn) + __releases(mp->m_ail_lock); +void xfs_trans_delete_ail(struct xfs_mount *mp, + struct xfs_log_item *lip) + __releases(mp->m_ail_lock); struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *); struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *, struct xfs_log_item *, int *, int *); +/* + * AIL push thread support + */ +long xfsaild_push(struct xfs_mount *, xfs_lsn_t *); +void xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t); +int xfsaild_start(struct xfs_mount *); +void xfsaild_stop(struct xfs_mount *); + #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 104f64a..0f51916 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -151,18 +151,6 @@ typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */ */ #define MAXNAMELEN 256 -typedef struct xfs_dirent { /* data from readdir() */ - xfs_ino_t d_ino; /* inode number of entry */ - xfs_off_t d_off; /* offset of disk directory entry */ - unsigned short d_reclen; /* length of this record */ - char d_name[1]; /* name of file */ -} xfs_dirent_t; - -#define DIRENTBASESIZE (((xfs_dirent_t *)0)->d_name - (char *)0) -#define DIRENTSIZE(namelen) \ - ((DIRENTBASESIZE + (namelen) + \ - sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1)) - typedef enum { XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi } xfs_lookup_t; @@ -172,4 +160,9 @@ typedef enum { XFS_BTNUM_MAX } xfs_btnum_t; +struct xfs_name { + const char *name; + int len; +}; + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 9014d7e..35d4d41 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -40,76 +40,6 @@ #include "xfs_itable.h" #include "xfs_utils.h" -/* - * xfs_get_dir_entry is used to get a reference to an inode given - * its parent directory inode and the name of the file. It does - * not lock the child inode, and it unlocks the directory before - * returning. The directory's generation number is returned for - * use by a later call to xfs_lock_dir_and_entry. - */ -int -xfs_get_dir_entry( - bhv_vname_t *dentry, - xfs_inode_t **ipp) -{ - bhv_vnode_t *vp; - - vp = VNAME_TO_VNODE(dentry); - - *ipp = xfs_vtoi(vp); - if (!*ipp) - return XFS_ERROR(ENOENT); - VN_HOLD(vp); - return 0; -} - -int -xfs_dir_lookup_int( - bhv_desc_t *dir_bdp, - uint lock_mode, - bhv_vname_t *dentry, - xfs_ino_t *inum, - xfs_inode_t **ipp) -{ - bhv_vnode_t *dir_vp; - xfs_inode_t *dp; - int error; - - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - dp = XFS_BHVTOI(dir_bdp); - - error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum); - if (!error) { - /* - * Unlock the directory. We do this because we can't - * hold the directory lock while doing the vn_get() - * in xfs_iget(). Doing so could cause us to hold - * a lock while waiting for the inode to finish - * being inactive while it's waiting for a log - * reservation in the inactive routine. - */ - xfs_iunlock(dp, lock_mode); - error = xfs_iget(dp->i_mount, NULL, *inum, 0, 0, ipp, 0); - xfs_ilock(dp, lock_mode); - - if (error) { - *ipp = NULL; - } else if ((*ipp)->i_d.di_mode == 0) { - /* - * The inode has been freed. Something is - * wrong so just get out of here. - */ - xfs_iunlock(dp, lock_mode); - xfs_iput_new(*ipp, 0); - *ipp = NULL; - xfs_ilock(dp, lock_mode); - error = XFS_ERROR(ENOENT); - } - } - return error; -} /* * Allocates a new inode from disk and return a pointer to the @@ -222,7 +152,7 @@ xfs_dir_ialloc( } ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0, NULL); + code = xfs_trans_commit(tp, 0); tp = ntp; if (committed != NULL) { *committed = 1; @@ -307,6 +237,7 @@ xfs_droplink( ASSERT (ip->i_d.di_nlink > 0); ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = 0; @@ -335,23 +266,22 @@ xfs_bump_ino_vers2( xfs_inode_t *ip) { xfs_mount_t *mp; - unsigned long s; - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); ip->i_d.di_version = XFS_DINODE_VERSION_2; ip->i_d.di_onlink = 0; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); mp = tp->t_mountp; - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { - s = XFS_SB_LOCK(mp); - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { - XFS_SB_VERSION_ADDNLINK(&mp->m_sb); - XFS_SB_UNLOCK(mp, s); + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + xfs_sb_version_addnlink(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, XFS_SB_VERSIONNUM); } else { - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); } } /* Caller must log the inode */ @@ -371,6 +301,7 @@ xfs_bumplink( ASSERT(ip->i_d.di_nlink > 0); ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && (ip->i_d.di_nlink > XFS_MAXLINK_1)) { /* @@ -420,7 +351,11 @@ xfs_truncate_file( * in a transaction. */ xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0); + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, @@ -460,8 +395,7 @@ xfs_truncate_file( XFS_TRANS_ABORT); } else { xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, - NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); } xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index fe953e9..ef32122 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -18,22 +18,12 @@ #ifndef __XFS_UTILS_H__ #define __XFS_UTILS_H__ -#define IRELE(ip) VN_RELE(XFS_ITOV(ip)) -#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) -#define ITRACE(ip) vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \ - (inst_t *)__return_address) - -extern int xfs_rename (bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *, - bhv_vname_t *, cred_t *); -extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **); -extern int xfs_dir_lookup_int (bhv_desc_t *, uint, bhv_vname_t *, xfs_ino_t *, - xfs_inode_t **); -extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *); -extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, +extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); +extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, prid_t, int, xfs_inode_t **, int *); -extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *); -extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *); -extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *); +extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); +extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); +extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); #endif /* __XFS_UTILS_H__ */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index a34796e..439dd39 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -43,7 +43,6 @@ #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_rw.h" -#include "xfs_refcache.h" #include "xfs_buf_item.h" #include "xfs_log_priv.h" #include "xfs_dir2_trace.h" @@ -51,584 +50,20 @@ #include "xfs_acl.h" #include "xfs_attr.h" #include "xfs_clnt.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" #include "xfs_fsops.h" +#include "xfs_vnodeops.h" +#include "xfs_vfsops.h" +#include "xfs_utils.h" -STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); -int -xfs_init(void) -{ - extern kmem_zone_t *xfs_bmap_free_item_zone; - extern kmem_zone_t *xfs_btree_cur_zone; - extern kmem_zone_t *xfs_trans_zone; - extern kmem_zone_t *xfs_buf_item_zone; - extern kmem_zone_t *xfs_dabuf_zone; -#ifdef XFS_DABUF_DEBUG - extern lock_t xfs_dabuf_global_lock; - spinlock_init(&xfs_dabuf_global_lock, "xfsda"); -#endif - - /* - * Initialize all of the zone allocators we use. - */ - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); - xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), - "xfs_btree_cur"); - xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); - xfs_da_state_zone = - kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state"); - xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); - xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); - xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); - - /* - * The size of the zone allocated buf log item is the maximum - * size possible under XFS. This wastes a little bit of memory, - * but it is much faster. - */ - xfs_buf_item_zone = - kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / - NBWORD) * sizeof(int))), - "xfs_buf_item"); - xfs_efd_zone = - kmem_zone_init((sizeof(xfs_efd_log_item_t) + - ((XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), - "xfs_efd_item"); - xfs_efi_zone = - kmem_zone_init((sizeof(xfs_efi_log_item_t) + - ((XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), - "xfs_efi_item"); - - /* - * These zones warrant special memory allocator hints - */ - xfs_inode_zone = - kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, NULL); - xfs_ili_zone = - kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", - KM_ZONE_SPREAD, NULL); - xfs_chashlist_zone = - kmem_zone_init_flags(sizeof(xfs_chashlist_t), "xfs_chashlist", - KM_ZONE_SPREAD, NULL); - - /* - * Allocate global trace buffers. - */ -#ifdef XFS_ALLOC_TRACE - xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_BMAP_TRACE - xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_BMBT_TRACE - xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_ATTR_TRACE - xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_DIR2_TRACE - xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP); -#endif - - xfs_dir_startup(); - -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) - xfs_error_test_init(); -#endif /* DEBUG || INDUCE_IO_ERROR */ - - xfs_init_procfs(); - xfs_sysctl_register(); - return 0; -} - -void -xfs_cleanup(void) -{ - extern kmem_zone_t *xfs_bmap_free_item_zone; - extern kmem_zone_t *xfs_btree_cur_zone; - extern kmem_zone_t *xfs_inode_zone; - extern kmem_zone_t *xfs_trans_zone; - extern kmem_zone_t *xfs_da_state_zone; - extern kmem_zone_t *xfs_dabuf_zone; - extern kmem_zone_t *xfs_efd_zone; - extern kmem_zone_t *xfs_efi_zone; - extern kmem_zone_t *xfs_buf_item_zone; - extern kmem_zone_t *xfs_chashlist_zone; - - xfs_cleanup_procfs(); - xfs_sysctl_unregister(); - xfs_refcache_destroy(); - xfs_acl_zone_destroy(xfs_acl_zone); - -#ifdef XFS_DIR2_TRACE - ktrace_free(xfs_dir2_trace_buf); -#endif -#ifdef XFS_ATTR_TRACE - ktrace_free(xfs_attr_trace_buf); -#endif -#ifdef XFS_BMBT_TRACE - ktrace_free(xfs_bmbt_trace_buf); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(xfs_bmap_trace_buf); -#endif -#ifdef XFS_ALLOC_TRACE - ktrace_free(xfs_alloc_trace_buf); -#endif - - kmem_zone_destroy(xfs_bmap_free_item_zone); - kmem_zone_destroy(xfs_btree_cur_zone); - kmem_zone_destroy(xfs_inode_zone); - kmem_zone_destroy(xfs_trans_zone); - kmem_zone_destroy(xfs_da_state_zone); - kmem_zone_destroy(xfs_dabuf_zone); - kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_efd_zone); - kmem_zone_destroy(xfs_efi_zone); - kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_chashlist_zone); -} - -/* - * xfs_start_flags - * - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock has _not_ yet been read in. - */ -STATIC int -xfs_start_flags( - struct bhv_vfs *vfs, - struct xfs_mount_args *ap, - struct xfs_mount *mp) -{ - /* Values are in BBs */ - if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { - /* - * At this point the superblock has not been read - * in, therefore we do not know the block size. - * Before the mount call ends we will convert - * these to FSBs. - */ - mp->m_dalign = ap->sunit; - mp->m_swidth = ap->swidth; - } - - if (ap->logbufs != -1 && - ap->logbufs != 0 && - (ap->logbufs < XLOG_MIN_ICLOGS || - ap->logbufs > XLOG_MAX_ICLOGS)) { - cmn_err(CE_WARN, - "XFS: invalid logbufs value: %d [not %d-%d]", - ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return XFS_ERROR(EINVAL); - } - mp->m_logbufs = ap->logbufs; - if (ap->logbufsize != -1 && - ap->logbufsize != 0 && - ap->logbufsize != 16 * 1024 && - ap->logbufsize != 32 * 1024 && - ap->logbufsize != 64 * 1024 && - ap->logbufsize != 128 * 1024 && - ap->logbufsize != 256 * 1024) { - cmn_err(CE_WARN, - "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", - ap->logbufsize); - return XFS_ERROR(EINVAL); - } - mp->m_ihsize = ap->ihashsize; - mp->m_logbsize = ap->logbufsize; - mp->m_fsname_len = strlen(ap->fsname) + 1; - mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP); - strcpy(mp->m_fsname, ap->fsname); - if (ap->rtname[0]) { - mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP); - strcpy(mp->m_rtname, ap->rtname); - } - if (ap->logname[0]) { - mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP); - strcpy(mp->m_logname, ap->logname); - } - - if (ap->flags & XFSMNT_WSYNC) - mp->m_flags |= XFS_MOUNT_WSYNC; -#if XFS_BIG_INUMS - if (ap->flags & XFSMNT_INO64) { - mp->m_flags |= XFS_MOUNT_INO64; - mp->m_inoadd = XFS_INO64_OFFSET; - } -#endif - if (ap->flags & XFSMNT_RETERR) - mp->m_flags |= XFS_MOUNT_RETERR; - if (ap->flags & XFSMNT_NOALIGN) - mp->m_flags |= XFS_MOUNT_NOALIGN; - if (ap->flags & XFSMNT_SWALLOC) - mp->m_flags |= XFS_MOUNT_SWALLOC; - if (ap->flags & XFSMNT_OSYNCISOSYNC) - mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; - if (ap->flags & XFSMNT_32BITINODES) - mp->m_flags |= XFS_MOUNT_32BITINODES; - - if (ap->flags & XFSMNT_IOSIZE) { - if (ap->iosizelog > XFS_MAX_IO_LOG || - ap->iosizelog < XFS_MIN_IO_LOG) { - cmn_err(CE_WARN, - "XFS: invalid log iosize: %d [not %d-%d]", - ap->iosizelog, XFS_MIN_IO_LOG, - XFS_MAX_IO_LOG); - return XFS_ERROR(EINVAL); - } - - mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; - mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; - } - - if (ap->flags & XFSMNT_IHASHSIZE) - mp->m_flags |= XFS_MOUNT_IHASHSIZE; - if (ap->flags & XFSMNT_IDELETE) - mp->m_flags |= XFS_MOUNT_IDELETE; - if (ap->flags & XFSMNT_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (ap->flags & XFSMNT_ATTR2) - mp->m_flags |= XFS_MOUNT_ATTR2; - - if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - - /* - * no recovery flag requires a read-only mount - */ - if (ap->flags & XFSMNT_NORECOVERY) { - if (!(vfs->vfs_flag & VFS_RDONLY)) { - cmn_err(CE_WARN, - "XFS: tried to mount a FS read-write without recovery!"); - return XFS_ERROR(EINVAL); - } - mp->m_flags |= XFS_MOUNT_NORECOVERY; - } - - if (ap->flags & XFSMNT_NOUUID) - mp->m_flags |= XFS_MOUNT_NOUUID; - if (ap->flags & XFSMNT_BARRIER) - mp->m_flags |= XFS_MOUNT_BARRIER; - else - mp->m_flags &= ~XFS_MOUNT_BARRIER; - - return 0; -} - -/* - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock _has_ now been read in. - */ -STATIC int -xfs_finish_flags( - struct bhv_vfs *vfs, - struct xfs_mount_args *ap, - struct xfs_mount *mp) -{ - int ronly = (vfs->vfs_flag & VFS_RDONLY); - - /* Fail a mount where the logbuf is smaller then the log stripe */ - if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { - if ((ap->logbufsize <= 0) && - (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { - mp->m_logbsize = mp->m_sb.sb_logsunit; - } else if (ap->logbufsize > 0 && - ap->logbufsize < mp->m_sb.sb_logsunit) { - cmn_err(CE_WARN, - "XFS: logbuf size must be greater than or equal to log stripe size"); - return XFS_ERROR(EINVAL); - } - } else { - /* Fail a mount if the logbuf is larger than 32K */ - if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { - cmn_err(CE_WARN, - "XFS: logbuf size for version 1 logs must be 16K or 32K"); - return XFS_ERROR(EINVAL); - } - } - - if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { - mp->m_flags |= XFS_MOUNT_ATTR2; - } - - /* - * prohibit r/w mounts of read-only filesystems - */ - if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { - cmn_err(CE_WARN, - "XFS: cannot mount a read-only filesystem as read-write"); - return XFS_ERROR(EROFS); - } - - /* - * check for shared mount. - */ - if (ap->flags & XFSMNT_SHARED) { - if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb)) - return XFS_ERROR(EINVAL); - - /* - * For IRIX 6.5, shared mounts must have the shared - * version bit set, have the persistent readonly - * field set, must be version 0 and can only be mounted - * read-only. - */ - if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || - (mp->m_sb.sb_shared_vn != 0)) - return XFS_ERROR(EINVAL); - - mp->m_flags |= XFS_MOUNT_SHARED; - - /* - * Shared XFS V0 can't deal with DMI. Return EINVAL. - */ - if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) - return XFS_ERROR(EINVAL); - } - - return 0; -} - -/* - * xfs_mount - * - * The file system configurations are: - * (1) device (partition) with data and internal log - * (2) logical volume with data and log subvolumes. - * (3) logical volume with data, log, and realtime subvolumes. - * - * We only have to handle opening the log and realtime volumes here if - * they are present. The data subvolume has already been opened by - * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev. - */ -STATIC int -xfs_mount( - struct bhv_desc *bhvp, - struct xfs_mount_args *args, - cred_t *credp) -{ - struct bhv_vfs *vfsp = bhvtovfs(bhvp); - struct bhv_desc *p; - struct xfs_mount *mp = XFS_BHVTOM(bhvp); - struct block_device *ddev, *logdev, *rtdev; - int flags = 0, error; - - ddev = vfsp->vfs_super->s_bdev; - logdev = rtdev = NULL; - - /* - * Setup xfs_mount function vectors from available behaviors - */ - p = vfs_bhv_lookup(vfsp, VFS_POSITION_DM); - mp->m_dm_ops = p ? *(xfs_dmops_t *) vfs_bhv_custom(p) : xfs_dmcore_stub; - p = vfs_bhv_lookup(vfsp, VFS_POSITION_QM); - mp->m_qm_ops = p ? *(xfs_qmops_t *) vfs_bhv_custom(p) : xfs_qmcore_stub; - p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO); - mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs; - - if (args->flags & XFSMNT_QUIET) - flags |= XFS_MFSI_QUIET; - - /* - * Open real time and log devices - order is important. - */ - if (args->logname[0]) { - error = xfs_blkdev_get(mp, args->logname, &logdev); - if (error) - return error; - } - if (args->rtname[0]) { - error = xfs_blkdev_get(mp, args->rtname, &rtdev); - if (error) { - xfs_blkdev_put(logdev); - return error; - } - - if (rtdev == ddev || rtdev == logdev) { - cmn_err(CE_WARN, - "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - return EINVAL; - } - } - - /* - * Setup xfs_mount buffer target pointers - */ - error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); - if (!mp->m_ddev_targp) { - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - return error; - } - if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); - if (!mp->m_rtdev_targp) - goto error0; - } - mp->m_logdev_targp = (logdev && logdev != ddev) ? - xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp; - if (!mp->m_logdev_targp) - goto error0; - - /* - * Setup flags based on mount(2) options and then the superblock - */ - error = xfs_start_flags(vfsp, args, mp); - if (error) - goto error1; - error = xfs_readsb(mp, flags); - if (error) - goto error1; - error = xfs_finish_flags(vfsp, args, mp); - if (error) - goto error2; - - /* - * Setup xfs_mount buffer target pointers based on superblock - */ - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); - if (!error && logdev && logdev != ddev) { - unsigned int log_sector_size = BBSIZE; - - if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) - log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, - mp->m_sb.sb_blocksize, - log_sector_size); - } - if (!error && rtdev) - error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); - if (error) - goto error2; - - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); - - error = XFS_IOINIT(vfsp, args, flags); - if (error) - goto error2; - - return 0; - -error2: - if (mp->m_sb_bp) - xfs_freesb(mp); -error1: - xfs_binval(mp->m_ddev_targp); - if (logdev && logdev != ddev) - xfs_binval(mp->m_logdev_targp); - if (rtdev) - xfs_binval(mp->m_rtdev_targp); -error0: - xfs_unmountfs_close(mp, credp); - return error; -} - -STATIC int -xfs_unmount( - bhv_desc_t *bdp, - int flags, - cred_t *credp) -{ - bhv_vfs_t *vfsp = bhvtovfs(bdp); - xfs_mount_t *mp = XFS_BHVTOM(bdp); - xfs_inode_t *rip; - bhv_vnode_t *rvp; - int unmount_event_wanted = 0; - int unmount_event_flags = 0; - int xfs_unmountfs_needed = 0; - int error; - - rip = mp->m_rootip; - rvp = XFS_ITOV(rip); - - if (vfsp->vfs_flag & VFS_DMI) { - error = XFS_SEND_PREUNMOUNT(mp, vfsp, - rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL, - NULL, NULL, 0, 0, - (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? - 0:DM_FLAGS_UNWANTED); - if (error) - return XFS_ERROR(error); - unmount_event_wanted = 1; - unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))? - 0 : DM_FLAGS_UNWANTED; - } - - /* - * First blow any referenced inode from this file system - * out of the reference cache, and delete the timer. - */ - xfs_refcache_purge_mp(mp); - - XFS_bflush(mp->m_ddev_targp); - error = xfs_unmount_flush(mp, 0); - if (error) - goto out; - - ASSERT(vn_count(rvp) == 1); - - /* - * Drop the reference count - */ - VN_RELE(rvp); - - /* - * If we're forcing a shutdown, typically because of a media error, - * we want to make sure we invalidate dirty pages that belong to - * referenced vnodes as well. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = xfs_sync(&mp->m_bhv, - (SYNC_WAIT | SYNC_CLOSE), credp); - ASSERT(error != EFSCORRUPTED); - } - xfs_unmountfs_needed = 1; - -out: - /* Send DMAPI event, if required. - * Then do xfs_unmountfs() if needed. - * Then return error (or zero). - */ - if (unmount_event_wanted) { - /* Note: mp structure must still exist for - * XFS_SEND_UNMOUNT() call. - */ - XFS_SEND_UNMOUNT(mp, vfsp, error == 0 ? rvp : NULL, - DM_RIGHT_NULL, 0, error, unmount_event_flags); - } - if (xfs_unmountfs_needed) { - /* - * Call common unmount function to flush to disk - * and free the super block buffer & mount structures. - */ - xfs_unmountfs(mp, credp); - } - - return XFS_ERROR(error); -} - -STATIC int +STATIC void xfs_quiesce_fs( xfs_mount_t *mp) { int count = 0, pincount; - xfs_refcache_purge_mp(mp); xfs_flush_buftarg(mp->m_ddev_targp, 0); xfs_finish_reclaim_all(mp, 0); @@ -640,43 +75,43 @@ xfs_quiesce_fs( * we can write the unmount record. */ do { - xfs_syncsub(mp, SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT, 0, NULL); + xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL); pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); if (!pincount) { delay(50); count++; } } while (count < 2); - - return 0; } -STATIC int -xfs_mntupdate( - bhv_desc_t *bdp, - int *flags, - struct xfs_mount_args *args) +/* + * Second stage of a quiesce. The data is already synced, now we have to take + * care of the metadata. New transactions are already blocked, so we need to + * wait for any remaining transactions to drain out before proceding. + */ +void +xfs_attr_quiesce( + xfs_mount_t *mp) { - bhv_vfs_t *vfsp = bhvtovfs(bdp); - xfs_mount_t *mp = XFS_BHVTOM(bdp); - - if (!(*flags & MS_RDONLY)) { /* rw/ro -> rw */ - if (vfsp->vfs_flag & VFS_RDONLY) - vfsp->vfs_flag &= ~VFS_RDONLY; - if (args->flags & XFSMNT_BARRIER) { - mp->m_flags |= XFS_MOUNT_BARRIER; - xfs_mountfs_check_barriers(mp); - } else { - mp->m_flags &= ~XFS_MOUNT_BARRIER; - } - } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ - bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL); - xfs_quiesce_fs(mp); - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); - vfsp->vfs_flag |= VFS_RDONLY; - } - return 0; + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* flush inodes and push all remaining buffers out to disk */ + xfs_quiesce_fs(mp); + + ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp, 1); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); } /* @@ -693,10 +128,9 @@ xfs_unmount_flush( xfs_inode_t *rip = mp->m_rootip; xfs_inode_t *rbmip; xfs_inode_t *rsumip = NULL; - bhv_vnode_t *rvp = XFS_ITOV(rip); int error; - xfs_ilock(rip, XFS_ILOCK_EXCL); + xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); xfs_iflock(rip); /* @@ -711,7 +145,7 @@ xfs_unmount_flush( if (error == EFSCORRUPTED) goto fscorrupt_out; - ASSERT(vn_count(XFS_ITOV(rbmip)) == 1); + ASSERT(vn_count(VFS_I(rbmip)) == 1); rsumip = mp->m_rsumip; xfs_ilock(rsumip, XFS_ILOCK_EXCL); @@ -722,7 +156,7 @@ xfs_unmount_flush( if (error == EFSCORRUPTED) goto fscorrupt_out; - ASSERT(vn_count(XFS_ITOV(rsumip)) == 1); + ASSERT(vn_count(VFS_I(rsumip)) == 1); } /* @@ -732,7 +166,7 @@ xfs_unmount_flush( if (error == EFSCORRUPTED) goto fscorrupt_out2; - if (vn_count(rvp) != 1 && !relocation) { + if (vn_count(VFS_I(rip)) != 1 && !relocation) { xfs_iunlock(rip, XFS_ILOCK_EXCL); return XFS_ERROR(EBUSY); } @@ -746,8 +180,8 @@ xfs_unmount_flush( goto fscorrupt_out2; if (rbmip) { - VN_RELE(XFS_ITOV(rbmip)); - VN_RELE(XFS_ITOV(rsumip)); + IRELE(rbmip); + IRELE(rsumip); } xfs_iunlock(rip, XFS_ILOCK_EXCL); @@ -763,80 +197,6 @@ fscorrupt_out2: } /* - * xfs_root extracts the root vnode from a vfs. - * - * vfsp -- the vfs struct for the desired file system - * vpp -- address of the caller's vnode pointer which should be - * set to the desired fs root vnode - */ -STATIC int -xfs_root( - bhv_desc_t *bdp, - bhv_vnode_t **vpp) -{ - bhv_vnode_t *vp; - - vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip); - VN_HOLD(vp); - *vpp = vp; - return 0; -} - -/* - * xfs_statvfs - * - * Fill in the statvfs structure for the given file system. We use - * the superblock lock in the mount structure to ensure a consistent - * snapshot of the counters returned. - */ -STATIC int -xfs_statvfs( - bhv_desc_t *bdp, - bhv_statvfs_t *statp, - bhv_vnode_t *vp) -{ - __uint64_t fakeinos; - xfs_extlen_t lsize; - xfs_mount_t *mp; - xfs_sb_t *sbp; - unsigned long s; - - mp = XFS_BHVTOM(bdp); - sbp = &(mp->m_sb); - - statp->f_type = XFS_SB_MAGIC; - - xfs_icsb_sync_counters_lazy(mp); - s = XFS_SB_LOCK(mp); - statp->f_bsize = sbp->sb_blocksize; - lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; - statp->f_blocks = sbp->sb_dblocks - lsize; - statp->f_bfree = statp->f_bavail = - sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - fakeinos = statp->f_bfree << sbp->sb_inopblog; -#if XFS_BIG_INUMS - fakeinos += mp->m_inoadd; -#endif - statp->f_files = - MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); - if (mp->m_maxicount) -#if XFS_BIG_INUMS - if (!mp->m_inoadd) -#endif - statp->f_files = min_t(typeof(statp->f_files), - statp->f_files, - mp->m_maxicount); - statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); - XFS_SB_UNLOCK(mp, s); - - xfs_statvfs_fsid(statp, mp); - statp->f_namelen = MAXNAMELEN - 1; - - return 0; -} - - -/* * xfs_sync flushes any pending I/O to file system vfsp. * * This routine is called by vfs_sync() to make sure that things make it @@ -872,44 +232,58 @@ xfs_statvfs( * this by simply making sure the log gets flushed * if SYNC_BDFLUSH is set, and by actually writing it * out otherwise. + * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete + * before we return (including direct I/O). Forms the drain + * side of the write barrier needed to safely quiesce the + * filesystem. * */ -/*ARGSUSED*/ -STATIC int +int xfs_sync( - bhv_desc_t *bdp, - int flags, - cred_t *credp) + xfs_mount_t *mp, + int flags) { - xfs_mount_t *mp = XFS_BHVTOM(bdp); + int error; - if (unlikely(flags == SYNC_QUIESCE)) - return xfs_quiesce_fs(mp); - else - return xfs_syncsub(mp, flags, 0, NULL); + /* + * Get the Quota Manager to flush the dquots. + * + * If XFS quota support is not enabled or this filesystem + * instance does not use quotas XFS_QM_DQSYNC will always + * return zero. + */ + error = XFS_QM_DQSYNC(mp, flags); + if (error) { + /* + * If we got an IO error, we will be shutting down. + * So, there's nothing more for us to do here. + */ + ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp)); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(error); + } + + if (flags & SYNC_IOWAIT) + xfs_filestream_flush(mp); + + return xfs_syncsub(mp, flags, NULL); } /* * xfs sync routine for internal use * * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internally- - * required functions are specified within a separate xflags parameter, - * only available by calling this routine. + * interface as explained above under xfs_sync. * */ int xfs_sync_inodes( xfs_mount_t *mp, int flags, - int xflags, int *bypassed) { xfs_inode_t *ip = NULL; - xfs_inode_t *ip_next; - xfs_buf_t *bp; - bhv_vnode_t *vp = NULL; + struct inode *vp = NULL; int error; int last_error; uint64_t fflag; @@ -918,7 +292,6 @@ xfs_sync_inodes( boolean_t mount_locked; boolean_t vnode_refed; int preempt; - xfs_dinode_t *dip; xfs_iptr_t *ipointer; #ifdef DEBUG boolean_t ipointer_in = B_FALSE; @@ -971,9 +344,11 @@ xfs_sync_inodes( #define XFS_PREEMPT_MASK 0x7f + ASSERT(!(flags & SYNC_BDFLUSH)); + if (bypassed) *bypassed = 0; - if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; error = 0; last_error = 0; @@ -983,7 +358,7 @@ xfs_sync_inodes( ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP); fflag = XFS_B_ASYNC; /* default is don't wait */ - if (flags & (SYNC_BDFLUSH | SYNC_DELWRI)) + if (flags & SYNC_DELWRI) fflag = XFS_B_DELWRI; if (flags & SYNC_WAIT) fflag = 0; /* synchronous overrides all */ @@ -1028,7 +403,7 @@ xfs_sync_inodes( continue; } - vp = XFS_ITOV_NULL(ip); + vp = VFS_I(ip); /* * If the vnode is gone then this is being torn down, @@ -1068,29 +443,11 @@ xfs_sync_inodes( if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) { XFS_MOUNT_IUNLOCK(mp); - kmem_free(ipointer, sizeof(xfs_iptr_t)); + kmem_free(ipointer); return 0; } /* - * If this is just vfs_sync() or pflushd() calling - * then we can skip inodes for which it looks like - * there is nothing to do. Since we don't have the - * inode locked this is racy, but these are periodic - * calls so it doesn't matter. For the others we want - * to know for sure, so we at least try to lock them. - */ - if (flags & SYNC_BDFLUSH) { - if (((ip->i_itemp == NULL) || - !(ip->i_itemp->ili_format.ilf_fields & - XFS_ILOG_ALL)) && - (ip->i_update_core == 0)) { - ip = ip->i_mnext; - continue; - } - } - - /* * Try to lock without sleeping. We're out of order with * the inode list lock here, so if we fail we need to drop * the mount lock and try again. If we're called from @@ -1107,7 +464,7 @@ xfs_sync_inodes( * it. */ if (xfs_ilock_nowait(ip, lock_flags) == 0) { - if ((flags & SYNC_BDFLUSH) || (vp == NULL)) { + if (vp == NULL) { ip = ip->i_mnext; continue; } @@ -1121,7 +478,7 @@ xfs_sync_inodes( IPOINTER_INSERT(ip, mp); xfs_ilock(ip, lock_flags); - ASSERT(vp == XFS_ITOV(ip)); + ASSERT(vp == VFS_I(ip)); ASSERT(ip->i_mount == mp); vnode_refed = B_TRUE; @@ -1131,206 +488,64 @@ xfs_sync_inodes( * in the inode list. */ - if ((flags & SYNC_CLOSE) && (vp != NULL)) { - /* - * This is the shutdown case. We just need to - * flush and invalidate all the pages associated - * with the inode. Drop the inode lock since - * we can't hold it across calls to the buffer - * cache. - * - * We don't set the VREMAPPING bit in the vnode - * here, because we don't hold the vnode lock - * exclusively. It doesn't really matter, though, - * because we only come here when we're shutting - * down anyway. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (XFS_FORCED_SHUTDOWN(mp)) { - bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF); - } else { - bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF); + /* + * If we have to flush data or wait for I/O completion + * we need to drop the ilock that we currently hold. + * If we need to drop the lock, insert a marker if we + * have not already done so. + */ + if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) || + ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) { + if (mount_locked) { + IPOINTER_INSERT(ip, mp); } + xfs_iunlock(ip, XFS_ILOCK_SHARED); - xfs_ilock(ip, XFS_ILOCK_SHARED); - - } else if ((flags & SYNC_DELWRI) && (vp != NULL)) { - if (VN_DIRTY(vp)) { - /* We need to have dropped the lock here, - * so insert a marker if we have not already - * done so. - */ - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - /* - * Drop the inode lock since we can't hold it - * across calls to the buffer cache. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - error = bhv_vop_flush_pages(vp, (xfs_off_t)0, + if (flags & SYNC_CLOSE) { + /* Shutdown case. Flush and invalidate. */ + if (XFS_FORCED_SHUTDOWN(mp)) + xfs_tosspages(ip, 0, -1, + FI_REMAPF); + else + error = xfs_flushinval_pages(ip, + 0, -1, FI_REMAPF); + } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) { + error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE); - xfs_ilock(ip, XFS_ILOCK_SHARED); } - } - - if (flags & SYNC_BDFLUSH) { - if ((flags & SYNC_ATTR) && - ((ip->i_update_core) || - ((ip->i_itemp != NULL) && - (ip->i_itemp->ili_format.ilf_fields != 0)))) { - - /* Insert marker and drop lock if not already - * done. - */ - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - /* - * We don't want the periodic flushing of the - * inodes by vfs_sync() to interfere with - * I/O to the file, especially read I/O - * where it is only the access time stamp - * that is being flushed out. To prevent - * long periods where we have both inode - * locks held shared here while reading the - * inode's buffer in from disk, we drop the - * inode lock while reading in the inode - * buffer. We have to release the buffer - * and reacquire the inode lock so that they - * are acquired in the proper order (inode - * locks first). The buffer will go at the - * end of the lru chain, though, so we can - * expect it to still be there when we go - * for it again in xfs_iflush(). - */ - if ((xfs_ipincount(ip) == 0) && - xfs_iflock_nowait(ip)) { - - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - error = xfs_itobp(mp, NULL, ip, - &dip, &bp, 0, 0); - if (!error) { - xfs_buf_relse(bp); - } else { - /* Bailing out, remove the - * marker and free it. - */ - XFS_MOUNT_ILOCK(mp); - IPOINTER_REMOVE(ip, mp); - XFS_MOUNT_IUNLOCK(mp); - - ASSERT(!(lock_flags & - XFS_IOLOCK_SHARED)); - - kmem_free(ipointer, - sizeof(xfs_iptr_t)); - return (0); - } - - /* - * Since we dropped the inode lock, - * the inode may have been reclaimed. - * Therefore, we reacquire the mount - * lock and check to see if we were the - * inode reclaimed. If this happened - * then the ipointer marker will no - * longer point back at us. In this - * case, move ip along to the inode - * after the marker, remove the marker - * and continue. - */ - XFS_MOUNT_ILOCK(mp); - mount_locked = B_TRUE; - - if (ip != ipointer->ip_mprev) { - IPOINTER_REMOVE(ip, mp); - - ASSERT(!vnode_refed); - ASSERT(!(lock_flags & - XFS_IOLOCK_SHARED)); - continue; - } + /* + * When freezing, we need to wait ensure all I/O (including direct + * I/O) is complete to ensure no further data modification can take + * place after this point + */ + if (flags & SYNC_IOWAIT) + vn_iowait(ip); - ASSERT(ip->i_mount == mp); - - if (xfs_ilock_nowait(ip, - XFS_ILOCK_SHARED) == 0) { - ASSERT(ip->i_mount == mp); - /* - * We failed to reacquire - * the inode lock without - * sleeping, so just skip - * the inode for now. We - * clear the ILOCK bit from - * the lock_flags so that we - * won't try to drop a lock - * we don't hold below. - */ - lock_flags &= ~XFS_ILOCK_SHARED; - IPOINTER_REMOVE(ip_next, mp); - } else if ((xfs_ipincount(ip) == 0) && - xfs_iflock_nowait(ip)) { - ASSERT(ip->i_mount == mp); - /* - * Since this is vfs_sync() - * calling we only flush the - * inode out if we can lock - * it without sleeping and - * it is not pinned. Drop - * the mount lock here so - * that we don't hold it for - * too long. We already have - * a marker in the list here. - */ - XFS_MOUNT_IUNLOCK(mp); - mount_locked = B_FALSE; - error = xfs_iflush(ip, - XFS_IFLUSH_DELWRI); - } else { - ASSERT(ip->i_mount == mp); - IPOINTER_REMOVE(ip_next, mp); - } - } + xfs_ilock(ip, XFS_ILOCK_SHARED); + } - } + if ((flags & SYNC_ATTR) && + (ip->i_update_core || + (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) { + if (mount_locked) + IPOINTER_INSERT(ip, mp); - } else { - if ((flags & SYNC_ATTR) && - ((ip->i_update_core) || - ((ip->i_itemp != NULL) && - (ip->i_itemp->ili_format.ilf_fields != 0)))) { - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } + if (flags & SYNC_WAIT) { + xfs_iflock(ip); + error = xfs_iflush(ip, XFS_IFLUSH_SYNC); - if (flags & SYNC_WAIT) { - xfs_iflock(ip); - error = xfs_iflush(ip, - XFS_IFLUSH_SYNC); - } else { - /* - * If we can't acquire the flush - * lock, then the inode is already - * being flushed so don't bother - * waiting. If we can lock it then - * do a delwri flush so we can - * combine multiple inode flushes - * in each disk write. - */ - if (xfs_iflock_nowait(ip)) { - error = xfs_iflush(ip, - XFS_IFLUSH_DELWRI); - } - else if (bypassed) - (*bypassed)++; - } + /* + * If we can't acquire the flush lock, then the inode + * is already being flushed so don't bother waiting. + * + * If we can lock it then do a delwri flush so we can + * combine multiple inode flushes in each disk write. + */ + } else if (xfs_iflock_nowait(ip)) { + error = xfs_iflush(ip, XFS_IFLUSH_DELWRI); + } else if (bypassed) { + (*bypassed)++; } } @@ -1344,10 +559,10 @@ xfs_sync_inodes( * above, then wait until after we've unlocked * the inode to release the reference. This is * because we can be already holding the inode - * lock when VN_RELE() calls xfs_inactive(). + * lock when IRELE() calls xfs_inactive(). * * Make sure to drop the mount lock before calling - * VN_RELE() so that we don't trip over ourselves if + * IRELE() so that we don't trip over ourselves if * we have to go for the mount lock again in the * inactive code. */ @@ -1355,7 +570,7 @@ xfs_sync_inodes( IPOINTER_INSERT(ip, mp); } - VN_RELE(vp); + IRELE(ip); vnode_refed = B_FALSE; } @@ -1374,7 +589,7 @@ xfs_sync_inodes( } XFS_MOUNT_IUNLOCK(mp); ASSERT(ipointer_in == B_FALSE); - kmem_free(ipointer, sizeof(xfs_iptr_t)); + kmem_free(ipointer); return XFS_ERROR(error); } @@ -1404,7 +619,7 @@ xfs_sync_inodes( ASSERT(ipointer_in == B_FALSE); - kmem_free(ipointer, sizeof(xfs_iptr_t)); + kmem_free(ipointer); return XFS_ERROR(last_error); } @@ -1412,17 +627,13 @@ xfs_sync_inodes( * xfs sync routine for internal use * * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internally- - * required functions are specified within a separate xflags parameter, - * only available by calling this routine. + * interface as explained above under xfs_sync. * */ int xfs_syncsub( xfs_mount_t *mp, int flags, - int xflags, int *bypassed) { int error = 0; @@ -1444,7 +655,7 @@ xfs_syncsub( if (flags & SYNC_BDFLUSH) xfs_finish_reclaim_all(mp, 1); else - error = xfs_sync_inodes(mp, flags, xflags, bypassed); + error = xfs_sync_inodes(mp, flags, bypassed); } /* @@ -1502,21 +713,8 @@ xfs_syncsub( } /* - * If this is the periodic sync, then kick some entries out of - * the reference cache. This ensures that idle entries are - * eventually kicked out of the cache. - */ - if (flags & SYNC_REFCACHE) { - if (flags & SYNC_WAIT) - xfs_refcache_purge_mp(mp); - else - xfs_refcache_purge_some(mp); - } - - /* * Now check to see if the log needs a "dummy" transaction. */ - if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { xfs_trans_t *tp; xfs_inode_t *ip; @@ -1539,7 +737,7 @@ xfs_syncsub( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_log_force(mp, (xfs_lsn_t)0, log_flags); } @@ -1557,437 +755,3 @@ xfs_syncsub( return XFS_ERROR(last_error); } - -/* - * xfs_vget - called by DMAPI and NFSD to get vnode from file handle - */ -STATIC int -xfs_vget( - bhv_desc_t *bdp, - bhv_vnode_t **vpp, - fid_t *fidp) -{ - xfs_mount_t *mp = XFS_BHVTOM(bdp); - xfs_fid_t *xfid = (struct xfs_fid *)fidp; - xfs_inode_t *ip; - int error; - xfs_ino_t ino; - unsigned int igen; - - /* - * Invalid. Since handles can be created in user space and passed in - * via gethandle(), this is not cause for a panic. - */ - if (xfid->xfs_fid_len != sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) - return XFS_ERROR(EINVAL); - - ino = xfid->xfs_fid_ino; - igen = xfid->xfs_fid_gen; - - /* - * NFS can sometimes send requests for ino 0. Fail them gracefully. - */ - if (ino == 0) - return XFS_ERROR(ESTALE); - - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); - if (error) { - *vpp = NULL; - return error; - } - - if (ip == NULL) { - *vpp = NULL; - return XFS_ERROR(EIO); - } - - if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); - *vpp = NULL; - return XFS_ERROR(ENOENT); - } - - *vpp = XFS_ITOV(ip); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return 0; -} - - -#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ -#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ -#define MNTOPT_LOGDEV "logdev" /* log device */ -#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ -#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ -#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ -#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */ -#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ -#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ -#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ -#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ -#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ -#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ -#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ -#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ -#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ -#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ -#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ -#define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */ -#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ -#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and - * unwritten extent conversion */ -#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ -#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ -#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ -#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ -#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ -#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ -#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes - * in stat(). */ -#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ -#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ - -STATIC unsigned long -suffix_strtoul(char *s, char **endp, unsigned int base) -{ - int last, shift_left_factor = 0; - char *value = s; - - last = strlen(value) - 1; - if (value[last] == 'K' || value[last] == 'k') { - shift_left_factor = 10; - value[last] = '\0'; - } - if (value[last] == 'M' || value[last] == 'm') { - shift_left_factor = 20; - value[last] = '\0'; - } - if (value[last] == 'G' || value[last] == 'g') { - shift_left_factor = 30; - value[last] = '\0'; - } - - return simple_strtoul((const char *)s, endp, base) << shift_left_factor; -} - -STATIC int -xfs_parseargs( - struct bhv_desc *bhv, - char *options, - struct xfs_mount_args *args, - int update) -{ - bhv_vfs_t *vfsp = bhvtovfs(bhv); - char *this_char, *value, *eov; - int dsunit, dswidth, vol_dsunit, vol_dswidth; - int iosize; - - args->flags |= XFSMNT_IDELETE; - args->flags |= XFSMNT_BARRIER; - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; - - if (!options) - goto done; - - iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0; - - while ((this_char = strsep(&options, ",")) != NULL) { - if (!*this_char) - continue; - if ((value = strchr(this_char, '=')) != NULL) - *value++ = 0; - - if (!strcmp(this_char, MNTOPT_LOGBUFS)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - args->logbufs = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - args->logbufsize = suffix_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - strncpy(args->logname, value, MAXNAMELEN); - } else if (!strcmp(this_char, MNTOPT_MTPT)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - strncpy(args->mtpt, value, MAXNAMELEN); - } else if (!strcmp(this_char, MNTOPT_RTDEV)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - strncpy(args->rtname, value, MAXNAMELEN); - } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - iosize = simple_strtoul(value, &eov, 10); - args->flags |= XFSMNT_IOSIZE; - args->iosizelog = (uint8_t) iosize; - } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - iosize = suffix_strtoul(value, &eov, 10); - args->flags |= XFSMNT_IOSIZE; - args->iosizelog = ffs(iosize) - 1; - } else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - args->flags |= XFSMNT_IHASHSIZE; - args->ihashsize = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_GRPID) || - !strcmp(this_char, MNTOPT_BSDGROUPS)) { - vfsp->vfs_flag |= VFS_GRPID; - } else if (!strcmp(this_char, MNTOPT_NOGRPID) || - !strcmp(this_char, MNTOPT_SYSVGROUPS)) { - vfsp->vfs_flag &= ~VFS_GRPID; - } else if (!strcmp(this_char, MNTOPT_WSYNC)) { - args->flags |= XFSMNT_WSYNC; - } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { - args->flags |= XFSMNT_OSYNCISOSYNC; - } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { - args->flags |= XFSMNT_NORECOVERY; - } else if (!strcmp(this_char, MNTOPT_INO64)) { - args->flags |= XFSMNT_INO64; -#if !XFS_BIG_INUMS - cmn_err(CE_WARN, - "XFS: %s option not allowed on this system", - this_char); - return EINVAL; -#endif - } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { - args->flags |= XFSMNT_NOALIGN; - } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { - args->flags |= XFSMNT_SWALLOC; - } else if (!strcmp(this_char, MNTOPT_SUNIT)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - dsunit = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - dswidth = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { - args->flags &= ~XFSMNT_32BITINODES; -#if !XFS_BIG_INUMS - cmn_err(CE_WARN, - "XFS: %s option not allowed on this system", - this_char); - return EINVAL; -#endif - } else if (!strcmp(this_char, MNTOPT_NOUUID)) { - args->flags |= XFSMNT_NOUUID; - } else if (!strcmp(this_char, MNTOPT_BARRIER)) { - args->flags |= XFSMNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { - args->flags &= ~XFSMNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_IKEEP)) { - args->flags &= ~XFSMNT_IDELETE; - } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { - args->flags |= XFSMNT_IDELETE; - } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { - args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_ATTR2)) { - args->flags |= XFSMNT_ATTR2; - } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { - args->flags &= ~XFSMNT_ATTR2; - } else if (!strcmp(this_char, "osyncisdsync")) { - /* no-op, this is now the default */ - cmn_err(CE_WARN, - "XFS: osyncisdsync is now the default, option is deprecated."); - } else if (!strcmp(this_char, "irixsgid")) { - cmn_err(CE_WARN, - "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); - } else { - cmn_err(CE_WARN, - "XFS: unknown mount option [%s].", this_char); - return EINVAL; - } - } - - if (args->flags & XFSMNT_NORECOVERY) { - if ((vfsp->vfs_flag & VFS_RDONLY) == 0) { - cmn_err(CE_WARN, - "XFS: no-recovery mounts must be read-only."); - return EINVAL; - } - } - - if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { - cmn_err(CE_WARN, - "XFS: sunit and swidth options incompatible with the noalign option"); - return EINVAL; - } - - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { - cmn_err(CE_WARN, - "XFS: sunit and swidth must be specified together"); - return EINVAL; - } - - if (dsunit && (dswidth % dsunit != 0)) { - cmn_err(CE_WARN, - "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", - dswidth, dsunit); - return EINVAL; - } - - if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { - if (dsunit) { - args->sunit = dsunit; - args->flags |= XFSMNT_RETERR; - } else { - args->sunit = vol_dsunit; - } - dswidth ? (args->swidth = dswidth) : - (args->swidth = vol_dswidth); - } else { - args->sunit = args->swidth = 0; - } - -done: - if (args->flags & XFSMNT_32BITINODES) - vfsp->vfs_flag |= VFS_32BITINODES; - if (args->flags2) - args->flags |= XFSMNT_FLAGS2; - return 0; -} - -STATIC int -xfs_showargs( - struct bhv_desc *bhv, - struct seq_file *m) -{ - static struct proc_xfs_info { - int flag; - char *str; - } xfs_info[] = { - /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, - { XFS_MOUNT_INO64, "," MNTOPT_INO64 }, - { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, - { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, - { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, - { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, - { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, - { 0, NULL } - }; - struct proc_xfs_info *xfs_infop; - struct xfs_mount *mp = XFS_BHVTOM(bhv); - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); - - for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) { - if (mp->m_flags & xfs_infop->flag) - seq_puts(m, xfs_infop->str); - } - - if (mp->m_flags & XFS_MOUNT_IHASHSIZE) - seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize); - - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) - seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", - (int)(1 << mp->m_writeio_log) >> 10); - - if (mp->m_logbufs > 0) - seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); - if (mp->m_logbsize > 0) - seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); - - if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); - if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); - - if (mp->m_dalign > 0) - seq_printf(m, "," MNTOPT_SUNIT "=%d", - (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); - if (mp->m_swidth > 0) - seq_printf(m, "," MNTOPT_SWIDTH "=%d", - (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); - - if (!(mp->m_flags & XFS_MOUNT_IDELETE)) - seq_printf(m, "," MNTOPT_IKEEP); - if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) - seq_printf(m, "," MNTOPT_LARGEIO); - - if (!(vfsp->vfs_flag & VFS_32BITINODES)) - seq_printf(m, "," MNTOPT_64BITINODE); - if (vfsp->vfs_flag & VFS_GRPID) - seq_printf(m, "," MNTOPT_GRPID); - - return 0; -} - -STATIC void -xfs_freeze( - bhv_desc_t *bdp) -{ - xfs_mount_t *mp = XFS_BHVTOM(bdp); - - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* Push the superblock and write an unmount record */ - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); - xfs_fs_log_dummy(mp); -} - - -bhv_vfsops_t xfs_vfsops = { - BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS), - .vfs_parseargs = xfs_parseargs, - .vfs_showargs = xfs_showargs, - .vfs_mount = xfs_mount, - .vfs_unmount = xfs_unmount, - .vfs_mntupdate = xfs_mntupdate, - .vfs_root = xfs_root, - .vfs_statvfs = xfs_statvfs, - .vfs_sync = xfs_sync, - .vfs_vget = xfs_vget, - .vfs_dmapiops = (vfs_dmapiops_t)fs_nosys, - .vfs_quotactl = (vfs_quotactl_t)fs_nosys, - .vfs_init_vnode = xfs_initialize_vnode, - .vfs_force_shutdown = xfs_do_force_shutdown, - .vfs_freeze = xfs_freeze, -}; diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h new file mode 100644 index 0000000..a74b050 --- /dev/null +++ b/fs/xfs/xfs_vfsops.h @@ -0,0 +1,16 @@ +#ifndef _XFS_VFSOPS_H +#define _XFS_VFSOPS_H 1 + +struct cred; +struct xfs_fid; +struct inode; +struct kstatfs; +struct xfs_mount; +struct xfs_mount_args; + +int xfs_sync(struct xfs_mount *mp, int flags); +void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, + int lnnum); +void xfs_attr_quiesce(struct xfs_mount *mp); + +#endif /* _XFS_VFSOPS_H */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 23cfa58..8b6812f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -48,19 +48,16 @@ #include "xfs_quota.h" #include "xfs_utils.h" #include "xfs_rtalloc.h" -#include "xfs_refcache.h" #include "xfs_trans_space.h" #include "xfs_log_priv.h" -#include "xfs_mac.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" -STATIC int +int xfs_open( - bhv_desc_t *bdp, - cred_t *credp) + xfs_inode_t *ip) { int mode; - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - xfs_inode_t *ip = XFS_BHVTOI(bdp); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return XFS_ERROR(EIO); @@ -69,7 +66,7 @@ xfs_open( * If it's a directory with any blocks, read-ahead block 0 * as we're almost certain to have the next operation be a read there. */ - if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) { + if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) { mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); @@ -78,229 +75,35 @@ xfs_open( return 0; } -STATIC int -xfs_close( - bhv_desc_t *bdp, - int flags, - lastclose_t lastclose, - cred_t *credp) -{ - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - xfs_inode_t *ip = XFS_BHVTOI(bdp); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - if (lastclose != L_TRUE || !VN_ISREG(vp)) - return 0; - - /* - * If we previously truncated this file and removed old data in - * the process, we want to initiate "early" writeout on the last - * close. This is an attempt to combat the notorious NULL files - * problem which is particularly noticable from a truncate down, - * buffered (re-)write (delalloc), followed by a crash. What we - * are effectively doing here is significantly reducing the time - * window where we'd otherwise be exposed to that problem. - */ - if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0) - return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); - return 0; -} - -/* - * xfs_getattr - */ -STATIC int -xfs_getattr( - bhv_desc_t *bdp, - bhv_vattr_t *vap, - int flags, - cred_t *credp) -{ - xfs_inode_t *ip; - xfs_mount_t *mp; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - if (!(flags & ATTR_LAZY)) - xfs_ilock(ip, XFS_ILOCK_SHARED); - - vap->va_size = ip->i_d.di_size; - if (vap->va_mask == XFS_AT_SIZE) - goto all_done; - - vap->va_nblocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - vap->va_nodeid = ip->i_ino; -#if XFS_BIG_INUMS - vap->va_nodeid += mp->m_inoadd; -#endif - vap->va_nlink = ip->i_d.di_nlink; - - /* - * Quick exit for non-stat callers - */ - if ((vap->va_mask & - ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID| - XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0) - goto all_done; - - /* - * Copy from in-core inode. - */ - vap->va_mode = ip->i_d.di_mode; - vap->va_uid = ip->i_d.di_uid; - vap->va_gid = ip->i_d.di_gid; - vap->va_projid = ip->i_d.di_projid; - - /* - * Check vnode type block/char vs. everything else. - */ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - vap->va_rdev = ip->i_df.if_u2.if_rdev; - vap->va_blocksize = BLKDEV_IOSIZE; - break; - default: - vap->va_rdev = 0; - - if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { - vap->va_blocksize = xfs_preferred_iosize(mp); - } else { - - /* - * If the file blocks are being allocated from a - * realtime partition, then return the inode's - * realtime extent size or the realtime volume's - * extent size. - */ - vap->va_blocksize = ip->i_d.di_extsize ? - (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) : - (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog); - } - break; - } - - vn_atime_to_timespec(vp, &vap->va_atime); - vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - - /* - * Exit for stat callers. See if any of the rest of the fields - * to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - /* - * Convert di_flags to xflags. - */ - vap->va_xflags = xfs_ip2xflags(ip); - - /* - * Exit for inode revalidate. See if any of the rest of - * the fields to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog; - vap->va_nextents = - (ip->i_df.if_flags & XFS_IFEXTENTS) ? - ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_nextents; - if (ip->i_afp) - vap->va_anextents = - (ip->i_afp->if_flags & XFS_IFEXTENTS) ? - ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_anextents; - else - vap->va_anextents = 0; - vap->va_gen = ip->i_d.di_gen; - - all_done: - if (!(flags & ATTR_LAZY)) - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return 0; -} - - -/* - * xfs_setattr - */ int xfs_setattr( - bhv_desc_t *bdp, - bhv_vattr_t *vap, + struct xfs_inode *ip, + struct iattr *iattr, int flags, cred_t *credp) { - xfs_inode_t *ip; + xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; xfs_trans_t *tp; - xfs_mount_t *mp; - int mask; int code; uint lock_flags; uint commit_flags=0; uid_t uid=0, iuid=0; gid_t gid=0, igid=0; int timeflags = 0; - bhv_vnode_t *vp; - xfs_prid_t projid=0, iprojid=0; - int mandlock_before, mandlock_after; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int file_owner; int need_iolock = 1; - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + xfs_itrace_entry(ip); - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); - /* - * Cannot set certain attributes. - */ - mask = vap->va_mask; - if (mask & XFS_AT_NOSET) { - return XFS_ERROR(EINVAL); - } - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - /* - * Timestamps do not need to be logged and hence do not - * need to be done within a transaction. - */ - if (mask & XFS_AT_UPDTIMES) { - ASSERT((mask & ~XFS_AT_UPDTIMES) == 0); - timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) | - ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) | - ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0); - xfs_ichgtime(ip, timeflags); - return 0; - } - olddquot1 = olddquot2 = NULL; udqp = gdqp = NULL; @@ -312,28 +115,22 @@ xfs_setattr( * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ - if (XFS_IS_QUOTA_ON(mp) && - (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) { + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { uint qflags = 0; - if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = vap->va_uid; + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; qflags |= XFS_QMOPT_UQUOTA; } else { uid = ip->i_d.di_uid; } - if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = vap->va_gid; + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; qflags |= XFS_QMOPT_GQUOTA; } else { gid = ip->i_d.di_gid; } - if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) { - projid = vap->va_projid; - qflags |= XFS_QMOPT_PQUOTA; - } else { - projid = ip->i_d.di_projid; - } + /* * We take a reference when we initialize udqp and gdqp, * so it is important that we never blindly double trip on @@ -341,8 +138,8 @@ xfs_setattr( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags, - &udqp, &gdqp); + code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid, + qflags, &udqp, &gdqp); if (code) return code; } @@ -353,10 +150,10 @@ xfs_setattr( */ tp = NULL; lock_flags = XFS_ILOCK_EXCL; - if (flags & ATTR_NOLOCK) + if (flags & XFS_ATTR_NOLOCK) need_iolock = 0; - if (!(mask & XFS_AT_SIZE)) { - if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) || + if (!(mask & ATTR_SIZE)) { + if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || (mp->m_flags & XFS_MOUNT_WSYNC)) { tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); commit_flags = 0; @@ -368,11 +165,11 @@ xfs_setattr( } } } else { - if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) && - !(flags & ATTR_DMI)) { + if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && + !(flags & XFS_ATTR_DMI)) { int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; - code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp, - vap->va_size, 0, dmflags, NULL); + code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, + iattr->ia_size, 0, dmflags, NULL); if (code) { lock_flags = 0; goto error_return; @@ -385,16 +182,14 @@ xfs_setattr( xfs_ilock(ip, lock_flags); /* boolean: are we the file owner? */ - file_owner = (current_fsuid(credp) == ip->i_d.di_uid); + file_owner = (current_fsuid() == ip->i_d.di_uid); /* * Change various properties of a file. * Only the owner or users with CAP_FOWNER * capability may do these things. */ - if (mask & - (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID| - XFS_AT_GID|XFS_AT_PROJID)) { + if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) { /* * CAP_FOWNER overrides the following restrictions: * @@ -418,21 +213,21 @@ xfs_setattr( * IDs of the calling process shall match the group owner of * the file when setting the set-group-ID bit on that file */ - if (mask & XFS_AT_MODE) { + if (mask & ATTR_MODE) { mode_t m = 0; - if ((vap->va_mode & S_ISUID) && !file_owner) + if ((iattr->ia_mode & S_ISUID) && !file_owner) m |= S_ISUID; - if ((vap->va_mode & S_ISGID) && + if ((iattr->ia_mode & S_ISGID) && !in_group_p((gid_t)ip->i_d.di_gid)) m |= S_ISGID; #if 0 /* Linux allows this, Irix doesn't. */ - if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp)) + if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) m |= S_ISVTX; #endif if (m && !capable(CAP_FSETID)) - vap->va_mode &= ~m; + iattr->ia_mode &= ~m; } } @@ -443,7 +238,7 @@ xfs_setattr( * and can change the group id only to a group of which he * or she is a member. */ - if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { + if (mask & (ATTR_UID|ATTR_GID)) { /* * These IDs could have changed since we last looked at them. * But, we're assured that if the ownership did change @@ -451,12 +246,9 @@ xfs_setattr( * would have changed also. */ iuid = ip->i_d.di_uid; - iprojid = ip->i_d.di_projid; igid = ip->i_d.di_gid; - gid = (mask & XFS_AT_GID) ? vap->va_gid : igid; - uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid; - projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid : - iprojid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; /* * CAP_CHOWN overrides the following restrictions: @@ -476,11 +268,10 @@ xfs_setattr( goto error_return; } /* - * Do a quota reservation only if uid/projid/gid is actually + * Do a quota reservation only if uid/gid is actually * going to change. */ if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || - (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) || (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { ASSERT(tp); code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, @@ -494,22 +285,22 @@ xfs_setattr( /* * Truncate file. Must have write permission and not be a directory. */ - if (mask & XFS_AT_SIZE) { + if (mask & ATTR_SIZE) { /* Short circuit the truncate case for zero length files */ - if ((vap->va_size == 0) && - (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) { + if (iattr->ia_size == 0 && + ip->i_size == 0 && ip->i_d.di_nextents == 0) { xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; - if (mask & XFS_AT_CTIME) + if (mask & ATTR_CTIME) xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); code = 0; goto error_return; } - if (VN_ISDIR(vp)) { + if (S_ISDIR(ip->i_d.di_mode)) { code = XFS_ERROR(EISDIR); goto error_return; - } else if (!VN_ISREG(vp)) { + } else if (!S_ISREG(ip->i_d.di_mode)) { code = XFS_ERROR(EINVAL); goto error_return; } @@ -523,9 +314,9 @@ xfs_setattr( /* * Change file access or modified times. */ - if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { + if (mask & (ATTR_ATIME|ATTR_MTIME)) { if (!file_owner) { - if ((flags & ATTR_UTIME) && + if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) && !capable(CAP_FOWNER)) { code = XFS_ERROR(EPERM); goto error_return; @@ -534,95 +325,51 @@ xfs_setattr( } /* - * Change extent size or realtime flag. - */ - if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { - /* - * Can't change extent size if any extents are allocated. - */ - if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) && - ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != - vap->va_extsize) ) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ - goto error_return; - } - - /* - * Can't change realtime flag if any extents are allocated. - */ - if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - (mask & XFS_AT_XFLAGS) && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != - (vap->va_xflags & XFS_XFLAG_REALTIME)) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ - goto error_return; - } - /* - * Extent size must be a multiple of the appropriate block - * size, if set at all. - */ - if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) { - xfs_extlen_t size; - - if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) || - ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME))) { - size = mp->m_sb.sb_rextsize << - mp->m_sb.sb_blocklog; - } else { - size = mp->m_sb.sb_blocksize; - } - if (vap->va_extsize % size) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - } - /* - * If realtime flag is set then must have realtime data. - */ - if ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME)) { - if ((mp->m_sb.sb_rblocks == 0) || - (mp->m_sb.sb_rextsize == 0) || - (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - } - - /* - * Can't modify an immutable/append-only file unless - * we have appropriate permission. - */ - if ((mask & XFS_AT_XFLAGS) && - (ip->i_d.di_flags & - (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || - (vap->va_xflags & - (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && - !capable(CAP_LINUX_IMMUTABLE)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - } - - /* * Now we can make the changes. Before we join the inode - * to the transaction, if XFS_AT_SIZE is set then take care of + * to the transaction, if ATTR_SIZE is set then take care of * the part of the truncation that must be done without the * inode lock. This needs to be done before joining the inode * to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (mask & XFS_AT_SIZE) { + if (mask & ATTR_SIZE) { code = 0; - if ((vap->va_size > ip->i_d.di_size) && - (flags & ATTR_NOSIZETOK) == 0) { - code = xfs_igrow_start(ip, vap->va_size, credp); + if (iattr->ia_size > ip->i_size) { + /* + * Do the first part of growing a file: zero any data + * in the last block that is beyond the old EOF. We + * need to do this before the inode is joined to the + * transaction to modify the i_size. + */ + code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); } xfs_iunlock(ip, XFS_ILOCK_EXCL); - vn_iowait(vp); /* wait for the completion of any pending DIOs */ + + /* + * We are going to log the inode size change in this + * transaction so any previous writes that are beyond the on + * disk EOF and the new EOF that have not been written out need + * to be written here. If we do not write the data out, we + * expose ourselves to the null files problem. + * + * Only flush from the on disk size to the smaller of the in + * memory file size or the new size as that's the range we + * really care about here and prevents waiting for other data + * not within the range we care about here. + */ + if (!code && + ip->i_size != ip->i_d.di_size && + iattr->ia_size > ip->i_d.di_size) { + code = xfs_flush_pages(ip, + ip->i_d.di_size, iattr->ia_size, + XFS_B_ASYNC, FI_NONE); + } + + /* wait for all I/O to complete */ + vn_iowait(ip); + if (!code) - code = xfs_itruncate_data(ip, vap->va_size); + code = xfs_itruncate_data(ip, iattr->ia_size); if (code) { ASSERT(tp == NULL); lock_flags &= ~XFS_ILOCK_EXCL; @@ -648,25 +395,33 @@ xfs_setattr( xfs_trans_ihold(tp, ip); } - /* determine whether mandatory locking mode changes */ - mandlock_before = MANDLOCK(vp, ip->i_d.di_mode); - /* * Truncate file. Must have write permission and not be a directory. */ - if (mask & XFS_AT_SIZE) { - if (vap->va_size > ip->i_d.di_size) { - xfs_igrow_finish(tp, ip, vap->va_size, - !(flags & ATTR_DMI)); - } else if ((vap->va_size <= ip->i_d.di_size) || - ((vap->va_size == 0) && ip->i_d.di_nextents)) { + if (mask & ATTR_SIZE) { + /* + * Only change the c/mtime if we are changing the size + * or we are explicitly asked to change it. This handles + * the semantic difference between truncate() and ftruncate() + * as implemented in the VFS. + */ + if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) + timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; + if (!(flags & XFS_ATTR_DMI)) + xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { /* * signal a sync transaction unless * we're truncating an already unlinked * file on a wsync filesystem */ - code = xfs_itruncate_finish(&tp, ip, - (xfs_fsize_t)vap->va_size, + code = xfs_itruncate_finish(&tp, ip, iattr->ia_size, XFS_DATA_FORK, ((ip->i_d.di_nlink != 0 || !(mp->m_flags & XFS_MOUNT_WSYNC)) @@ -681,20 +436,19 @@ xfs_setattr( * vnode and flush it when the file is closed, and * do not wait the usual (long) time for writeout. */ - VTRUNCATE(vp); + xfs_iflags_set(ip, XFS_ITRUNCATED); } - /* - * Have to do this even if the file's size doesn't change. - */ - timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; } /* * Change file access modes. */ - if (mask & XFS_AT_MODE) { + if (mask & ATTR_MODE) { ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= vap->va_mode & ~S_IFMT; + ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= iattr->ia_mode & ~S_IFMT; xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); timeflags |= XFS_ICHGTIME_CHG; @@ -707,7 +461,7 @@ xfs_setattr( * and can change the group id only to a group of which he * or she is a member. */ - if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { + if (mask & (ATTR_UID|ATTR_GID)) { /* * CAP_FSETID overrides the following restrictions: * @@ -725,39 +479,24 @@ xfs_setattr( */ if (iuid != uid) { if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & XFS_AT_UID); + ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip, &ip->i_udquot, udqp); } ip->i_d.di_uid = uid; + inode->i_uid = uid; } if (igid != gid) { if (XFS_IS_GQUOTA_ON(mp)) { ASSERT(!XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & XFS_AT_GID); + ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, &ip->i_gdquot, gdqp); } ip->i_d.di_gid = gid; - } - if (iprojid != projid) { - if (XFS_IS_PQUOTA_ON(mp)) { - ASSERT(!XFS_IS_GQUOTA_ON(mp)); - ASSERT(mask & XFS_AT_PROJID); - ASSERT(gdqp); - olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, - &ip->i_gdquot, gdqp); - } - ip->i_d.di_projid = projid; - /* - * We may have to rev the inode as well as - * the superblock version number since projids didn't - * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. - */ - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) - xfs_bump_ino_vers2(tp, ip); + inode->i_gid = gid; } xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); @@ -768,84 +507,33 @@ xfs_setattr( /* * Change file access or modified times. */ - if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { - if (mask & XFS_AT_ATIME) { - ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec; - ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec; + if (mask & (ATTR_ATIME|ATTR_MTIME)) { + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; ip->i_update_core = 1; - timeflags &= ~XFS_ICHGTIME_ACC; } - if (mask & XFS_AT_MTIME) { - ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec; + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; timeflags &= ~XFS_ICHGTIME_MOD; timeflags |= XFS_ICHGTIME_CHG; } - if (tp && (flags & ATTR_UTIME)) + if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); } /* - * Change XFS-added attributes. - */ - if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { - if (mask & XFS_AT_EXTSIZE) { - /* - * Converting bytes to fs blocks. - */ - ip->i_d.di_extsize = vap->va_extsize >> - mp->m_sb.sb_blocklog; - } - if (mask & XFS_AT_XFLAGS) { - uint di_flags; - - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) - di_flags |= XFS_DIFLAG_IMMUTABLE; - if (vap->va_xflags & XFS_XFLAG_APPEND) - di_flags |= XFS_DIFLAG_APPEND; - if (vap->va_xflags & XFS_XFLAG_SYNC) - di_flags |= XFS_DIFLAG_SYNC; - if (vap->va_xflags & XFS_XFLAG_NOATIME) - di_flags |= XFS_DIFLAG_NOATIME; - if (vap->va_xflags & XFS_XFLAG_NODUMP) - di_flags |= XFS_DIFLAG_NODUMP; - if (vap->va_xflags & XFS_XFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - if (vap->va_xflags & XFS_XFLAG_NODEFRAG) - di_flags |= XFS_DIFLAG_NODEFRAG; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { - if (vap->va_xflags & XFS_XFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT) - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { - if (vap->va_xflags & XFS_XFLAG_REALTIME) { - di_flags |= XFS_DIFLAG_REALTIME; - ip->i_iocore.io_flags |= XFS_IOCORE_RT; - } else { - ip->i_iocore.io_flags &= ~XFS_IOCORE_RT; - } - if (vap->va_xflags & XFS_XFLAG_EXTSIZE) - di_flags |= XFS_DIFLAG_EXTSIZE; - } - ip->i_d.di_flags = di_flags; - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; - } - - /* - * Change file inode change time only if XFS_AT_CTIME set + * Change file inode change time only if ATTR_CTIME set * AND we have been called by a DMI function. */ - if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) { - ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec; + if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; timeflags &= ~XFS_ICHGTIME_CHG; } @@ -854,7 +542,7 @@ xfs_setattr( * Send out timestamp changes that need to be set to the * current time. Not done when called by a DMI function. */ - if (timeflags && !(flags & ATTR_DMI)) + if (timeflags && !(flags & XFS_ATTR_DMI)) xfs_ichgtime(ip, timeflags); XFS_STATS_INC(xs_ig_attrchg); @@ -874,18 +562,7 @@ xfs_setattr( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - code = xfs_trans_commit(tp, commit_flags, NULL); - } - - /* - * If the (regular) file's mandatory locking mode changed, then - * notify the vnode. We do this under the inode lock to prevent - * racing calls to vop_vnode_change. - */ - mandlock_after = MANDLOCK(vp, ip->i_d.di_mode); - if (mandlock_before != mandlock_after) { - bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING, - mandlock_after); + code = xfs_trans_commit(tp, commit_flags); } xfs_iunlock(ip, lock_flags); @@ -902,9 +579,9 @@ xfs_setattr( return code; } - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) && - !(flags & ATTR_DMI)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL, + if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && + !(flags & XFS_ATTR_DMI)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, AT_DELAY_FLAG(flags)); } @@ -925,31 +602,6 @@ xfs_setattr( return code; } - -/* - * xfs_access - * Null conversion from vnode mode bits to inode mode bits, as in efs. - */ -STATIC int -xfs_access( - bhv_desc_t *bdp, - int mode, - cred_t *credp) -{ - xfs_inode_t *ip; - int error; - - vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__, - (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_iaccess(ip, mode, credp); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - - /* * The maximum pathlen is 1024 bytes. Since the minimum file system * blocksize is 512 bytes, we can get a max of 2 extents back from @@ -957,227 +609,180 @@ xfs_access( */ #define SYMLINK_MAPS 2 -/* - * xfs_readlink - * - */ STATIC int -xfs_readlink( - bhv_desc_t *bdp, - uio_t *uiop, - int ioflags, - cred_t *credp) +xfs_readlink_bmap( + xfs_inode_t *ip, + char *link) { - xfs_inode_t *ip; - int count; - xfs_off_t offset; - int pathlen; - bhv_vnode_t *vp; - int error = 0; - xfs_mount_t *mp; - int nmaps; + xfs_mount_t *mp = ip->i_mount; + int pathlen = ip->i_d.di_size; + int nmaps = SYMLINK_MAPS; xfs_bmbt_irec_t mval[SYMLINK_MAPS]; xfs_daddr_t d; int byte_cnt; int n; xfs_buf_t *bp; + int error = 0; - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; + error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, + mval, &nmaps, NULL, NULL); + if (error) + goto out; - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - xfs_ilock(ip, XFS_ILOCK_SHARED); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + error = XFS_BUF_GETERROR(bp); + if (error) { + xfs_ioerror_alert("xfs_readlink", + ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_relse(bp); + goto out; + } + if (pathlen < byte_cnt) + byte_cnt = pathlen; + pathlen -= byte_cnt; - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); + memcpy(link, XFS_BUF_PTR(bp), byte_cnt); + xfs_buf_relse(bp); + } - offset = uiop->uio_offset; - count = uiop->uio_resid; + link[ip->i_d.di_size] = '\0'; + error = 0; - if (offset < 0) { - error = XFS_ERROR(EINVAL); - goto error_return; - } - if (count <= 0) { - error = 0; - goto error_return; - } + out: + return error; +} - /* - * See if the symlink is stored inline. - */ - pathlen = (int)ip->i_d.di_size; +int +xfs_readlink( + xfs_inode_t *ip, + char *link) +{ + xfs_mount_t *mp = ip->i_mount; + int pathlen; + int error = 0; - if (ip->i_df.if_flags & XFS_IFINLINE) { - error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop); - } - else { - /* - * Symlink not inline. Call bmap to get it in. - */ - nmaps = SYMLINK_MAPS; + xfs_itrace_entry(ip); - error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), - 0, NULL, 0, mval, &nmaps, NULL, NULL); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); - if (error) { - goto error_return; - } + xfs_ilock(ip, XFS_ILOCK_SHARED); - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - error = XFS_BUF_GETERROR(bp); - if (error) { - xfs_ioerror_alert("xfs_readlink", - ip->i_mount, bp, XFS_BUF_ADDR(bp)); - xfs_buf_relse(bp); - goto error_return; - } - if (pathlen < byte_cnt) - byte_cnt = pathlen; - pathlen -= byte_cnt; + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); + ASSERT(ip->i_d.di_size <= MAXPATHLEN); - error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop); - xfs_buf_relse (bp); - } + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); } -error_return: + out: xfs_iunlock(ip, XFS_ILOCK_SHARED); return error; } - /* * xfs_fsync * - * This is called to sync the inode and its data out to disk. - * We need to hold the I/O lock while flushing the data, and - * the inode lock while flushing the inode. The inode lock CANNOT - * be held while flushing the data, so acquire after we're done - * with that. + * This is called to sync the inode and its data out to disk. We need to hold + * the I/O lock while flushing the data, and the inode lock while flushing the + * inode. The inode lock CANNOT be held while flushing the data, so acquire + * after we're done with that. */ -STATIC int +int xfs_fsync( - bhv_desc_t *bdp, - int flag, - cred_t *credp, - xfs_off_t start, - xfs_off_t stop) + xfs_inode_t *ip) { - xfs_inode_t *ip; xfs_trans_t *tp; int error; int log_flushed = 0, changed = 1; - vn_trace_entry(BHV_TO_VNODE(bdp), - __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - - ASSERT(start >= 0 && stop >= -1); + xfs_itrace_entry(ip); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return XFS_ERROR(EIO); + /* capture size updates in I/O completion before writing the inode. */ + error = filemap_fdatawait(VFS_I(ip)->i_mapping); + if (error) + return XFS_ERROR(error); + /* - * We always need to make sure that the required inode state - * is safe on disk. The vnode might be clean but because - * of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional - * changes to the inode core that have to go to disk. + * We always need to make sure that the required inode state is safe on + * disk. The vnode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. * - * The following code depends on one assumption: that - * any transaction that changes an inode logs the core - * because it has to change some field in the inode core - * (typically nextents or nblocks). That assumption - * implies that any transactions against an inode will - * catch any non-transactional updates. If inode-altering - * transactions exist that violate this assumption, the - * code breaks. Right now, it figures that if the involved - * update_* field is clear and the inode is unpinned, the - * inode is clean. Either it's been flushed or it's been - * committed and the commit has hit the disk unpinning the inode. - * (Note that xfs_inode_item_format() called at commit clears - * the update_* fields.) + * This code relies on the assumption that if the update_* fields + * of the inode are clear and the inode is unpinned then it is clean + * and no action is required. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - /* If we are flushing data then we care about update_size - * being set, otherwise we care about update_core - */ - if ((flag & FSYNC_DATA) ? - (ip->i_update_size == 0) : - (ip->i_update_core == 0)) { + if (!(ip->i_update_size || ip->i_update_core)) { /* - * Timestamps/size haven't changed since last inode - * flush or inode transaction commit. That means - * either nothing got written or a transaction - * committed which caught the updates. If the - * latter happened and the transaction hasn't - * hit the disk yet, the inode will be still - * be pinned. If it is, force the log. + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will be still be pinned. If it is, + * force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { - _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, - XFS_LOG_FORCE | - ((flag & FSYNC_WAIT) - ? XFS_LOG_SYNC : 0), + error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC, &log_flushed); } else { /* - * If the inode is not pinned and nothing - * has changed we don't need to flush the - * cache. + * If the inode is not pinned and nothing has changed + * we don't need to flush the cache. */ changed = 0; } - error = 0; } else { /* - * Kick off a transaction to log the inode - * core to get the updates. Make it - * sync if FSYNC_WAIT is passed in (which - * is done by everybody but specfs). The - * sync transaction will also force the log. + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); - if ((error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), - 0, 0, 0))) { + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); /* - * Note - it's possible that we might have pushed - * ourselves out of the way during trans_reserve - * which would flush the inode. But there's no - * guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer - * could be pinned anyway if it's part of an - * inode in another recent transaction. So we - * play it safe and fire off the transaction anyway. + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. */ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flag & FSYNC_WAIT) - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, NULL, &log_flushed); + xfs_trans_set_sync(tp); + error = _xfs_trans_commit(tp, 0, &log_flushed); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -1194,7 +799,7 @@ xfs_fsync( * If this inode is on the RT dev we need to flush that * cache as well. */ - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) + if (XFS_IS_REALTIME_INODE(ip)) xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); } @@ -1202,13 +807,15 @@ xfs_fsync( } /* - * This is called by xfs_inactive to free any blocks beyond eof, - * when the link count isn't zero. + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. */ -STATIC int -xfs_inactive_free_eofblocks( +int +xfs_free_eofblocks( xfs_mount_t *mp, - xfs_inode_t *ip) + xfs_inode_t *ip, + int flags) { xfs_trans_t *tp; int error; @@ -1217,12 +824,13 @@ xfs_inactive_free_eofblocks( xfs_filblks_t map_len; int nimaps; xfs_bmbt_irec_t imap; + int use_iolock = (flags & XFS_FREE_EOF_LOCK); /* * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. */ - end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size)); + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); map_len = last_fsb - end_fsb; if (map_len <= 0) @@ -1230,7 +838,7 @@ xfs_inactive_free_eofblocks( nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0, + error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, NULL, 0, &imap, &nimaps, NULL, NULL); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -1257,9 +865,16 @@ xfs_inactive_free_eofblocks( * cache and we can't * do that within a transaction. */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, - ip->i_d.di_size); + if (use_iolock) + xfs_ilock(ip, XFS_IOLOCK_EXCL); + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, + ip->i_size); + if (error) { + xfs_trans_cancel(tp, 0); + if (use_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), @@ -1279,7 +894,7 @@ xfs_inactive_free_eofblocks( xfs_trans_ihold(tp, ip); error = xfs_itruncate_finish(&tp, ip, - ip->i_d.di_size, + ip->i_size, XFS_DATA_FORK, 0); /* @@ -1292,10 +907,10 @@ xfs_inactive_free_eofblocks( XFS_TRANS_ABORT)); } else { error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES, - NULL); + XFS_TRANS_RELEASE_LOG_RES); } - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) + : XFS_ILOCK_EXCL)); } return error; } @@ -1381,7 +996,7 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed))) + if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) goto error1; /* * The transaction must have been committed, since there were @@ -1407,7 +1022,7 @@ xfs_inactive_symlink_rmt( * we need to unlock the inode since the new transaction doesn't * have the inode attached. */ - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); tp = ntp; if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); @@ -1500,32 +1115,26 @@ xfs_inactive_attrs( int error; xfs_mount_t *mp; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); tp = *tpp; mp = ip->i_mount; ASSERT(ip->i_d.di_forkoff != 0); - xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto error_unlock; error = xfs_attr_inactive(ip); - if (error) { - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; /* goto out */ - } + if (error) + goto error_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_INACTIVE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } + if (error) + goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); @@ -1536,46 +1145,68 @@ xfs_inactive_attrs( *tpp = tp; return 0; + +error_cancel: + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); +error_unlock: + *tpp = NULL; + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; } -STATIC int +int xfs_release( - bhv_desc_t *bdp) + xfs_inode_t *ip) { - xfs_inode_t *ip; - bhv_vnode_t *vp; - xfs_mount_t *mp; + xfs_mount_t *mp = ip->i_mount; int error; - vp = BHV_TO_VNODE(bdp); - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) return 0; /* If this is a read-only mount, don't do this (would generate I/O) */ - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; -#ifdef HAVE_REFCACHE - /* If we are in the NFS reference cache then don't do this now */ - if (ip->i_refcache) - return 0; -#endif + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we are using filestreams, and we have an unlinked + * file that we are processing the last close on, then nothing + * will be able to reopen and write to this file. Purge this + * inode from the filestreams cache so that it doesn't delay + * teardown of the inode. + */ + if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) + xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); + } if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { - if ((error = xfs_inactive_free_eofblocks(mp, ip))) + error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + if (error) return error; - /* Update linux inode block count after free above */ - vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, - ip->i_d.di_nblocks + ip->i_delayed_blks); } } @@ -1590,13 +1221,10 @@ xfs_release( * now be truncated. Also, we clear all of the read-ahead state * kept for the inode here since the file is now closed. */ -STATIC int +int xfs_inactive( - bhv_desc_t *bdp, - cred_t *credp) + xfs_inode_t *ip) { - xfs_inode_t *ip; - bhv_vnode_t *vp; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; int committed; @@ -1605,16 +1233,13 @@ xfs_inactive( int error; int truncate; - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); + xfs_itrace_entry(ip); /* * If the inode is already free, then there can be nothing * to clean up here. */ - if (ip->i_d.di_mode == 0 || VN_BAD(vp)) { + if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) { ASSERT(ip->i_df.if_real_bytes == 0); ASSERT(ip->i_df.if_broot_bytes == 0); return VN_INACTIVE_CACHE; @@ -1627,36 +1252,32 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) || - (ip->i_delayed_blks > 0)) && + ((ip->i_d.di_size != 0) || (ip->i_size != 0) || + (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); mp = ip->i_mount; - if (ip->i_d.di_nlink == 0 && - DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) { - (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL); - } + if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) + XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL); error = 0; /* If this is a read-only mount, don't do this (would generate I/O) */ - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) goto out; if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || (ip->i_delayed_blks != 0)))) { - if ((error = xfs_inactive_free_eofblocks(mp, ip))) + error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + if (error) return VN_INACTIVE_CACHE; - /* Update linux inode block count after free above */ - vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, - ip->i_d.di_nblocks + ip->i_delayed_blks); } goto out; } @@ -1676,7 +1297,12 @@ xfs_inactive( */ xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return VN_INACTIVE_CACHE; + } error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), @@ -1787,12 +1413,18 @@ xfs_inactive( XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1); /* - * Just ignore errors at this point. There is - * nothing we can do except to try to keep going. + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. */ - (void) xfs_bmap_finish(&tp, &free_list, first_block, - &committed); - (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " + "xfs_bmap_finish() returned error %d", error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " + "xfs_trans_commit() returned error %d", error); } /* * Release the dquots held by inode, if any. @@ -1805,90 +1437,81 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - /* - * xfs_lookup + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. */ -STATIC int +int xfs_lookup( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vnode_t **vpp, - int flags, - bhv_vnode_t *rdir, - cred_t *credp) + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) { - xfs_inode_t *dp, *ip; - xfs_ino_t e_inum; + xfs_ino_t inum; int error; uint lock_mode; - bhv_vnode_t *dir_vp; - - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - dp = XFS_BHVTOI(dir_bdp); + xfs_itrace_entry(dp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip); - if (!error) { - *vpp = XFS_ITOV(ip); - ITRACE(ip); - } + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); xfs_iunlock_map_shared(dp, lock_mode); + + if (error) + goto out; + + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); + if (error) + goto out_free_name; + + xfs_itrace_ref(*ipp); + return 0; + +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: + *ipp = NULL; return error; } - -/* - * xfs_create (create a new file). - */ -STATIC int +int xfs_create( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vattr_t *vap, - bhv_vnode_t **vpp, + xfs_inode_t *dp, + struct xfs_name *name, + mode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp, cred_t *credp) { - char *name = VNAME(dentry); - bhv_vnode_t *dir_vp; - xfs_inode_t *dp, *ip; - bhv_vnode_t *vp = NULL; + xfs_mount_t *mp = dp->i_mount; + xfs_inode_t *ip; xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_dev_t rdev; - int error; + int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t dp_joined_to_trans; + boolean_t unlock_dp_on_error = B_FALSE; int dm_event_sent = 0; uint cancel_flags; int committed; xfs_prid_t prid; struct xfs_dquot *udqp, *gdqp; uint resblks; - int dm_di_mode; - int namelen; - ASSERT(!*vpp); - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + ASSERT(!*ipp); + xfs_itrace_entry(dp); - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; - - dm_di_mode = vap->va_mode; - namelen = VNAMELEN(dentry); - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) { + if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dir_vp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, name, NULL, - dm_di_mode, 0, 0); + dp, DM_RIGHT_NULL, NULL, + DM_RIGHT_NULL, name->name, NULL, + mode, 0, 0); if (error) return error; @@ -1903,8 +1526,6 @@ xfs_create( udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; - else if (vap->va_mask & XFS_AT_PROJID) - prid = (xfs_prid_t)vap->va_projid; else prid = (xfs_prid_t)dfltprid; @@ -1912,17 +1533,16 @@ xfs_create( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, + current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; ip = NULL; - dp_joined_to_trans = B_FALSE; tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_CREATE_SPACE_RES(mp, namelen); + resblks = XFS_CREATE_SPACE_RES(mp, name->len); /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not @@ -1938,11 +1558,11 @@ xfs_create( } if (error) { cancel_flags = 0; - dp = NULL; goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = B_TRUE; XFS_BMAP_INIT(&free_list, &first_block); @@ -1955,10 +1575,10 @@ xfs_create( if (error) goto error_return; - if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen))) + error = xfs_dir_canenter(tp, dp, name, resblks); + if (error) goto error_return; - rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0; - error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1, + error = xfs_dir_ialloc(&tp, dp, mode, 1, rdev, credp, prid, resblks > 0, &ip, &committed); if (error) { @@ -1966,27 +1586,27 @@ xfs_create( goto error_return; goto abort_return; } - ITRACE(ip); + xfs_itrace_ref(ip); /* * At this point, we've gotten a newly allocated inode. * It is locked (and joined to the transaction). */ - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* - * Now we join the directory inode to the transaction. - * We do not do it earlier because xfs_dir_ialloc - * might commit the previous transaction (and release - * all the locks). + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. */ - - VN_HOLD(dir_vp); + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - dp_joined_to_trans = B_TRUE; + unlock_dp_on_error = B_FALSE; - error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino, + error = xfs_dir_createname(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { @@ -2020,15 +1640,14 @@ xfs_create( * vnode to the caller, we bump the vnode ref count now. */ IHOLD(ip); - vp = XFS_ITOV(ip); - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); goto abort_rele; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) { IRELE(ip); tp = NULL; @@ -2038,25 +1657,18 @@ xfs_create( XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); - /* - * Propagate the fact that the vnode changed after the - * xfs_inode locks have been released. - */ - bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3); - - *vpp = vp; + *ipp = ip; /* Fallthrough to std_return with error = 0 */ std_return: - if ( (*vpp || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), - DM_EVENT_POSTCREATE)) { + if ((*ipp || (error != 0 && dm_event_sent != 0)) && + DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dir_vp, DM_RIGHT_NULL, - *vpp ? vp:NULL, - DM_RIGHT_NULL, name, NULL, - dm_di_mode, error, 0); + dp, DM_RIGHT_NULL, + *ipp ? ip : NULL, + DM_RIGHT_NULL, name->name, NULL, + mode, error, 0); } return error; @@ -2068,11 +1680,12 @@ std_return: if (tp != NULL) xfs_trans_cancel(tp, cancel_flags); - if (!dp_joined_to_trans && (dp != NULL)) - xfs_iunlock(dp, XFS_ILOCK_EXCL); XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + goto std_return; abort_rele: @@ -2092,119 +1705,28 @@ std_return: } #ifdef DEBUG -/* - * Some counters to see if (and how often) we are hitting some deadlock - * prevention code paths. - */ - -int xfs_rm_locks; -int xfs_rm_lock_delays; -int xfs_rm_attempts; +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; #endif /* - * The following routine will lock the inodes associated with the - * directory and the named entry in the directory. The locks are - * acquired in increasing inode number. - * - * If the entry is "..", then only the directory is locked. The - * vnode ref count will still include that from the .. entry in - * this case. - * - * There is a deadlock we need to worry about. If the locked directory is - * in the AIL, it might be blocking up the log. The next inode we lock - * could be already locked by another thread waiting for log space (e.g - * a permanent log reservation with a long running transaction (see - * xfs_itruncate_finish)). To solve this, we must check if the directory - * is in the ail and use lock_nowait. If we can't lock, we need to - * drop the inode lock on the directory and try again. xfs_iunlock will - * potentially push the tail if we were holding up the log. + * Bump the subclass so xfs_lock_inodes() acquires each lock with + * a different value */ -STATIC int -xfs_lock_dir_and_entry( - xfs_inode_t *dp, - bhv_vname_t *dentry, - xfs_inode_t *ip) /* inode of entry 'name' */ +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) { - int attempts; - xfs_ino_t e_inum; - xfs_inode_t *ips[2]; - xfs_log_item_t *lp; - -#ifdef DEBUG - xfs_rm_locks++; -#endif - attempts = 0; - -again: - xfs_ilock(dp, XFS_ILOCK_EXCL); - - e_inum = ip->i_ino; - - ITRACE(ip); - - /* - * We want to lock in increasing inum. Since we've already - * acquired the lock on the directory, we may need to release - * if if the inum of the entry turns out to be less. - */ - if (e_inum > dp->i_ino) { - /* - * We are already in the right order, so just - * lock on the inode of the entry. - * We need to use nowait if dp is in the AIL. - */ - - lp = (xfs_log_item_t *)dp->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - attempts++; -#ifdef DEBUG - xfs_rm_attempts++; -#endif - - /* - * Unlock dp and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_rm_lock_delays++; -#endif - } - goto again; - } - } else { - xfs_ilock(ip, XFS_ILOCK_EXCL); - } - } else if (e_inum < dp->i_ino) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; - ips[0] = ip; - ips[1] = dp; - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); - } - /* else e_inum == dp->i_ino */ - /* This can happen if we're asked to lock /x/.. - * the entry is "..", which is also the parent directory. - */ - - return 0; + return lock_mode; } -#ifdef DEBUG -int xfs_locked_n; -int xfs_small_retries; -int xfs_middle_retries; -int xfs_lots_retries; -int xfs_lock_delays; -#endif - /* * The following routine will lock n inodes in exclusive mode. * We assume the caller calls us with the inodes in i_ino order. @@ -2220,7 +1742,6 @@ void xfs_lock_inodes( xfs_inode_t **ips, int inodes, - int first_locked, uint lock_mode) { int attempts = 0, i, j, try_lock; @@ -2228,13 +1749,8 @@ xfs_lock_inodes( ASSERT(ips && (inodes >= 2)); /* we need at least two */ - if (first_locked) { - try_lock = 1; - i = 1; - } else { - try_lock = 0; - i = 0; - } + try_lock = 0; + i = 0; again: for (; i < inodes; i++) { @@ -2272,7 +1788,7 @@ again: * that is in the AIL. */ ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], lock_mode)) { + if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { attempts++; /* @@ -2307,7 +1823,7 @@ again: goto again; } } else { - xfs_ilock(ips[i], lock_mode); + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } } @@ -2322,96 +1838,102 @@ again: #endif } -#ifdef DEBUG -#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);} -int remove_which_error_return = 0; -#else /* ! DEBUG */ -#define REMOVE_DEBUG_TRACE(x) -#endif /* ! DEBUG */ - - /* - * xfs_remove - * + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. */ -STATIC int +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) +{ + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; + + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } + + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. + */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } +} + +int xfs_remove( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - cred_t *credp) + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) { - bhv_vnode_t *dir_vp; - char *name = VNAME(dentry); - xfs_inode_t *dp, *ip; + xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; - xfs_mount_t *mp; + int is_dir = S_ISDIR(ip->i_d.di_mode); int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; int cancel_flags; int committed; - int dm_di_mode = 0; int link_zero; uint resblks; - int namelen; + uint log_count; - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; + xfs_itrace_entry(dp); + xfs_itrace_entry(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - namelen = VNAMELEN(dentry); - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, - DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - name, NULL, 0, 0, 0); + if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, name->name, NULL, + ip->i_d.di_mode, 0, 0); if (error) return error; } - /* From this point on, return through std_return */ - ip = NULL; - - /* - * We need to get a reference to ip before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - error = xfs_get_dir_entry(dentry, &ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); + error = XFS_QM_DQATTACH(mp, dp, 0); + if (error) goto std_return; - } - - dm_di_mode = ip->i_d.di_mode; - - vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); - ITRACE(ip); - - error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != ip) - error = XFS_QM_DQATTACH(mp, ip, 0); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - IRELE(ip); + error = XFS_QM_DQATTACH(mp, ip, 0); + if (error) goto std_return; - } - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + if (is_dir) { + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + log_count = XFS_DEFAULT_LOG_COUNT; + } else { + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + log_count = XFS_REMOVE_LOG_COUNT; + } cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* * We try to get the real space reservation first, * allowing for directory btree deletion(s) implying @@ -2423,199 +1945,170 @@ xfs_remove( */ resblks = XFS_REMOVE_SPACE_RES(mp); error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); + XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); + XFS_TRANS_PERM_LOG_RES, log_count); } if (error) { ASSERT(error != ENOSPC); - REMOVE_DEBUG_TRACE(__LINE__); - xfs_trans_cancel(tp, 0); - IRELE(ip); - return error; + cancel_flags = 0; + goto out_trans_cancel; } - error = xfs_lock_dir_and_entry(dp, dentry, ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - xfs_trans_cancel(tp, cancel_flags); - IRELE(ip); - goto std_return; - } + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); /* * At this point, we've gotten both the directory and the entry * inodes locked. */ + IHOLD(ip); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != ip) { - /* - * Increment vnode ref count only in this case since - * there's an extra vnode reference in the case where - * dp == ip. - */ - IHOLD(dp); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - } + + IHOLD(dp); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* - * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. + * If we're removing a directory perform some additional validation. */ + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + } + XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino, - &first_block, &free_list, 0); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); if (error) { ASSERT(error != ENOENT); - REMOVE_DEBUG_TRACE(__LINE__); - goto error1; + goto out_bmap_cancel; } xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + /* + * Bump the in memory generation count on the parent + * directory so that other can know that it has changed. + */ dp->i_gen++; xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - error = xfs_droplink(tp, ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto error1; + if (is_dir) { + /* + * Drop the link from ip's "..". + */ + error = xfs_droplink(tp, dp); + if (error) + goto out_bmap_cancel; + + /* + * Drop the link from dp to ip. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here for the i_gen update. For a directory this is + * done implicitly by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } - /* Determine if this is the last link while - * we are in the transaction. + /* + * Drop the "." link from ip to self. */ - link_zero = (ip)->i_d.di_nlink==0; + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; /* - * Take an extra ref on the inode so that it doesn't - * go to xfs_inactive() from within the commit. + * Determine if this is the last link while + * we are in the transaction. */ - IHOLD(ip); + link_zero = (ip->i_d.di_nlink == 0); /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto error_rele; - } + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { - IRELE(ip); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) goto std_return; - } - - /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - - vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); /* - * Let interposed file systems know about removed links. + * If we are using filestreams, kill the stream association. + * If the file is still open it may get a new one but that + * will get killed on last close in xfs_close() so we don't + * have to worry about that. */ - bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero); + if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); - IRELE(ip); + xfs_itrace_exit(ip); + xfs_itrace_exit(dp); -/* Fall through to std_return with error = 0 */ std_return: - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, - DM_EVENT_POSTREMOVE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dir_vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, error, 0); + if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { + XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, name->name, NULL, + ip->i_d.di_mode, error, 0); } - return error; - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - xfs_trans_cancel(tp, cancel_flags); - goto std_return; + return error; - error_rele: - /* - * In this case make sure to not release the inode until after - * the current transaction is aborted. Releasing it beforehand - * can cause us to go to xfs_inactive and start a recursive - * transaction which can easily deadlock with the current one. - */ + out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - - /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - - IRELE(ip); - goto std_return; } - -/* - * xfs_link - * - */ -STATIC int +int xfs_link( - bhv_desc_t *target_dir_bdp, - bhv_vnode_t *src_vp, - bhv_vname_t *dentry, - cred_t *credp) + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) { - xfs_inode_t *tdp, *sip; + xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_inode_t *ips[2]; int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; int cancel_flags; int committed; - bhv_vnode_t *target_dir_vp; int resblks; - char *target_name = VNAME(dentry); - int target_namelen; - target_dir_vp = BHV_TO_VNODE(target_dir_bdp); - vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address); - vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address); + xfs_itrace_entry(tdp); + xfs_itrace_entry(sip); - target_namelen = VNAMELEN(dentry); - ASSERT(!VN_ISDIR(src_vp)); + ASSERT(!S_ISDIR(sip->i_d.di_mode)); - sip = xfs_vtoi(src_vp); - tdp = XFS_BHVTOI(target_dir_bdp); - mp = tdp->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) { + if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, - target_dir_vp, DM_RIGHT_NULL, - src_vp, DM_RIGHT_NULL, - target_name, NULL, 0, 0, 0); + tdp, DM_RIGHT_NULL, + sip, DM_RIGHT_NULL, + target_name->name, NULL, 0, 0, 0); if (error) return error; } @@ -2630,7 +2123,7 @@ xfs_link( tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_LINK_SPACE_RES(mp, target_namelen); + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); if (error == ENOSPC) { @@ -2643,23 +2136,15 @@ xfs_link( goto error_return; } - if (sip->i_ino < tdp->i_ino) { - ips[0] = sip; - ips[1] = tdp; - } else { - ips[0] = tdp; - ips[1] = sip; - } - - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); /* * Increment vnode ref counts since xfs_trans_commit & * xfs_trans_cancel will both unlock the inodes and * decrement the associated ref counts. */ - VN_HOLD(src_vp); - VN_HOLD(target_dir_vp); + IHOLD(sip); + IHOLD(tdp); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); @@ -2682,15 +2167,14 @@ xfs_link( goto error_return; } - if (resblks == 0 && - (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen))) + error = xfs_dir_canenter(tp, tdp, target_name, resblks); + if (error) goto error_return; XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_dir_createname(tp, tdp, target_name, target_namelen, - sip->i_ino, &first_block, &free_list, - resblks); + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); if (error) goto abort_return; xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2710,24 +2194,23 @@ xfs_link( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); goto abort_return; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) goto std_return; /* Fall through to std_return with error = 0. */ std_return: - if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip, - DM_EVENT_POSTLINK)) { + if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, - target_dir_vp, DM_RIGHT_NULL, - src_vp, DM_RIGHT_NULL, - target_name, NULL, 0, error, 0); + tdp, DM_RIGHT_NULL, + sip, DM_RIGHT_NULL, + target_name->name, NULL, 0, error, 0); } return error; @@ -2741,57 +2224,39 @@ std_return: } -/* - * xfs_mkdir - * - */ -STATIC int +int xfs_mkdir( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vattr_t *vap, - bhv_vnode_t **vpp, + xfs_inode_t *dp, + struct xfs_name *dir_name, + mode_t mode, + xfs_inode_t **ipp, cred_t *credp) { - char *dir_name = VNAME(dentry); - xfs_inode_t *dp; + xfs_mount_t *mp = dp->i_mount; xfs_inode_t *cdp; /* inode of created dir */ - bhv_vnode_t *cvp; /* vnode of created dir */ xfs_trans_t *tp; - xfs_mount_t *mp; int cancel_flags; int error; int committed; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - bhv_vnode_t *dir_vp; - boolean_t dp_joined_to_trans; + boolean_t unlock_dp_on_error = B_FALSE; boolean_t created = B_FALSE; int dm_event_sent = 0; xfs_prid_t prid; struct xfs_dquot *udqp, *gdqp; uint resblks; - int dm_di_mode; - int dir_namelen; - - dir_vp = BHV_TO_VNODE(dir_bdp); - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - dir_namelen = VNAMELEN(dentry); - tp = NULL; - dp_joined_to_trans = B_FALSE; - dm_di_mode = vap->va_mode; - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) { + if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dir_vp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, dir_name, NULL, - dm_di_mode, 0, 0); + dp, DM_RIGHT_NULL, NULL, + DM_RIGHT_NULL, dir_name->name, NULL, + mode, 0, 0); if (error) return error; dm_event_sent = 1; @@ -2799,14 +2264,12 @@ xfs_mkdir( /* Return through std_return after this point. */ - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + xfs_itrace_entry(dp); mp = dp->i_mount; udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; - else if (vap->va_mask & XFS_AT_PROJID) - prid = (xfs_prid_t)vap->va_projid; else prid = (xfs_prid_t)dfltprid; @@ -2814,14 +2277,14 @@ xfs_mkdir( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, + current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen); + resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len); error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); if (error == ENOSPC) { @@ -2832,11 +2295,11 @@ xfs_mkdir( } if (error) { cancel_flags = 0; - dp = NULL; goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = B_TRUE; /* * Check for directory link count overflow. @@ -2853,13 +2316,13 @@ xfs_mkdir( if (error) goto error_return; - if (resblks == 0 && - (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen))) + error = xfs_dir_canenter(tp, dp, dir_name, resblks); + if (error) goto error_return; /* * create the directory inode. */ - error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2, + error = xfs_dir_ialloc(&tp, dp, mode, 2, 0, credp, prid, resblks > 0, &cdp, NULL); if (error) { @@ -2867,23 +2330,25 @@ xfs_mkdir( goto error_return; goto abort_return; } - ITRACE(cdp); + xfs_itrace_ref(cdp); /* * Now we add the directory inode to the transaction. * We waited until now since xfs_dir_ialloc might start * a new transaction. Had we joined the transaction - * earlier, the locks might have gotten released. + * earlier, the locks might have gotten released. An error + * from here on will result in the transaction cancel + * unlocking dp so don't do it explicitly in the error path. */ - VN_HOLD(dir_vp); + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - dp_joined_to_trans = B_TRUE; + unlock_dp_on_error = B_FALSE; XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino, - &first_block, &free_list, resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != ENOSPC); goto error1; @@ -2906,11 +2371,9 @@ xfs_mkdir( if (error) goto error2; - cvp = XFS_ITOV(cdp); - created = B_TRUE; - *vpp = cvp; + *ipp = cdp; IHOLD(cdp); /* @@ -2927,13 +2390,13 @@ xfs_mkdir( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { IRELE(cdp); goto error2; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); if (error) { @@ -2944,15 +2407,14 @@ xfs_mkdir( * xfs_trans_commit. */ std_return: - if ( (created || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), - DM_EVENT_POSTCREATE)) { + if ((created || (error != 0 && dm_event_sent != 0)) && + DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dir_vp, DM_RIGHT_NULL, - created ? XFS_ITOV(cdp):NULL, + dp, DM_RIGHT_NULL, + created ? cdp : NULL, DM_RIGHT_NULL, - dir_name, NULL, - dm_di_mode, error, 0); + dir_name->name, NULL, + mode, error, 0); } return error; @@ -2966,308 +2428,29 @@ std_return: XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); - if (!dp_joined_to_trans && (dp != NULL)) { + if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - } - - goto std_return; -} - - -/* - * xfs_rmdir - * - */ -STATIC int -xfs_rmdir( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - cred_t *credp) -{ - char *name = VNAME(dentry); - xfs_inode_t *dp; - xfs_inode_t *cdp; /* child directory */ - xfs_trans_t *tp; - xfs_mount_t *mp; - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - bhv_vnode_t *dir_vp; - int dm_di_mode = 0; - int last_cdp_link; - int namelen; - uint resblks; - - dir_vp = BHV_TO_VNODE(dir_bdp); - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; - - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount)) - return XFS_ERROR(EIO); - namelen = VNAMELEN(dentry); - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, - dir_vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name, NULL, 0, 0, 0); - if (error) - return XFS_ERROR(error); - } - - /* Return through std_return after this point. */ - - cdp = NULL; - - /* - * We need to get a reference to cdp before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - error = xfs_get_dir_entry(dentry, &cdp); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - mp = dp->i_mount; - dm_di_mode = cdp->i_d.di_mode; - - /* - * Get the dquots for the inodes. - */ - error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != cdp) - error = XFS_QM_DQATTACH(mp, cdp, 0); - if (error) { - IRELE(cdp); - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. - */ - resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); - } - if (error) { - ASSERT(error != ENOSPC); - cancel_flags = 0; - IRELE(cdp); - goto error_return; - } - XFS_BMAP_INIT(&free_list, &first_block); - - /* - * Now lock the child directory inode and the parent directory - * inode in the proper order. This will take care of validating - * that the directory entry for the child directory inode has - * not changed while we were obtaining a log reservation. - */ - error = xfs_lock_dir_and_entry(dp, dentry, cdp); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - IRELE(cdp); - goto std_return; - } - - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != cdp) { - /* - * Only increment the parent directory vnode count if - * we didn't bump it in looking up cdp. The only time - * we don't bump it is when we're looking up ".". - */ - VN_HOLD(dir_vp); - } - - ITRACE(cdp); - xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); - - ASSERT(cdp->i_d.di_nlink >= 2); - if (cdp->i_d.di_nlink != 2) { - error = XFS_ERROR(ENOTEMPTY); - goto error_return; - } - if (!xfs_dir_isempty(cdp)) { - error = XFS_ERROR(ENOTEMPTY); - goto error_return; - } - - error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error1; - - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Bump the in memory generation count on the parent - * directory so that other can know that it has changed. - */ - dp->i_gen++; - - /* - * Drop the link from cdp's "..". - */ - error = xfs_droplink(tp, dp); - if (error) { - goto error1; - } - - /* - * Drop the link from dp to cdp. - */ - error = xfs_droplink(tp, cdp); - if (error) { - goto error1; - } - - /* - * Drop the "." link from cdp to self. - */ - error = xfs_droplink(tp, cdp); - if (error) { - goto error1; - } - - /* Determine these before committing transaction */ - last_cdp_link = (cdp)->i_d.di_nlink==0; - - /* - * Take an extra ref on the child vnode so that it - * does not go to xfs_inactive() from within the commit. - */ - IHOLD(cdp); - - /* - * If this is a synchronous mount, make sure that the - * rmdir transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - IRELE(cdp); - goto std_return; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { - IRELE(cdp); - goto std_return; - } - - - /* - * Let interposed file systems know about removed links. - */ - bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link); - - IRELE(cdp); - /* Fall through to std_return with error = 0 or the errno - * from xfs_trans_commit. */ - std_return: - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dir_vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, - error, 0); - } - return error; - - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - - error_return: - xfs_trans_cancel(tp, cancel_flags); goto std_return; } - -/* - * Read dp's entries starting at uiop->uio_offset and translate them into - * bufsize bytes worth of struct dirents starting at bufbase. - */ -STATIC int -xfs_readdir( - bhv_desc_t *dir_bdp, - uio_t *uiop, - cred_t *credp, - int *eofp) -{ - xfs_inode_t *dp; - xfs_trans_t *tp = NULL; - int error = 0; - uint lock_mode; - - vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__, - (inst_t *)__return_address); - dp = XFS_BHVTOI(dir_bdp); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); - - lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_getdents(tp, dp, uiop, eofp); - xfs_iunlock_map_shared(dp, lock_mode); - return error; -} - - -STATIC int +int xfs_symlink( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vattr_t *vap, - char *target_path, - bhv_vnode_t **vpp, + xfs_inode_t *dp, + struct xfs_name *link_name, + const char *target_path, + mode_t mode, + xfs_inode_t **ipp, cred_t *credp) { + xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_inode_t *dp; xfs_inode_t *ip; int error; int pathlen; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t dp_joined_to_trans; - bhv_vnode_t *dir_vp; + boolean_t unlock_dp_on_error = B_FALSE; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -3275,69 +2458,35 @@ xfs_symlink( int nmaps; xfs_bmbt_irec_t mval[SYMLINK_MAPS]; xfs_daddr_t d; - char *cur_chunk; + const char *cur_chunk; int byte_cnt; int n; xfs_buf_t *bp; xfs_prid_t prid; struct xfs_dquot *udqp, *gdqp; uint resblks; - char *link_name = VNAME(dentry); - int link_namelen; - *vpp = NULL; - dir_vp = BHV_TO_VNODE(dir_bdp); - dp = XFS_BHVTOI(dir_bdp); - dp_joined_to_trans = B_FALSE; + *ipp = NULL; error = 0; ip = NULL; tp = NULL; - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - mp = dp->i_mount; + xfs_itrace_entry(dp); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - link_namelen = VNAMELEN(dentry); - /* * Check component lengths of the target path name. */ pathlen = strlen(target_path); if (pathlen >= MAXPATHLEN) /* total string too long */ return XFS_ERROR(ENAMETOOLONG); - if (pathlen >= MAXNAMELEN) { /* is any component too long? */ - int len, total; - char *path; - - for (total = 0, path = target_path; total < pathlen;) { - /* - * Skip any slashes. - */ - while(*path == '/') { - total++; - path++; - } - - /* - * Count up to the next slash or end of path. - * Error out if the component is bigger than MAXNAMELEN. - */ - for(len = 0; *path != '/' && total < pathlen;total++, path++) { - if (++len >= MAXNAMELEN) { - error = ENAMETOOLONG; - return error; - } - } - } - } - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp, + if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - link_name, target_path, 0, 0, 0); + link_name->name, target_path, 0, 0, 0); if (error) return error; } @@ -3347,8 +2496,6 @@ xfs_symlink( udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; - else if (vap->va_mask & XFS_AT_PROJID) - prid = (xfs_prid_t)vap->va_projid; else prid = (xfs_prid_t)dfltprid; @@ -3356,7 +2503,7 @@ xfs_symlink( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, + current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; @@ -3371,7 +2518,7 @@ xfs_symlink( fs_blocks = 0; else fs_blocks = XFS_B_TO_FSB(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); if (error == ENOSPC && fs_blocks == 0) { @@ -3381,11 +2528,11 @@ xfs_symlink( } if (error) { cancel_flags = 0; - dp = NULL; goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = B_TRUE; /* * Check whether the directory allows new symlinks or not. @@ -3405,8 +2552,8 @@ xfs_symlink( /* * Check for ability to enter directory entry, if no space reserved. */ - if (resblks == 0 && - (error = xfs_dir_canenter(tp, dp, link_name, link_namelen))) + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) goto error_return; /* * Initialize the bmap freelist prior to calling either @@ -3417,18 +2564,23 @@ xfs_symlink( /* * Allocate an inode for the symlink. */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT), + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, credp, prid, resblks > 0, &ip, NULL); if (error) { if (error == ENOSPC) goto error_return; goto error1; } - ITRACE(ip); + xfs_itrace_ref(ip); - VN_HOLD(dir_vp); + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - dp_joined_to_trans = B_TRUE; + unlock_dp_on_error = B_FALSE; /* * Also attach the dquot(s) to it, if applicable. @@ -3493,8 +2645,8 @@ xfs_symlink( /* * Create the directory entry for the symlink. */ - error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino, - &first_block, &free_list, resblks); + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); if (error) goto error1; xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3523,33 +2675,27 @@ xfs_symlink( */ IHOLD(ip); - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { goto error2; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); /* Fall through to std_return with error = 0 or errno from * xfs_trans_commit */ std_return: - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), - DM_EVENT_POSTSYMLINK)) { + if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, - dir_vp, DM_RIGHT_NULL, - error ? NULL : XFS_ITOV(ip), - DM_RIGHT_NULL, link_name, target_path, - 0, error, 0); + dp, DM_RIGHT_NULL, + error ? NULL : ip, + DM_RIGHT_NULL, link_name->name, + target_path, 0, error, 0); } - if (!error) { - bhv_vnode_t *vp; - - ASSERT(ip); - vp = XFS_ITOV(ip); - *vpp = vp; - } + if (!error) + *ipp = ip; return error; error2: @@ -3562,124 +2708,20 @@ std_return: XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); - if (!dp_joined_to_trans && (dp != NULL)) { + if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - } goto std_return; } - -/* - * xfs_fid2 - * - * A fid routine that takes a pointer to a previously allocated - * fid structure (like xfs_fast_fid) but uses a 64 bit inode number. - */ -STATIC int -xfs_fid2( - bhv_desc_t *bdp, - fid_t *fidp) -{ - xfs_inode_t *ip; - xfs_fid2_t *xfid; - - vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__, - (inst_t *)__return_address); - ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t)); - - xfid = (xfs_fid2_t *)fidp; - ip = XFS_BHVTOI(bdp); - xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len); - xfid->fid_pad = 0; - /* - * use memcpy because the inode is a long long and there's no - * assurance that xfid->fid_ino is properly aligned. - */ - memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino)); - xfid->fid_gen = ip->i_d.di_gen; - - return 0; -} - - -/* - * xfs_rwlock - */ int -xfs_rwlock( - bhv_desc_t *bdp, - bhv_vrwlock_t locktype) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - if (VN_ISDIR(vp)) - return 1; - ip = XFS_BHVTOI(bdp); - if (locktype == VRWLOCK_WRITE) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - } else if (locktype == VRWLOCK_TRY_READ) { - return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED); - } else if (locktype == VRWLOCK_TRY_WRITE) { - return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL); - } else { - ASSERT((locktype == VRWLOCK_READ) || - (locktype == VRWLOCK_WRITE_DIRECT)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - return 1; -} - - -/* - * xfs_rwunlock - */ -void -xfs_rwunlock( - bhv_desc_t *bdp, - bhv_vrwlock_t locktype) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - if (VN_ISDIR(vp)) - return; - ip = XFS_BHVTOI(bdp); - if (locktype == VRWLOCK_WRITE) { - /* - * In the write case, we may have added a new entry to - * the reference cache. This might store a pointer to - * an inode to be released in this inode. If it is there, - * clear the pointer and release the inode after unlocking - * this one. - */ - xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL); - } else { - ASSERT((locktype == VRWLOCK_READ) || - (locktype == VRWLOCK_WRITE_DIRECT)); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - } - return; -} - -STATIC int xfs_inode_flush( - bhv_desc_t *bdp, + xfs_inode_t *ip, int flags) { - xfs_inode_t *ip; - xfs_mount_t *mp; - xfs_inode_log_item_t *iip; + xfs_mount_t *mp = ip->i_mount; int error = 0; - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - iip = ip->i_itemp; - if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -3687,29 +2729,9 @@ xfs_inode_flush( * Bypass inodes which have already been cleaned by * the inode flush clustering code inside xfs_iflush */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) + if (xfs_inode_clean(ip)) return 0; - if (flags & FLUSH_LOG) { - if (iip && iip->ili_last_lsn) { - xlog_t *log = mp->m_log; - xfs_lsn_t sync_lsn; - int s, log_flags = XFS_LOG_FORCE; - - s = GRANT_LOCK(log); - sync_lsn = log->l_last_sync_lsn; - GRANT_UNLOCK(log, s); - - if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0)) - return 0; - - if (flags & FLUSH_SYNC) - log_flags |= XFS_LOG_SYNC; - return xfs_log_force(mp, iip->ili_last_lsn, log_flags); - } - } - /* * We make this non-blocking if the inode is contended, * return EAGAIN to indicate to the caller that they @@ -3717,54 +2739,39 @@ xfs_inode_flush( * blocking on inodes inside another operation right * now, they get caught later by xfs_sync. */ - if (flags & FLUSH_INODE) { - int flush_flags; - - if (xfs_ipincount(ip)) - return EAGAIN; - - if (flags & FLUSH_SYNC) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return EAGAIN; - } - } else { + if (flags & FLUSH_SYNC) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_iflock(ip); + } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); return EAGAIN; } - - if (flags & FLUSH_SYNC) - flush_flags = XFS_IFLUSH_SYNC; - else - flush_flags = XFS_IFLUSH_ASYNC; - - error = xfs_iflush(ip, flush_flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); + } else { + return EAGAIN; } + error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC + : XFS_IFLUSH_ASYNC_NOBLOCK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; } + int -xfs_set_dmattrs ( - bhv_desc_t *bdp, +xfs_set_dmattrs( + xfs_inode_t *ip, u_int evmask, - u_int16_t state, - cred_t *credp) + u_int16_t state) { - xfs_inode_t *ip; + xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; - xfs_mount_t *mp; int error; if (!capable(CAP_SYS_ADMIN)) return XFS_ERROR(EPERM); - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -3777,37 +2784,32 @@ xfs_set_dmattrs ( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask; - ip->i_iocore.io_dmstate = ip->i_d.di_dmstate = state; + ip->i_d.di_dmevmask = evmask; + ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); IHOLD(ip); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); return error; } -STATIC int +int xfs_reclaim( - bhv_desc_t *bdp) + xfs_inode_t *ip) { - xfs_inode_t *ip; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - ip = XFS_BHVTOI(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + xfs_itrace_entry(ip); - ASSERT(!VN_MAPPED(vp)); + ASSERT(!VN_MAPPED(VFS_I(ip))); /* bad inode, get out here ASAP */ - if (VN_BAD(vp)) { + if (VN_BAD(VFS_I(ip))) { xfs_ireclaim(ip); return 0; } - vn_iowait(vp); + vn_iowait(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -3817,11 +2819,16 @@ xfs_reclaim( */ xfs_synchronize_atime(ip); - /* If we have nothing to flush with this inode then complete the - * teardown now, otherwise break the link between the xfs inode - * and the linux inode and clean up the xfs inode later. This - * avoids flushing the inode to disk during the delete operation - * itself. + /* + * If we have nothing to flush with this inode then complete the + * teardown now, otherwise break the link between the xfs inode and the + * linux inode and clean up the xfs inode later. This avoids flushing + * the inode to disk during the delete operation itself. + * + * When breaking the link, we need to set the XFS_IRECLAIMABLE flag + * first to ensure that xfs_iunpin() will never see an xfs inode + * that has a linux inode being reclaimed. Synchronisation is provided + * by the i_flags_lock. */ if (!ip->i_update_core && (ip->i_itemp == NULL)) { xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -3830,11 +2837,14 @@ xfs_reclaim( } else { xfs_mount_t *mp = ip->i_mount; - /* Protect sync from us */ + /* Protect sync and unpin from us */ XFS_MOUNT_ILOCK(mp); - vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); + spin_lock(&ip->i_flags_lock); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + VFS_I(ip)->i_private = NULL; + ip->i_vnode = NULL; + spin_unlock(&ip->i_flags_lock); list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); - ip->i_flags |= XFS_IRECLAIMABLE; XFS_MOUNT_IUNLOCK(mp); } return 0; @@ -3846,9 +2856,8 @@ xfs_finish_reclaim( int locked, int sync_mode) { - xfs_ihash_t *ih = ip->i_hash; - bhv_vnode_t *vp = XFS_ITOV_NULL(ip); - int error; + xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); + struct inode *vp = VFS_I(ip); if (vp && VN_BAD(vp)) goto reclaim; @@ -3858,18 +2867,22 @@ xfs_finish_reclaim( * Once we have the XFS_IRECLAIM flag set it will not touch * us. */ - write_lock(&ih->ih_lock); - if ((ip->i_flags & XFS_IRECLAIM) || - (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) { - write_unlock(&ih->ih_lock); + write_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + if (__xfs_iflags_test(ip, XFS_IRECLAIM) || + (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) { + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); if (locked) { xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); } return 1; } - ip->i_flags |= XFS_IRECLAIM; - write_unlock(&ih->ih_lock); + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); + xfs_put_perag(ip->i_mount, pag); /* * If the inode is still dirty, then flush it out. If the inode @@ -3882,42 +2895,23 @@ xfs_finish_reclaim( * We get the flush lock regardless, though, just to make sure * we don't free it while it is being flushed. */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - if (!locked) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - } - - if (ip->i_update_core || - ((ip->i_itemp != NULL) && - (ip->i_itemp->ili_format.ilf_fields != 0))) { - error = xfs_iflush(ip, sync_mode); - /* - * If we hit an error, typically because of filesystem - * shutdown, we don't need to let vn_reclaim to know - * because we're gonna reclaim the inode anyway. - */ - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto reclaim; - } - xfs_iflock(ip); /* synchronize with xfs_iflush_done */ - } + if (!locked) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + } - ASSERT(ip->i_update_core == 0); - ASSERT(ip->i_itemp == NULL || - ip->i_itemp->ili_format.ilf_fields == 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else if (locked) { - /* - * We are not interested in doing an iflush if we're - * in the process of shutting down the filesystem forcibly. - * So, just reclaim the inode. - */ + /* + * In the case of a forced shutdown we rely on xfs_iflush() to + * wait for the inode to be unpinned before returning an error. + */ + if (xfs_iflush(ip, sync_mode) == 0) { + /* synchronize with xfs_iflush_done */ + xfs_iflock(ip); xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + reclaim: xfs_ireclaim(ip); return 0; @@ -4003,27 +2997,21 @@ xfs_alloc_file_space( int committed; int error; - vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); + xfs_itrace_entry(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = XFS_IS_REALTIME_INODE(ip); - if (unlikely(rt)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) return error; if (len <= 0) return XFS_ERROR(EINVAL); + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + count = len; - error = 0; imapp = &imaps[0]; nimaps = 1; bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); @@ -4031,17 +3019,16 @@ xfs_alloc_file_space( allocatesize_fsb = XFS_B_TO_FSB(mp, count); /* Generate a DMAPI event if needed. */ - if (alloc_type != 0 && offset < ip->i_d.di_size && - (attr_flags&ATTR_DMI) == 0 && - DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { + if (alloc_type != 0 && offset < ip->i_size && + (attr_flags & XFS_ATTR_DMI) == 0 && + DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { xfs_off_t end_dmi_offset; end_dmi_offset = offset+len; - if (end_dmi_offset > ip->i_d.di_size) - end_dmi_offset = ip->i_d.di_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), - offset, end_dmi_offset - offset, - 0, NULL); + if (end_dmi_offset > ip->i_size) + end_dmi_offset = ip->i_size; + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset, + end_dmi_offset - offset, 0, NULL); if (error) return error; } @@ -4114,7 +3101,7 @@ retry: * Issue the xfs_bmapi() call to allocate the blocks */ XFS_BMAP_INIT(&free_list, &firstfsb); - error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb, + error = xfs_bmapi(tp, ip, startoffset_fsb, allocatesize_fsb, bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list, NULL); @@ -4125,12 +3112,12 @@ retry: /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { break; @@ -4147,12 +3134,11 @@ retry: allocatesize_fsb -= allocated_fsb; } dmapi_enospc_check: - if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 && - DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) { - + if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 && + DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, - XFS_ITOV(ip), DM_RIGHT_NULL, - XFS_ITOV(ip), DM_RIGHT_NULL, + ip, DM_RIGHT_NULL, + ip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error == 0) goto retry; /* Maybe DMAPI app. has made space */ @@ -4174,6 +3160,13 @@ error1: /* Just cancel transaction */ /* * Zero file bytes between startoff and endoff inclusive. * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. */ STATIC int xfs_zero_remaining_bytes( @@ -4190,14 +3183,25 @@ xfs_zero_remaining_bytes( int nimap; int error = 0; + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, - ip->i_d.di_flags & XFS_DIFLAG_REALTIME ? + XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp); for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0, + error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); if (error || nimap < 1) break; @@ -4216,7 +3220,8 @@ xfs_zero_remaining_bytes( XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); xfsbdstrat(mp, bp); - if ((error = xfs_iowait(bp))) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", mp, bp, XFS_BUF_ADDR(bp)); break; @@ -4228,7 +3233,8 @@ xfs_zero_remaining_bytes( XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); xfsbdstrat(mp, bp); - if ((error = xfs_iowait(bp))) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", mp, bp, XFS_BUF_ADDR(bp)); break; @@ -4257,7 +3263,6 @@ xfs_free_file_space( xfs_off_t len, int attr_flags) { - bhv_vnode_t *vp; int committed; int done; xfs_off_t end_dmi_offset; @@ -4265,23 +3270,21 @@ xfs_free_file_space( int error; xfs_fsblock_t firstfsb; xfs_bmap_free_t free_list; - xfs_off_t ilen; xfs_bmbt_irec_t imap; xfs_off_t ioffset; xfs_extlen_t mod=0; xfs_mount_t *mp; int nimap; uint resblks; - int rounding; + uint rounding; int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; int need_iolock = 1; - vp = XFS_ITOV(ip); mp = ip->i_mount; - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + xfs_itrace_entry(ip); if ((error = XFS_QM_DQATTACH(mp, ip, 0))) return error; @@ -4289,42 +3292,37 @@ xfs_free_file_space( error = 0; if (len <= 0) /* if nothing being freed */ return error; - rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); + rt = XFS_IS_REALTIME_INODE(ip); startoffset_fsb = XFS_B_TO_FSB(mp, offset); end_dmi_offset = offset + len; endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); - if (offset < ip->i_d.di_size && - (attr_flags & ATTR_DMI) == 0 && - DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { - if (end_dmi_offset > ip->i_d.di_size) - end_dmi_offset = ip->i_d.di_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, + if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 && + DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { + if (end_dmi_offset > ip->i_size) + end_dmi_offset = ip->i_size; + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset, end_dmi_offset - offset, AT_DELAY_FLAG(attr_flags), NULL); if (error) return error; } - if (attr_flags & ATTR_NOLOCK) + if (attr_flags & XFS_ATTR_NOLOCK) need_iolock = 0; if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); - vn_iowait(vp); /* wait for the completion of any pending DIOs */ + vn_iowait(ip); /* wait for the completion of any pending DIOs */ } - rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog), - (__uint8_t)NBPP); - ilen = len + (offset & (rounding - 1)); + rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); - if (ilen & (rounding - 1)) - ilen = (ilen + rounding) & ~(rounding - 1); - - if (VN_CACHED(vp) != 0) { - xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1, - ctooff(offtoct(ioffset)), -1); - bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)), - -1, FI_REMAPF_LOCKED); + + if (VN_CACHED(VFS_I(ip)) != 0) { + xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1); + error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_iolock; } /* @@ -4333,9 +3331,9 @@ xfs_free_file_space( * actually need to zero the extent edges. Otherwise xfs_bunmapi * will take care of it for us. */ - if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb, + error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); if (error) goto out_unlock_iolock; @@ -4350,7 +3348,7 @@ xfs_free_file_space( startoffset_fsb += mp->m_sb.sb_rextsize - mod; } nimap = 1; - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1, + error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); if (error) goto out_unlock_iolock; @@ -4388,9 +3386,12 @@ xfs_free_file_space( while (!error && !done) { /* - * allocate and setup the transaction + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), @@ -4423,7 +3424,7 @@ xfs_free_file_space( * issue the bunmapi() call to free the blocks */ XFS_BMAP_INIT(&free_list, &firstfsb); - error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb, + error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, 0, 2, &firstfsb, &free_list, NULL, &done); if (error) { @@ -4433,12 +3434,12 @@ xfs_free_file_space( /* * complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -4469,46 +3470,28 @@ xfs_free_file_space( */ int xfs_change_file_space( - bhv_desc_t *bdp, + xfs_inode_t *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, cred_t *credp, int attr_flags) { + xfs_mount_t *mp = ip->i_mount; int clrprealloc; int error; xfs_fsize_t fsize; - xfs_inode_t *ip; - xfs_mount_t *mp; int setprealloc; xfs_off_t startoffset; xfs_off_t llen; xfs_trans_t *tp; - bhv_vattr_t va; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + struct iattr iattr; - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; + xfs_itrace_entry(ip); - /* - * must be a regular file and have write permission - */ - if (!VN_ISREG(vp)) + if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); - xfs_ilock(ip, XFS_ILOCK_SHARED); - - if ((error = xfs_iaccess(ip, S_IWUSR, credp))) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; - } - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; @@ -4516,7 +3499,7 @@ xfs_change_file_space( bf->l_start += offset; break; case 2: /*SEEK_END*/ - bf->l_start += ip->i_d.di_size; + bf->l_start += ip->i_size; break; default: return XFS_ERROR(EINVAL); @@ -4533,7 +3516,7 @@ xfs_change_file_space( bf->l_whence = 0; startoffset = bf->l_start; - fsize = ip->i_d.di_size; + fsize = ip->i_size; /* * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve @@ -4576,10 +3559,10 @@ xfs_change_file_space( break; } - va.va_mask = XFS_AT_SIZE; - va.va_size = startoffset; + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = startoffset; - error = xfs_setattr(bdp, &va, attr_flags, credp); + error = xfs_setattr(ip, &iattr, attr_flags, credp); if (error) return error; @@ -4609,7 +3592,7 @@ xfs_change_file_space( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - if ((attr_flags & ATTR_DMI) == 0) { + if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; /* @@ -4632,56 +3615,9 @@ xfs_change_file_space( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } - -bhv_vnodeops_t xfs_vnodeops = { - BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS), - .vop_open = xfs_open, - .vop_close = xfs_close, - .vop_read = xfs_read, -#ifdef HAVE_SENDFILE - .vop_sendfile = xfs_sendfile, -#endif -#ifdef HAVE_SPLICE - .vop_splice_read = xfs_splice_read, - .vop_splice_write = xfs_splice_write, -#endif - .vop_write = xfs_write, - .vop_ioctl = xfs_ioctl, - .vop_getattr = xfs_getattr, - .vop_setattr = xfs_setattr, - .vop_access = xfs_access, - .vop_lookup = xfs_lookup, - .vop_create = xfs_create, - .vop_remove = xfs_remove, - .vop_link = xfs_link, - .vop_rename = xfs_rename, - .vop_mkdir = xfs_mkdir, - .vop_rmdir = xfs_rmdir, - .vop_readdir = xfs_readdir, - .vop_symlink = xfs_symlink, - .vop_readlink = xfs_readlink, - .vop_fsync = xfs_fsync, - .vop_inactive = xfs_inactive, - .vop_fid2 = xfs_fid2, - .vop_rwlock = xfs_rwlock, - .vop_rwunlock = xfs_rwunlock, - .vop_bmap = xfs_bmap, - .vop_reclaim = xfs_reclaim, - .vop_attr_get = xfs_attr_get, - .vop_attr_set = xfs_attr_set, - .vop_attr_remove = xfs_attr_remove, - .vop_attr_list = xfs_attr_list, - .vop_link_removed = (vop_link_removed_t)fs_noval, - .vop_vnode_change = (vop_vnode_change_t)fs_noval, - .vop_tosspages = fs_tosspages, - .vop_flushinval_pages = fs_flushinval_pages, - .vop_flush_pages = fs_flush_pages, - .vop_release = xfs_release, - .vop_iflush = xfs_inode_flush, -}; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h new file mode 100644 index 0000000..e932a96 --- /dev/null +++ b/fs/xfs/xfs_vnodeops.h @@ -0,0 +1,82 @@ +#ifndef _XFS_VNODEOPS_H +#define _XFS_VNODEOPS_H 1 + +struct attrlist_cursor_kern; +struct cred; +struct file; +struct iattr; +struct inode; +struct iovec; +struct kiocb; +struct pipe_inode_info; +struct uio; +struct xfs_inode; +struct xfs_iomap; + + +int xfs_open(struct xfs_inode *ip); +int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags, + struct cred *credp); +#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ +#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ +#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ + +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_fsync(struct xfs_inode *ip); +int xfs_release(struct xfs_inode *ip); +int xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, + xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, + mode_t mode, struct xfs_inode **ipp, struct cred *credp); +int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, + xfs_off_t *offset, filldir_t filldir); +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, mode_t mode, struct xfs_inode **ipp, + struct cred *credp); +int xfs_inode_flush(struct xfs_inode *ip, int flags); +int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); +int xfs_reclaim(struct xfs_inode *ip); +int xfs_change_file_space(struct xfs_inode *ip, int cmd, + xfs_flock64_t *bf, xfs_off_t offset, + struct cred *credp, int attr_flags); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, struct xfs_inode *target_ip); +int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, + int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, + int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); +int xfs_ioctl(struct xfs_inode *ip, struct file *filp, + int ioflags, unsigned int cmd, void __user *arg); +ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, + const struct iovec *iovp, unsigned int segs, + loff_t *offset, int ioflags); +ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t count, + int flags, int ioflags); +ssize_t xfs_splice_write(struct xfs_inode *ip, + struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t count, int flags, int ioflags); +ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb, + const struct iovec *iovp, unsigned int nsegs, + loff_t *offset, int ioflags); +int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int flags, struct xfs_iomap *iomapp, int *niomaps); +void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, + xfs_off_t last, int fiopt); +int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, + xfs_off_t last, int fiopt); +int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, + xfs_off_t last, uint64_t flags, int fiopt); + +#endif /* _XFS_VNODEOPS_H */