/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"

/*
 * Perform initial CIL structure initialisation. If the CIL is not
 * enabled in this filesystem, ensure the log->l_cilp is null so
 * we can check this conditional to determine if we are doing delayed
 * logging or not.
 */
int
xlog_cil_init(
	struct log		*log)
{
	struct xfs_cil		*cil;
	struct xfs_cil_ctx	*ctx;

	log->l_cilp = NULL;
	if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
		return 0;

	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
	if (!cil)
		return ENOMEM;

	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
	if (!ctx) {
		kmem_free(cil);
		return ENOMEM;
	}

	INIT_LIST_HEAD(&cil->xc_cil);
	INIT_LIST_HEAD(&cil->xc_committing);
	spin_lock_init(&cil->xc_cil_lock);
	init_rwsem(&cil->xc_ctx_lock);
	init_waitqueue_head(&cil->xc_commit_wait);

	INIT_LIST_HEAD(&ctx->committing);
	INIT_LIST_HEAD(&ctx->busy_extents);
	ctx->sequence = 1;
	ctx->cil = cil;
	cil->xc_ctx = ctx;
	cil->xc_current_sequence = ctx->sequence;

	cil->xc_log = log;
	log->l_cilp = cil;
	return 0;
}

void
xlog_cil_destroy(
	struct log		*log)
{
	if (!log->l_cilp)
		return;

	if (log->l_cilp->xc_ctx) {
		if (log->l_cilp->xc_ctx->ticket)
			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
		kmem_free(log->l_cilp->xc_ctx);
	}

	ASSERT(list_empty(&log->l_cilp->xc_cil));
	kmem_free(log->l_cilp);
}

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
	struct log		*log)
{
	struct xlog_ticket	*tic;

	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
				KM_SLEEP|KM_NOFS);
	tic->t_trans_type = XFS_TRANS_CHECKPOINT;

	/*
	 * set the current reservation to zero so we know to steal the basic
	 * transaction overhead reservation from the first transaction commit.
	 */
	tic->t_curr_res = 0;
	return tic;
}

/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push. This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
	struct log		*log)
{
	if (!log->l_cilp)
		return;

	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
	log->l_cilp->xc_ctx->sequence = 1;
	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
								log->l_curr_block);
}

/*
 * Format log item into a flat buffer
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and formats the vector for the item into the buffer.
 * The buffer is then attached to the log item, and the log items are then
 * inserted into the Committed Item List for tracking until the next
 * checkpoint is written out.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer. Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundaries without needing a change in the format of
 * the item/region encapsulation.
 *
 * Hence what we need to do now is rewrite the vector array to point to the
 * copied regions inside the buffer we just allocated. This allows us to
 * format the regions into the iclog as though they are being formatted
 * directly out of the objects themselves.
 */
static void
xlog_cil_format_items(
	struct log		*log,
	struct xfs_log_vec	*log_vector)
{
	struct xfs_log_vec	*lv;

	ASSERT(log_vector);
	for (lv = log_vector; lv; lv = lv->lv_next) {
		void	*ptr;
		int	index;
		int	len = 0;

		/* build the vector array and calculate its length */
		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
		for (index = 0; index < lv->lv_niovecs; index++)
			len += lv->lv_iovecp[index].i_len;

		lv->lv_buf_len = len;
		lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
		ptr = lv->lv_buf;

		for (index = 0; index < lv->lv_niovecs; index++) {
			struct xfs_log_iovec *vec = &lv->lv_iovecp[index];

			memcpy(ptr, vec->i_addr, vec->i_len);
			vec->i_addr = ptr;
			ptr += vec->i_len;
		}
		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
	}
}

/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
 * well.
 */
STATIC void
xfs_cil_prepare_item(
	struct log		*log,
	struct xfs_log_vec	*lv,
	int			*len,
	int			*diff_iovecs)
{
	struct xfs_log_vec	*old = lv->lv_item->li_lv;

	if (old) {
		/* existing lv on log item, space used is a delta */
		ASSERT(!list_empty(&lv->lv_item->li_cil));
		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);

		*len += lv->lv_buf_len - old->lv_buf_len;
		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
		kmem_free(old->lv_buf);
		kmem_free(old);
	} else {
		/* new lv, must pin the log item */
		ASSERT(!lv->lv_item->li_lv);
		ASSERT(list_empty(&lv->lv_item->li_cil));

		*len += lv->lv_buf_len;
		*diff_iovecs += lv->lv_niovecs;
		IOP_PIN(lv->lv_item);
	}

	/* attach new log vector to log item */
	lv->lv_item->li_lv = lv;

	/*
	 * If this is the first time the item is being committed to the
	 * CIL, store the sequence number on the log item so we can
	 * tell in future commits whether this is the first checkpoint
	 * the item is being committed into.
	 */
	if (!lv->lv_item->li_seq)
		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}

/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
	struct log		*log,
	struct xfs_log_vec	*log_vector,
	struct xlog_ticket	*ticket)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
	struct xfs_log_vec	*lv;
	int			len = 0;
	int			diff_iovecs = 0;
	int			iclog_space;

	ASSERT(log_vector);

	/*
	 * Do all the accounting aggregation and switching of log vectors
	 * around in a separate loop from the insertion of items into the CIL.
	 * Then we can do a separate loop to update the CIL within a single
	 * lock/unlock pair. This reduces the number of round trips on the CIL
	 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
	 * hold time for the transaction commit.
	 *
	 * If this is the first time the item is being placed into the CIL in
	 * this context, pin it so it can't be written to disk until the CIL is
	 * flushed to the iclog and the iclog written to disk.
	 *
	 * We can do this safely because the context can't checkpoint until we
	 * are done so it doesn't matter exactly how we update the CIL.
	 */
	for (lv = log_vector; lv; lv = lv->lv_next)
		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);

	/* account for space used by new iovec headers  */
	len += diff_iovecs * sizeof(xlog_op_header_t);

	spin_lock(&cil->xc_cil_lock);

	/* move the items to the tail of the CIL */
	for (lv = log_vector; lv; lv = lv->lv_next)
		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);

	ctx->nvecs += diff_iovecs;

	/*
	 * Now transfer enough transaction reservation to the context ticket
	 * for the checkpoint. The context ticket is special - the unit
	 * reservation has to grow as well as the current reservation as we
	 * steal from tickets so we can correctly determine the space used
	 * during the transaction commit.
	 */
	if (ctx->ticket->t_curr_res == 0) {
		/* first commit in checkpoint, steal the header reservation */
		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
		ticket->t_curr_res -= ctx->ticket->t_unit_res;
	}

	/* do we need space for more log record headers? */
	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
	if (len > 0 && (ctx->space_used / iclog_space !=
				(ctx->space_used + len) / iclog_space)) {
		int hdrs;

		hdrs = (len + iclog_space - 1) / iclog_space;
		/* need to take into account split region headers, too */
		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
		ctx->ticket->t_unit_res += hdrs;
		ctx->ticket->t_curr_res += hdrs;
		ticket->t_curr_res -= hdrs;
		ASSERT(ticket->t_curr_res >= len);
	}
	ticket->t_curr_res -= len;
	ctx->space_used += len;

	spin_unlock(&cil->xc_cil_lock);
}

static void
xlog_cil_free_logvec(
	struct xfs_log_vec	*log_vector)
{
	struct xfs_log_vec	*lv;

	for (lv = log_vector; lv; ) {
		struct xfs_log_vec *next = lv->lv_next;
		kmem_free(lv->lv_buf);
		kmem_free(lv);
		lv = next;
	}
}

/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
	void	*args,
	int	abort)
{
	struct xfs_cil_ctx	*ctx = args;
	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;

	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
					ctx->start_lsn, abort);

	xfs_alloc_busy_sort(&ctx->busy_extents);
	xfs_alloc_busy_clear(mp, &ctx->busy_extents,
			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);

	spin_lock(&ctx->cil->xc_cil_lock);
	list_del(&ctx->committing);
	spin_unlock(&ctx->cil->xc_cil_lock);

	xlog_cil_free_logvec(ctx->lv_chain);

	if (!list_empty(&ctx->busy_extents)) {
		ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);

		xfs_discard_extents(mp, &ctx->busy_extents);
		xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
	}

	kmem_free(ctx);
}
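
/*
 * Illustrative note (not part of the original source): a worked example of
 * the log record header accounting done in xlog_cil_insert_items() above.
 * The numbers below are assumptions chosen only to show the arithmetic.
 *
 * Assume l_iclog_size = 32768 bytes and l_iclog_hsize = 512 bytes, so each
 * iclog carries iclog_space = 32768 - 512 = 32256 bytes of payload.  If a
 * commit adds len = 40000 bytes and that pushes ctx->space_used across an
 * iclog boundary, then:
 *
 *	hdrs = (40000 + 32256 - 1) / 32256 = 2
 *	hdrs *= l_iclog_hsize + sizeof(struct xlog_op_header)
 *
 * i.e. two extra record headers plus two split-region op headers are
 * reserved, and that space is stolen from the committing transaction's
 * ticket rather than from the checkpoint ticket's own (zero) reservation.
 */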
/*
 * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#ifndef __XFS_ATTR_LEAF_H__
#define	__XFS_ATTR_LEAF_H__

/*
 * Attribute storage layout, internal structure, access macros, etc.
 *
 * Attribute lists are structured around Btrees where all the data
 * elements are in the leaf nodes.  Attribute names are hashed into an int,
 * then that int is used as the index into the Btree.  Since the hashval
 * of an attribute name may not be unique, we may have duplicate keys.  The
 * internal links in the Btree are logical block offsets into the file.
 */

struct attrlist;
struct attrlist_cursor_kern;
struct xfs_attr_list_context;
struct xfs_dabuf;
struct xfs_da_args;
struct xfs_da_state;
struct xfs_da_state_blk;
struct xfs_inode;
struct xfs_trans;

/*========================================================================
 * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
 *========================================================================*/

/*
 * This is the structure of the leaf nodes in the Btree.
 *
 * Struct leaf_entry's are packed from the top.  Name/values grow from the
 * bottom but are not packed.  The freemap contains run-length-encoded entries
 * for the free bytes after the leaf_entry's, but only the N largest such;
 * smaller runs are dropped.  When the freemap doesn't show enough space
 * for an allocation, we compact the name/value area and try again.  If we
 * still don't have enough space, then we have to split the block.  The
 * name/value structs (both local and remote versions) must be 32bit aligned.
 *
 * Since we have duplicate hash keys, for each key that matches, compare
 * the actual name string.  The root and intermediate node search always
 * takes the first-in-the-block key match found, so we should only have
 * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
 * nodes until the hash key changes or the attribute name is found.
 *
 * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
 * the leaf_entry.  The namespaces are independent only because we also look
 * at the namespace bit when we are looking for a matching attribute name.
 *
 * We also store an "incomplete" bit in the leaf_entry.  It shows that an
 * attribute is in the middle of being created and should not be shown to
 * the user if we crash during the time that the bit is set.  We clear the
 * bit when we have finished setting up the attribute.  We do this because
 * we cannot create some large attributes inside a single transaction, and we
 * need some indication that we weren't finished if we crash in the middle.
 */
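
/*
 * Illustrative sketch (added for clarity; not an authoritative statement of
 * the on-disk format): the layout described above, for a single leaf block:
 *
 *	+--------------------------------------------+
 *	| xfs_attr_leaf_hdr (info, count, freemap)   |
 *	+--------------------------------------------+
 *	| entries[0..count-1]  (packed, grow down)   |
 *	+--------------------------------------------+
 *	| free space (tracked by freemap / holes)    |
 *	+--------------------------------------------+
 *	| name/value structs (unpacked, grow up from |
 *	| the end of the block; firstused marks the  |
 *	| lowest byte in use)                        |
 *	+--------------------------------------------+
 */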
#define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */

typedef struct xfs_attr_leaf_map {	/* RLE map of free bytes */
	__be16	base;			  /* base of free region */
	__be16	size;			  /* length of free region */
} xfs_attr_leaf_map_t;

typedef struct xfs_attr_leaf_hdr {	/* constant-structure header block */
	xfs_da_blkinfo_t info;		/* block type, links, etc. */
	__be16	count;			/* count of active leaf_entry's */
	__be16	usedbytes;		/* num bytes of names/values stored */
	__be16	firstused;		/* first used byte in name area */
	__u8	holes;			/* != 0 if blk needs compaction */
	__u8	pad1;
	xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
					/* N largest free regions */
} xfs_attr_leaf_hdr_t;

typedef struct xfs_attr_leaf_entry {	/* sorted on key, not name */
	__be32	hashval;		/* hash value of name */
	__be16	nameidx;		/* index into buffer of name/value */
	__u8	flags;			/* LOCAL/ROOT/SECURE/INCOMPLETE flag */
	__u8	pad2;			/* unused pad byte */
} xfs_attr_leaf_entry_t;

typedef struct xfs_attr_leaf_name_local {
	__be16	valuelen;		/* number of bytes in value */
	__u8	namelen;		/* length of name bytes */
	__u8	nameval[1];		/* name/value bytes */
} xfs_attr_leaf_name_local_t;

typedef struct xfs_attr_leaf_name_remote {
	__be32	valueblk;		/* block number of value bytes */
	__be32	valuelen;		/* number of bytes in value */
	__u8	namelen;		/* length of name bytes */
	__u8	name[1];		/* name bytes */
} xfs_attr_leaf_name_remote_t;

typedef struct xfs_attr_leafblock {
	xfs_attr_leaf_hdr_t	hdr;	/* constant-structure header block */
	xfs_attr_leaf_entry_t	entries[1];	/* sorted on key, not name */
	xfs_attr_leaf_name_local_t namelist;	/* grows from bottom of buf */
	xfs_attr_leaf_name_remote_t valuelist;	/* grows from bottom of buf */
} xfs_attr_leafblock_t;

/*
 * Flags used in the leaf_entry[i].flags field.
 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
 * on the system call; they are "or"ed together for various operations.
 */
#define	XFS_ATTR_LOCAL_BIT	0	/* attr is stored locally */
#define	XFS_ATTR_ROOT_BIT	1	/* limit access to trusted attrs */
#define	XFS_ATTR_SECURE_BIT	2	/* limit access to secure attrs */
#define	XFS_ATTR_INCOMPLETE_BIT	7	/* attr in middle of create/delete */
#define XFS_ATTR_LOCAL		(1 << XFS_ATTR_LOCAL_BIT)
#define XFS_ATTR_ROOT		(1 << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE		(1 << XFS_ATTR_SECURE_BIT)
#define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)

/*
 * Conversion macros for converting namespace bits from argument flags
 * to ondisk flags.
 */
#define XFS_ATTR_NSP_ARGS_MASK		(ATTR_ROOT | ATTR_SECURE)
#define XFS_ATTR_NSP_ONDISK_MASK	(XFS_ATTR_ROOT | XFS_ATTR_SECURE)
#define XFS_ATTR_NSP_ONDISK(flags)	((flags) & XFS_ATTR_NSP_ONDISK_MASK)
#define XFS_ATTR_NSP_ARGS(flags)	((flags) & XFS_ATTR_NSP_ARGS_MASK)
#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x)	(((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
					 ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x)	(((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
					 ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
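
/*
 * Example (illustrative only): an attribute set with ATTR_SECURE in its
 * argument flags maps to the XFS_ATTR_SECURE on-disk namespace bit, i.e.
 *
 *	XFS_ATTR_NSP_ARGS_TO_ONDISK(ATTR_SECURE) == XFS_ATTR_SECURE
 *
 * and XFS_ATTR_NSP_ONDISK_TO_ARGS() performs the reverse mapping when an
 * on-disk entry is reported back through the syscall interface.
 */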

/*
 * Alignment for namelist and valuelist entries (since they are mixed,
 * there can be only one alignment value)
 */
#define	XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))

/*
 * Cast typed pointers for "local" and "remote" name/value structs.
 */
static inline xfs_attr_leaf_name_remote_t *
xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
{
	return (xfs_attr_leaf_name_remote_t *)
		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
}

static inline xfs_attr_leaf_name_local_t *
xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
{
	return (xfs_attr_leaf_name_local_t *)
		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
}

static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
{
	return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
}
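
/*
 * Usage sketch (illustrative only, not part of the original header): given a
 * leaf block and an entry index, the helpers above return pointers into the
 * name/value area.  For a locally stored attribute the value bytes follow
 * the name bytes in nameval[]:
 *
 *	xfs_attr_leaf_entry_t *entry = &leafp->entries[idx];
 *
 *	if (entry->flags & XFS_ATTR_LOCAL) {
 *		xfs_attr_leaf_name_local_t *nl =
 *			xfs_attr_leaf_name_local(leafp, idx);
 *		char *value = (char *)&nl->nameval[nl->namelen];
 *		int valuelen = be16_to_cpu(nl->valuelen);
 *		...
 *	} else {
 *		xfs_attr_leaf_name_remote_t *nr =
 *			xfs_attr_leaf_name_remote(leafp, idx);
 *		xfs_dablk_t vblk = be32_to_cpu(nr->valueblk);
 *		...
 *	}
 */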

/*
 * Calculate total bytes used (including trailing pad for alignment) for
 * a "local" name/value structure, a "remote" name/value structure, and
 * a pointer which might be either.
 */
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
	return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) +
		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
}

static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
	return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
}

static inline int xfs_attr_leaf_entsize_local_max(int bsize)
{
	return (((bsize) >> 1) + ((bsize) >> 2));
}
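
/*
 * Worked example (illustrative, assuming a 4-byte xfs_dablk_t so that
 * XFS_ATTR_LEAF_NAME_ALIGN == 4): a local attribute with a 4-byte name and
 * a 10-byte value needs
 *
 *	sizeof(xfs_attr_leaf_name_local_t) - 1 + 4 + 10 = 4 - 1 + 4 + 10 = 17
 *
 * bytes, which xfs_attr_leaf_entsize_local() rounds up to 20 bytes of entry
 * space.  With a 4096-byte block, xfs_attr_leaf_entsize_local_max() caps a
 * single local entry at 2048 + 1024 = 3072 bytes; anything larger has to be
 * stored as a remote value.
 */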

/*
 * Used to keep a list of "remote value" extents when unlinking an inode.
 */
typedef struct xfs_attr_inactive_list {
	xfs_dablk_t	valueblk;	/* block number of value bytes */
	int		valuelen;	/* number of bytes in value */
} xfs_attr_inactive_list_t;


/*========================================================================
 * Function prototypes for the kernel.
 *========================================================================*/

/*
 * Internal routines when attribute fork size < XFS_LITINO(mp).
 */
void	xfs_attr_shortform_create(struct xfs_da_args *args);
void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int	xfs_attr_shortform_remove(struct xfs_da_args *args);
int	xfs_attr_shortform_list(struct xfs_attr_list_context *context);
int	xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
int	xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);


/*
 * Internal routines when attribute fork size == XFS_LBSIZE(mp).
 */
int	xfs_attr_leaf_to_node(struct xfs_da_args *args);
int	xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
				   struct xfs_da_args *args, int forkoff);
int	xfs_attr_leaf_clearflag(struct xfs_da_args *args);
int	xfs_attr_leaf_setflag(struct xfs_da_args *args);
int	xfs_attr_leaf_flipflags(xfs_da_args_t *args);

/*
 * Routines used for growing the Btree.
 */
int	xfs_attr_leaf_split(struct xfs_da_state *state,
				   struct xfs_da_state_blk *oldblk,
				   struct xfs_da_state_blk *newblk);
int	xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
					struct xfs_da_args *args);
int	xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
int	xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
				 struct xfs_da_args *args);
int	xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
				    struct xfs_da_args *args);
int	xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
				      struct xfs_attr_list_context *context);

/*
 * Routines used for shrinking the Btree.
 */
int	xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
void	xfs_attr_leaf_unbalance(struct xfs_da_state *state,
				       struct xfs_da_state_blk *drop_blk,
				       struct xfs_da_state_blk *save_blk);
int	xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);

/*
 * Utility routines.
 */
xfs_dahash_t	xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
int	xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
				   struct xfs_dabuf *leaf2_bp);
int	xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
					int *local);
#endif	/* __XFS_ATTR_LEAF_H__ */