/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"

/*
 * Perform initial CIL structure initialisation. If the CIL is not
 * enabled in this filesystem, ensure the log->l_cilp is null so
 * we can check this conditional to determine if we are doing delayed
 * logging or not.
 */
int
xlog_cil_init(
	struct log		*log)
{
	struct xfs_cil		*cil;
	struct xfs_cil_ctx	*ctx;

	log->l_cilp = NULL;
	if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
		return 0;

	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
	if (!cil)
		return ENOMEM;

	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
	if (!ctx) {
		kmem_free(cil);
		return ENOMEM;
	}

	INIT_LIST_HEAD(&cil->xc_cil);
	INIT_LIST_HEAD(&cil->xc_committing);
	spin_lock_init(&cil->xc_cil_lock);
	init_rwsem(&cil->xc_ctx_lock);
	init_waitqueue_head(&cil->xc_commit_wait);

	INIT_LIST_HEAD(&ctx->committing);
	INIT_LIST_HEAD(&ctx->busy_extents);
	ctx->sequence = 1;
	ctx->cil = cil;
	cil->xc_ctx = ctx;
	cil->xc_current_sequence = ctx->sequence;

	cil->xc_log = log;
	log->l_cilp = cil;
	return 0;
}

void
xlog_cil_destroy(
	struct log	*log)
{
	if (!log->l_cilp)
		return;

	if (log->l_cilp->xc_ctx) {
		if (log->l_cilp->xc_ctx->ticket)
			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
		kmem_free(log->l_cilp->xc_ctx);
	}

	ASSERT(list_empty(&log->l_cilp->xc_cil));
	kmem_free(log->l_cilp);
}

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
	struct log	*log)
{
	struct xlog_ticket *tic;

	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
				KM_SLEEP|KM_NOFS);
	tic->t_trans_type = XFS_TRANS_CHECKPOINT;

	/*
	 * set the current reservation to zero so we know to steal the basic
	 * transaction overhead reservation from the first transaction commit.
	 */
	tic->t_curr_res = 0;
	return tic;
}

/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push. This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
	struct log	*log)
{
	if (!log->l_cilp)
		return;

	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
	log->l_cilp->xc_ctx->sequence = 1;
	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
							  log->l_curr_block);
}

/*
 * Format log items into flat buffers.
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and formats the vector for the item into the buffer.
 * The buffer is then attached to the log item, which is then inserted into the
 * Committed Item List for tracking until the next checkpoint is written out.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer. Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundaries without needing a change in the format of
 * the item/region encapsulation.
 *
 * Hence what we need to do now is rewrite the vector array to point
 * to the copied region inside the buffer we just allocated. This allows us to
 * format the regions into the iclog as though they are being formatted
 * directly out of the objects themselves.
 */
static void
xlog_cil_format_items(
	struct log		*log,
	struct xfs_log_vec	*log_vector)
{
	struct xfs_log_vec *lv;

	ASSERT(log_vector);
	for (lv = log_vector; lv; lv = lv->lv_next) {
		void	*ptr;
		int	index;
		int	len = 0;

		/* build the vector array and calculate its length */
		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
		for (index = 0; index < lv->lv_niovecs; index++)
			len += lv->lv_iovecp[index].i_len;

		lv->lv_buf_len = len;
		lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
		ptr = lv->lv_buf;

		for (index = 0; index < lv->lv_niovecs; index++) {
			struct xfs_log_iovec *vec = &lv->lv_iovecp[index];

			memcpy(ptr, vec->i_addr, vec->i_len);
			vec->i_addr = ptr;
			ptr += vec->i_len;
		}
		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
	}
}

/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
 * well.
 */
STATIC void
xfs_cil_prepare_item(
	struct log		*log,
	struct xfs_log_vec	*lv,
	int			*len,
	int			*diff_iovecs)
{
	struct xfs_log_vec	*old = lv->lv_item->li_lv;

	if (old) {
		/* existing lv on log item, space used is a delta */
		ASSERT(!list_empty(&lv->lv_item->li_cil));
		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);

		*len += lv->lv_buf_len - old->lv_buf_len;
		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
		kmem_free(old->lv_buf);
		kmem_free(old);
	} else {
		/* new lv, must pin the log item */
		ASSERT(!lv->lv_item->li_lv);
		ASSERT(list_empty(&lv->lv_item->li_cil));

		*len += lv->lv_buf_len;
		*diff_iovecs += lv->lv_niovecs;
		IOP_PIN(lv->lv_item);
	}

	/* attach new log vector to log item */
	lv->lv_item->li_lv = lv;

	/*
	 * If this is the first time the item is being committed to the
	 * CIL, store the sequence number on the log item so we can
	 * tell in future commits whether this is the first checkpoint
	 * the item is being committed into.
	 */
	if (!lv->lv_item->li_seq)
		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}

/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
	struct log		*log,
	struct xfs_log_vec	*log_vector,
	struct xlog_ticket	*ticket)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
	struct xfs_log_vec	*lv;
	int			len = 0;
	int			diff_iovecs = 0;
	int			iclog_space;

	ASSERT(log_vector);

	/*
	 * Do all the accounting aggregation and switching of log vectors
	 * around in a separate loop to the insertion of items into the CIL.
	 * Then we can do a separate loop to update the CIL within a single
	 * lock/unlock pair. This reduces the number of round trips on the CIL
	 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
	 * hold time for the transaction commit.
	 *
	 * If this is the first time the item is being placed into the CIL in
	 * this context, pin it so it can't be written to disk until the CIL is
	 * flushed to the iclog and the iclog written to disk.
	 *
	 * We can do this safely because the context can't checkpoint until we
	 * are done so it doesn't matter exactly how we update the CIL.
	 */
	for (lv = log_vector; lv; lv = lv->lv_next)
		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);

	/* account for space used by new iovec headers */
	len += diff_iovecs * sizeof(xlog_op_header_t);

	spin_lock(&cil->xc_cil_lock);

	/* move the items to the tail of the CIL */
	for (lv = log_vector; lv; lv = lv->lv_next)
		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);

	ctx->nvecs += diff_iovecs;

	/*
	 * Now transfer enough transaction reservation to the context ticket
	 * for the checkpoint. The context ticket is special - the unit
	 * reservation has to grow as well as the current reservation as we
	 * steal from tickets so we can correctly determine the space used
	 * during the transaction commit.
	 */
	if (ctx->ticket->t_curr_res == 0) {
		/* first commit in checkpoint, steal the header reservation */
		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
		ticket->t_curr_res -= ctx->ticket->t_unit_res;
	}

	/* do we need space for more log record headers? */
	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
	if (len > 0 && (ctx->space_used / iclog_space !=
				(ctx->space_used + len) / iclog_space)) {
		int hdrs;

		hdrs = (len + iclog_space - 1) / iclog_space;
		/* need to take into account split region headers, too */
		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
		ctx->ticket->t_unit_res += hdrs;
		ctx->ticket->t_curr_res += hdrs;
		ticket->t_curr_res -= hdrs;
		ASSERT(ticket->t_curr_res >= len);
	}
	ticket->t_curr_res -= len;
	ctx->space_used += len;

	spin_unlock(&cil->xc_cil_lock);
}

static void
xlog_cil_free_logvec(
	struct xfs_log_vec	*log_vector)
{
	struct xfs_log_vec	*lv;

	for (lv = log_vector; lv; ) {
		struct xfs_log_vec *next = lv->lv_next;
		kmem_free(lv->lv_buf);
		kmem_free(lv);
		lv = next;
	}
}

/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
	void	*args,
	int	abort)
{
	struct xfs_cil_ctx	*ctx = args;
	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;

	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
					ctx->start_lsn, abort);

	xfs_alloc_busy_sort(&ctx->busy_extents);
	xfs_alloc_busy_clear(mp, &ctx->busy_extents,
			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);

	spin_lock(&ctx->cil->xc_cil_lock);
	list_del(&ctx->committing);
	spin_unlock(&ctx->cil->xc_cil_lock);

	xlog_cil_free_logvec(ctx->lv_chain);

	if (!list_empty(&ctx->busy_extents)) {
		ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);

		xfs_discard_extents(mp, &ctx->busy_extents);
		xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
	}

	kmem_free(ctx);
}
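/*
 * Editor's illustration of the log record header arithmetic in
 * xlog_cil_insert_items() above, written as a standalone userspace sketch
 * with hypothetical values (a 32k iclog with a 512 byte header and a 12
 * byte op header; the real sizes come from the log structure). When the
 * new data pushes the checkpoint across an iclog boundary, one extra log
 * record header plus one split-region op header must be reserved per
 * boundary crossed.
 */
#include <stdio.h>

int
main(void)
{
	int iclog_size = 32768;		/* hypothetical iclog buffer size */
	int iclog_hsize = 512;		/* hypothetical log record header */
	int op_hdr = 12;		/* hypothetical op header size */
	int iclog_space = iclog_size - iclog_hsize;
	int space_used = 30000;		/* bytes already in this checkpoint */
	int len = 5000;			/* bytes this commit adds */

	/* does adding len cross into a new iclog? */
	if (len > 0 && (space_used / iclog_space !=
			(space_used + len) / iclog_space)) {
		/* one header per iclog boundary the new data can cross */
		int hdrs = (len + iclog_space - 1) / iclog_space;

		hdrs *= iclog_hsize + op_hdr;
		printf("reserve %d extra bytes for record headers\n", hdrs);
	}
	return 0;
}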
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef __XFS_ATTR_LEAF_H__
#define __XFS_ATTR_LEAF_H__
/*
* Attribute storage layout, internal structure, access macros, etc.
*
* Attribute lists are structured around Btrees where all the data
* elements are in the leaf nodes. Attribute names are hashed into an int,
* then that int is used as the index into the Btree. Since the hashval
* of an attribute name may not be unique, we may have duplicate keys. The
* internal links in the Btree are logical block offsets into the file.
*/
struct attrlist;
struct attrlist_cursor_kern;
struct xfs_attr_list_context;
struct xfs_dabuf;
struct xfs_da_args;
struct xfs_da_state;
struct xfs_da_state_blk;
struct xfs_inode;
struct xfs_trans;
/*========================================================================
* Attribute structure when equal to XFS_LBSIZE(mp) bytes.
*========================================================================*/
/*
* This is the structure of the leaf nodes in the Btree.
*
* Struct leaf_entry's are packed from the top. Name/values grow from the
* bottom but are not packed. The freemap contains run-length-encoded entries
* for the free bytes after the leaf_entry's, but only the N largest such;
* smaller runs are dropped. When the freemap doesn't show enough space
* for an allocation, we compact the name/value area and try again. If we
* still don't have enough space, then we have to split the block. The
* name/value structs (both local and remote versions) must be 32-bit
* aligned; a sketch of this layout follows the structure definitions below.
*
* Since we have duplicate hash keys, for each key that matches, compare
* the actual name string. The root and intermediate node search always
* takes the first-in-the-block key match found, so we should only have
* to work "forw"ard. If none matches, continue with the "forw"ard leaf
* nodes until the hash key changes or the attribute name is found.
*
* We store the fact that an attribute is a ROOT/USER/SECURE attribute in
* the leaf_entry. The namespaces are independent only because we also look
* at the namespace bit when we are looking for a matching attribute name.
*
* We also store an "incomplete" bit in the leaf_entry. It shows that an
* attribute is in the middle of being created and should not be shown to
* the user if we crash during the time that the bit is set. We clear the
* bit when we have finished setting up the attribute. We do this because
* we cannot create some large attributes inside a single transaction, and we
* need some indication that we weren't finished if we crash in the middle.
*/
#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
__be16 base; /* base of free region */
__be16 size; /* length of free region */
} xfs_attr_leaf_map_t;
typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
xfs_da_blkinfo_t info; /* block type, links, etc. */
__be16 count; /* count of active leaf_entry's */
__be16 usedbytes; /* num bytes of names/values stored */
__be16 firstused; /* first used byte in name area */
__u8 holes; /* != 0 if blk needs compaction */
__u8 pad1;
xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
/* N largest free regions */
} xfs_attr_leaf_hdr_t;
typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
__be32 hashval; /* hash value of name */
__be16 nameidx; /* index into buffer of name/value */
__u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
__u8 pad2; /* unused pad byte */
} xfs_attr_leaf_entry_t;
typedef struct xfs_attr_leaf_name_local {
__be16 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
__u8 nameval[1]; /* name/value bytes */
} xfs_attr_leaf_name_local_t;
typedef struct xfs_attr_leaf_name_remote {
__be32 valueblk; /* block number of value bytes */
__be32 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
__u8 name[1]; /* name bytes */
} xfs_attr_leaf_name_remote_t;
typedef struct xfs_attr_leafblock {
xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
} xfs_attr_leafblock_t;
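/*
 * Editor's sketch of the leaf block layout described above (illustrative,
 * not part of the original header):
 *
 *	+-----------------------------------+  offset 0
 *	| xfs_attr_leaf_hdr                 |
 *	|   (block info, count, freemap[3]) |
 *	+-----------------------------------+
 *	| entries[]: packed, sorted by      |
 *	|   hashval, grow downward          |
 *	+-----------------------------------+
 *	|            free space             |
 *	+-----------------------------------+
 *	| name/value structs: unpacked,     |
 *	|   32-bit aligned, grow upward     |
 *	|   from the end of the block       |
 *	+-----------------------------------+  XFS_LBSIZE(mp)
 */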
/*
* Flags used in the leaf_entry[i].flags field.
* NOTE: the INCOMPLETE bit must not collide with the flags bits specified
* on the system call; they are "or"ed together for various operations.
*/
#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
/*
* Conversion macros for converting namespace bits from argument flags
* to ondisk flags.
*/
#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
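/*
 * Usage sketch (editor's illustration): when a new leaf entry is
 * initialised, the namespace bits from the caller's argument flags are
 * translated to their on-disk form ("args" and "entry" are hypothetical
 * names here):
 *
 *	entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
 *
 * so ATTR_ROOT maps to XFS_ATTR_ROOT and ATTR_SECURE to XFS_ATTR_SECURE.
 */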
/*
* Alignment for namelist and valuelist entries (since they are mixed,
* there can be only one alignment value)
*/
#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
/*
* Cast typed pointers for "local" and "remote" name/value structs.
*/
static inline xfs_attr_leaf_name_remote_t *
xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
{
return (xfs_attr_leaf_name_remote_t *)
&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
}
static inline xfs_attr_leaf_name_local_t *
xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
{
return (xfs_attr_leaf_name_local_t *)
&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
}
static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
{
return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
}
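/*
 * Usage sketch (editor's illustration, not part of the original header):
 * given an entry index whose hashval already matched, compare the actual
 * name bytes, honouring the LOCAL flag to pick the right name structure.
 * The helper name is hypothetical, memcmp() is assumed available (e.g.
 * via linux/string.h), and the function exists only to demonstrate the
 * accessors above.
 */
static inline int
xfs_attr_leaf_name_matches(xfs_attr_leafblock_t *leafp, int idx,
			   const char *name, int namelen)
{
	if (leafp->entries[idx].flags & XFS_ATTR_LOCAL) {
		xfs_attr_leaf_name_local_t *nl =
			xfs_attr_leaf_name_local(leafp, idx);
		return nl->namelen == namelen &&
			memcmp(nl->nameval, name, namelen) == 0;
	} else {
		xfs_attr_leaf_name_remote_t *nr =
			xfs_attr_leaf_name_remote(leafp, idx);
		return nr->namelen == namelen &&
			memcmp(nr->name, name, namelen) == 0;
	}
}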
/*
* Calculate total bytes used (including trailing pad for alignment) for
* a "local" name/value structure, a "remote" name/value structure, and
* a pointer which might be either.
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
{
return (((bsize) >> 1) + ((bsize) >> 2));
}
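/*
 * Worked example (editor's illustration): with a 4 byte
 * XFS_ATTR_LEAF_NAME_ALIGN (assuming a 32-bit xfs_dablk_t) and
 * sizeof(xfs_attr_leaf_name_local_t) == 4, a local attribute with a
 * 4 byte name and a 10 byte value takes
 *
 *	(4 - 1 + 4 + 10 + 3) & ~3 = 20 bytes
 *
 * i.e. 17 bytes of header, name, and value rounded up to the next 4 byte
 * boundary. The local-max helper above caps a single local entry at
 * three quarters of the block size ((bsize >> 1) + (bsize >> 2)).
 */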
/*
* Used to keep a list of "remote value" extents when unlinking an inode.
*/
typedef struct xfs_attr_inactive_list {
xfs_dablk_t valueblk; /* block number of value bytes */
int valuelen; /* number of bytes in value */
} xfs_attr_inactive_list_t;
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
/*
* Internal routines when attribute fork size < XFS_LITINO(mp).
*/
void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
/*
* Internal routines when attribute fork size == XFS_LBSIZE(mp).
*/
int xfs_attr_leaf_to_node(struct xfs_da_args *args);
int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
struct xfs_da_args *args, int forkoff);
int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
int xfs_attr_leaf_setflag(struct xfs_da_args *args);
int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
/*
* Routines used for growing the Btree.
*/
int xfs_attr_leaf_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk,
struct xfs_da_state_blk *newblk);
int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
struct xfs_da_args *args);
int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
struct xfs_da_args *args);
int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
struct xfs_da_args *args);
int xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
struct xfs_attr_list_context *context);
/*
* Routines used for shrinking the Btree.
*/
int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
struct xfs_da_state_blk *drop_blk,
struct xfs_da_state_blk *save_blk);
int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
/*
* Utility routines.
*/
xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
struct xfs_dabuf *leaf2_bp);
int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
int *local);
#endif /* __XFS_ATTR_LEAF_H__ */