X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fxfs%2Fxfs_log.c;h=0b02c6443551f78b24af282cad994c5b07a2523f;hb=f5eb8e7ca58bc1e92436614444006120d21668ba;hp=4a6f7c5d145933c2781bf18d7971368545f3bcc7;hpb=6b1d1a732f886936fe515d911b1a01d9cc50e179;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4a6f7c5..0b02c64 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE 2048 +#define XLOG_TRACE_ICLOG_SIZE 256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ + ktrace_free(log->l_grant_trace); +} + void xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) { unsigned long cnts; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } /* ticket counts are 1 byte each */ cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) } void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void +xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ + ktrace_free(iclog->ic_trace); +} + +void xlog_trace_iclog(xlog_in_core_t *iclog, uint state) { - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_SLEEP); ktrace_enter(iclog->ic_trace, (void *)((unsigned long)state), (void *)((unsigned long)current_pid()), @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) (void *)NULL, (void *)NULL); } #else + +#define xlog_trace_loggrant_alloc(log) +#define xlog_trace_loggrant_dealloc(log) #define xlog_trace_loggrant(log,tic,string) + +#define xlog_trace_iclog_alloc(iclog) +#define xlog_trace_iclog_dealloc(iclog) #define xlog_trace_iclog(iclog,state) + #endif /* XFS_LOG_TRACE */ @@ -226,20 +254,24 @@ xlog_grant_sub_space(struct log *log, int bytes) static void xlog_grant_add_space_write(struct log *log, int bytes) { - log->l_grant_write_bytes += bytes; - if (log->l_grant_write_bytes > log->l_logsize) { - log->l_grant_write_bytes -= log->l_logsize; + int tmp = log->l_logsize - log->l_grant_write_bytes; + if (tmp > bytes) + log->l_grant_write_bytes += bytes; + else { log->l_grant_write_cycle++; + log->l_grant_write_bytes = bytes - tmp; } } static void xlog_grant_add_space_reserve(struct log *log, int bytes) { - log->l_grant_reserve_bytes += bytes; - if (log->l_grant_reserve_bytes > log->l_logsize) { - log->l_grant_reserve_bytes -= log->l_logsize; + int tmp = log->l_logsize - log->l_grant_reserve_bytes; + if (tmp > bytes) + log->l_grant_reserve_bytes += bytes; + else { log->l_grant_reserve_cycle++; + log->l_grant_reserve_bytes = bytes - tmp; } } @@ -332,15 +364,12 @@ xfs_log_done(xfs_mount_t *mp, } else { xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); xlog_regrant_reserve_log_space(log, ticket); - } - - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) && - (flags & XFS_LOG_REL_PERM_RESERV) == 0) + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ ticket->t_flags |= XLOG_TIC_INITED; + } return lsn; } /* xfs_log_done */ @@ -353,11 +382,11 @@ xfs_log_done(xfs_mount_t *mp, * Asynchronous forces are implemented by setting the WANT_SYNC * bit in the appropriate in-core log and then returning. * - * Synchronous forces are implemented with a semaphore. All callers - * to force a given lsn to disk will wait on a semaphore attached to the + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the * specific in-core log. When given in-core log finally completes its * write to disk, that thread will wake up all threads waiting on the - * semaphore. + * sv. */ int _xfs_log_force( @@ -382,7 +411,27 @@ _xfs_log_force( return xlog_state_sync_all(log, flags, log_flushed); else return xlog_state_sync(log, lsn, flags, log_flushed); -} /* xfs_log_force */ +} /* _xfs_log_force */ + +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + error = _xfs_log_force(mp, lsn, flags, NULL); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " + "error %d returned.", error); + } +} + /* * Attaches a new iclog I/O completion callback routine during @@ -564,12 +613,12 @@ error: * mp - ubiquitous xfs mount point structure */ int -xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) +xfs_log_mount_finish(xfs_mount_t *mp) { int error; if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - error = xlog_recover_finish(mp->m_log, mfsi_flags); + error = xlog_recover_finish(mp->m_log); else { error = 0; ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); @@ -634,7 +683,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; - xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC); + error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG first_iclog = iclog = log->l_iclog; @@ -676,13 +726,13 @@ xfs_log_unmount_write(xfs_mount_t *mp) atomic_inc(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog); spin_lock(&log->l_icloglock); if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - sv_wait(&iclog->ic_forcesema, PMEM, + sv_wait(&iclog->ic_force_wait, PMEM, &log->l_icloglock, s); } else { spin_unlock(&log->l_icloglock); @@ -715,7 +765,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog); spin_lock(&log->l_icloglock); @@ -723,14 +773,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - sv_wait(&iclog->ic_forcesema, PMEM, + sv_wait(&iclog->ic_force_wait, PMEM, &log->l_icloglock, s); } else { spin_unlock(&log->l_icloglock); } } - return 0; + return error; } /* xfs_log_unmount_write */ /* @@ -813,7 +863,7 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= tic->t_unit_res; - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_write_headq); } @@ -834,7 +884,7 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= need_bytes; - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_reserve_headq); } @@ -983,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp) l = iclog->ic_log; /* - * If the ordered flag has been removed by a lower - * layer, it means the underlyin device no longer supports + * If the _XFS_BARRIER_FAILED flag was set by a lower + * layer, it means the underlying device no longer supports * barrier I/O. Warn loudly and turn off barriers. */ - if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { + if (bp->b_flags & _XFS_BARRIER_FAILED) { + bp->b_flags &= ~_XFS_BARRIER_FAILED; l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; xfs_fs_cmn_err(CE_WARN, l->l_mp, "xlog_iodone: Barriers are no longer supported" @@ -1207,8 +1258,9 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_icloglock); spin_lock_init(&log->l_grant_lock); - initnsema(&log->l_flushsema, 0, "ic-flush"); + sv_init(&log->l_flush_wait, 0, "flush_wait"); + xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1260,8 +1312,10 @@ xlog_alloc_log(xfs_mount_t *mp, ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); - sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); + sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); + sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + + xlog_trace_iclog_alloc(iclog); iclogp = &iclog->ic_next; } @@ -1540,33 +1594,21 @@ xlog_dealloc_log(xlog_t *log) iclog = log->l_iclog; for (i=0; il_iclog_bufs; i++) { - sv_destroy(&iclog->ic_forcesema); - sv_destroy(&iclog->ic_writesema); + sv_destroy(&iclog->ic_force_wait); + sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif + xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; - kmem_free(iclog, sizeof(xlog_in_core_t)); + kmem_free(iclog); iclog = next_iclog; } - freesema(&log->l_flushsema); spinlock_destroy(&log->l_icloglock); spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif + xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; - kmem_free(log, sizeof(xlog_t)); + kmem_free(log); } /* xlog_dealloc_log */ /* @@ -1952,7 +1994,7 @@ xlog_write(xfs_mount_t * mp, /* Clean iclogs starting from the head. This ordering must be * maintained, so an iclog doesn't become ACTIVE beyond one that * is SYNCING. This is also required to maintain the notion that we use - * a counting semaphore to hold off would be writers to the log when every + * a ordered wait queue to hold off would be writers to the log when every * iclog is trying to sync to disk. * * State Change: DIRTY -> ACTIVE @@ -2076,6 +2118,7 @@ xlog_state_do_callback( int funcdidcallbacks; /* flag: function did callbacks */ int repeats; /* for issuing console warnings if * looping too many times */ + int wake = 0; spin_lock(&log->l_icloglock); first_iclog = iclog = log->l_iclog; @@ -2215,7 +2258,7 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - sv_broadcast(&iclog->ic_forcesema); + sv_broadcast(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2225,7 +2268,7 @@ xlog_state_do_callback( repeats = 0; xfs_fs_cmn_err(CE_WARN, log->l_mp, "%s: possible infinite loop (%d iterations)", - __FUNCTION__, flushcnt); + __func__, flushcnt); } } while (!ioerrors && loopdidcallbacks); @@ -2257,15 +2300,13 @@ xlog_state_do_callback( } #endif - flushcnt = 0; - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { - flushcnt = log->l_flushcnt; - log->l_flushcnt = 0; - } + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; spin_unlock(&log->l_icloglock); - while (flushcnt--) - vsema(&log->l_flushsema); -} /* xlog_state_do_callback */ + + if (wake) + sv_broadcast(&log->l_flush_wait); +} /* @@ -2279,8 +2320,7 @@ xlog_state_do_callback( * the second completion goes through. * * Callbacks could take time, so they are done outside the scope of the - * global state machine log lock. Assume that the calls to cvsema won't - * take a long time. At least we know it won't sleep. + * global state machine log lock. */ STATIC void xlog_state_done_syncing( @@ -2316,7 +2356,7 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - sv_broadcast(&iclog->ic_writesema); + sv_broadcast(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ @@ -2324,11 +2364,9 @@ xlog_state_done_syncing( /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must - * sleep. The flush semaphore is set to the number of in-core buffers and - * decremented around disk syncing. Therefore, if all buffers are syncing, - * this semaphore will cause new writes to sleep until a sync completes. - * Otherwise, this code just does p() followed by v(). This approximates - * a sleep/wakeup except we can't race. + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. * * The in-core logs are used in a circular fashion. They are not used * out-of-order even when an iclog past the head is free. @@ -2363,16 +2401,15 @@ restart: } iclog = log->l_iclog; - if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) { - log->l_flushcnt++; - spin_unlock(&log->l_icloglock); + if (iclog->ic_state != XLOG_STATE_ACTIVE) { xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); XFS_STATS_INC(xs_log_noiclogs); - /* Ensure that log writes happen */ - psema(&log->l_flushsema, PINOD); + + /* Wait for log writes to have flushed */ + sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); goto restart; } - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + head = &iclog->ic_header; atomic_inc(&iclog->ic_refcnt); /* prevents sync */ @@ -2406,13 +2443,20 @@ restart: if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - /* If I'm the only one writing to this iclog, sync it to disk */ - if (atomic_read(&iclog->ic_refcnt) == 1) { + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ spin_unlock(&log->l_icloglock); - if ((error = xlog_state_release_iclog(log, iclog))) + error = xlog_state_release_iclog(log, iclog); + if (error) return error; } else { - atomic_dec(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); } goto restart; @@ -2479,7 +2523,7 @@ xlog_grant_log_space(xlog_t *log, goto error_return; XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... @@ -2505,7 +2549,7 @@ redo: xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); if (XLOG_FORCED_SHUTDOWN(log)) { spin_lock(&log->l_grant_lock); @@ -2604,7 +2648,7 @@ xlog_regrant_write_log_space(xlog_t *log, if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; - sv_signal(&ntic->t_sema); + sv_signal(&ntic->t_wait); ntic = ntic->t_next; } while (ntic != log->l_write_headq); @@ -2615,7 +2659,7 @@ xlog_regrant_write_log_space(xlog_t *log, xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already @@ -2644,7 +2688,7 @@ redo: if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ if (XLOG_FORCED_SHUTDOWN(log)) { @@ -2887,7 +2931,7 @@ xlog_state_switch_iclogs(xlog_t *log, * 2. the current iclog is drity, and the previous iclog is in the * active or dirty state. * - * We may sleep (call psema) if: + * We may sleep if: * * 1. the current iclog is not in the active nor dirty state. * 2. the current iclog dirty, and the previous iclog is not in the @@ -2984,7 +3028,7 @@ maybe_sleep: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); + sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3067,7 +3111,7 @@ try_again: XLOG_STATE_SYNCING))) { ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_writesema, PSWP, + sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, &log->l_icloglock, s); *log_flushed = 1; already_slept = 1; @@ -3087,7 +3131,7 @@ try_again: !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { /* - * Don't wait on the forcesema if we know that we've + * Don't wait on completion if we know that we've * gotten a log write error. */ if (iclog->ic_state & XLOG_STATE_IOERROR) { @@ -3095,7 +3139,7 @@ try_again: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); + sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3151,7 +3195,7 @@ STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket) { - sv_destroy(&ticket->t_sema); + sv_destroy(&ticket->t_wait); kmem_zone_free(xfs_log_ticket_zone, ticket); } /* xlog_ticket_put */ @@ -3241,7 +3285,7 @@ xlog_ticket_get(xlog_t *log, tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); + sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); @@ -3528,14 +3572,14 @@ xfs_log_force_umount( */ if ((tic = log->l_reserve_headq)) { do { - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_reserve_headq); } if ((tic = log->l_write_headq)) { do { - sv_signal(&tic->t_sema); + sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_write_headq); }