dm snapshot: support barriers in snapshot merge target
drivers/md/dm-snap.c
1 /*
2  * dm-snapshot.c
3  *
4  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5  *
6  * This file is released under the GPL.
7  */
8
9 #include <linux/blkdev.h>
10 #include <linux/device-mapper.h>
11 #include <linux/delay.h>
12 #include <linux/fs.h>
13 #include <linux/init.h>
14 #include <linux/kdev_t.h>
15 #include <linux/list.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <linux/log2.h>
21 #include <linux/dm-kcopyd.h>
22 #include <linux/workqueue.h>
23
24 #include "dm-exception-store.h"
25
26 #define DM_MSG_PREFIX "snapshots"
27
28 static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
29
30 #define dm_target_is_snapshot_merge(ti) \
31         ((ti)->type->name == dm_snapshot_merge_target_name)
32
33 /*
34  * The percentage increment we will wake up users at
35  */
36 #define WAKE_UP_PERCENT 5
37
38 /*
39  * kcopyd priority of snapshot operations
40  */
41 #define SNAPSHOT_COPY_PRIORITY 2
42
43 /*
44  * Reserve 1MB for each snapshot initially (with minimum of 1 page).
45  */
46 #define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)
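/*
 * Illustrative arithmetic: with 4KiB pages (PAGE_SHIFT == 12) this evaluates
 * to 256 pages; the "?: 1" fallback only matters if the page size exceeds
 * 1MiB, where the shift would otherwise yield 0.
 */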
47
48 /*
49  * The size of the mempool used to track chunks in use.
50  */
51 #define MIN_IOS 256
52
53 #define DM_TRACKED_CHUNK_HASH_SIZE      16
54 #define DM_TRACKED_CHUNK_HASH(x)        ((unsigned long)(x) & \
55                                          (DM_TRACKED_CHUNK_HASH_SIZE - 1))
56
57 struct dm_exception_table {
58         uint32_t hash_mask;
59         unsigned hash_shift;
60         struct list_head *table;
61 };
62
63 struct dm_snapshot {
64         struct rw_semaphore lock;
65
66         struct dm_dev *origin;
67         struct dm_dev *cow;
68
69         struct dm_target *ti;
70
71         /* List of snapshots per Origin */
72         struct list_head list;
73
74         /* You can't use a snapshot if this is 0 (e.g. if full) */
75         int valid;
76
77         /* Origin writes don't trigger exceptions until this is set */
78         int active;
79
80         /* Whether or not owning mapped_device is suspended */
81         int suspended;
82
83         mempool_t *pending_pool;
84
85         atomic_t pending_exceptions_count;
86
87         struct dm_exception_table pending;
88         struct dm_exception_table complete;
89
90         /*
91          * pe_lock protects all pending_exception operations and access
92          * as well as the snapshot_bios list.
93          */
94         spinlock_t pe_lock;
95
96         /* The on disk metadata handler */
97         struct dm_exception_store *store;
98
99         struct dm_kcopyd_client *kcopyd_client;
100
101         /* Queue of snapshot writes for ksnapd to flush */
102         struct bio_list queued_bios;
103         struct work_struct queued_bios_work;
104
105         /* Chunks with outstanding reads */
106         mempool_t *tracked_chunk_pool;
107         spinlock_t tracked_chunk_lock;
108         struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
109 };
110
111 struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
112 {
113         return s->cow;
114 }
115 EXPORT_SYMBOL(dm_snap_cow);
116
117 static struct workqueue_struct *ksnapd;
118 static void flush_queued_bios(struct work_struct *work);
119
120 static sector_t chunk_to_sector(struct dm_exception_store *store,
121                                 chunk_t chunk)
122 {
123         return chunk << store->chunk_shift;
124 }
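/*
 * Illustrative: chunk_shift is log2 of the chunk size in sectors, so
 * assuming an 8-sector (4KiB) chunk size, chunk_shift == 3 and chunk 5
 * starts at sector 5 << 3 == 40 on that device.
 */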
125
126 static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
127 {
128         /*
129          * There is only ever one instance of a particular block
130          * device so we can compare pointers safely.
131          */
132         return lhs == rhs;
133 }
134
135 struct dm_snap_pending_exception {
136         struct dm_exception e;
137
138         /*
139          * Origin buffers waiting for this to complete are held
140          * in a bio list
141          */
142         struct bio_list origin_bios;
143         struct bio_list snapshot_bios;
144
145         /* Pointer back to snapshot context */
146         struct dm_snapshot *snap;
147
148         /*
149          * 1 indicates the exception has already been sent to
150          * kcopyd.
151          */
152         int started;
153 };
154
155 /*
156  * Slab caches used to allocate the completed (dm_exception) and
157  * pending (dm_snap_pending_exception) exception structures
158  */
159 static struct kmem_cache *exception_cache;
160 static struct kmem_cache *pending_cache;
161
162 struct dm_snap_tracked_chunk {
163         struct hlist_node node;
164         chunk_t chunk;
165 };
166
167 static struct kmem_cache *tracked_chunk_cache;
168
169 static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
170                                                  chunk_t chunk)
171 {
172         struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
173                                                         GFP_NOIO);
174         unsigned long flags;
175
176         c->chunk = chunk;
177
178         spin_lock_irqsave(&s->tracked_chunk_lock, flags);
179         hlist_add_head(&c->node,
180                        &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
181         spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
182
183         return c;
184 }
185
186 static void stop_tracking_chunk(struct dm_snapshot *s,
187                                 struct dm_snap_tracked_chunk *c)
188 {
189         unsigned long flags;
190
191         spin_lock_irqsave(&s->tracked_chunk_lock, flags);
192         hlist_del(&c->node);
193         spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
194
195         mempool_free(c, s->tracked_chunk_pool);
196 }
197
198 static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
199 {
200         struct dm_snap_tracked_chunk *c;
201         struct hlist_node *hn;
202         int found = 0;
203
204         spin_lock_irq(&s->tracked_chunk_lock);
205
206         hlist_for_each_entry(c, hn,
207             &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
208                 if (c->chunk == chunk) {
209                         found = 1;
210                         break;
211                 }
212         }
213
214         spin_unlock_irq(&s->tracked_chunk_lock);
215
216         return found;
217 }
218
219 /*
220  * This conflicting I/O is extremely improbable in the caller,
221  * so msleep(1) is sufficient and there is no need for a wait queue.
222  */
223 static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
224 {
225         while (__chunk_is_tracked(s, chunk))
226                 msleep(1);
227 }
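/*
 * For context: snapshot_map() uses track_chunk() when it sends a read for a
 * not-yet-remapped chunk down to the origin, and pending_complete() calls
 * __check_for_conflicting_io() so that such reads have finished before the
 * completed exception is published and queued origin writes are released.
 */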
228
229 /*
230  * One of these per registered origin, held in the snapshot_origins hash
231  */
232 struct origin {
233         /* The origin device */
234         struct block_device *bdev;
235
236         struct list_head hash_list;
237
238         /* List of snapshots for this origin */
239         struct list_head snapshots;
240 };
241
242 /*
243  * Size of the hash table for origin volumes. If we make this
244  * the size of the minors list then it should be nearly perfect
245  */
246 #define ORIGIN_HASH_SIZE 256
247 #define ORIGIN_MASK      0xFF
248 static struct list_head *_origins;
249 static struct rw_semaphore _origins_lock;
250
251 static int init_origin_hash(void)
252 {
253         int i;
254
255         _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
256                            GFP_KERNEL);
257         if (!_origins) {
258                 DMERR("unable to allocate memory");
259                 return -ENOMEM;
260         }
261
262         for (i = 0; i < ORIGIN_HASH_SIZE; i++)
263                 INIT_LIST_HEAD(_origins + i);
264         init_rwsem(&_origins_lock);
265
266         return 0;
267 }
268
269 static void exit_origin_hash(void)
270 {
271         kfree(_origins);
272 }
273
274 static unsigned origin_hash(struct block_device *bdev)
275 {
276         return bdev->bd_dev & ORIGIN_MASK;
277 }
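/*
 * Illustrative: the kernel encodes dev_t as (major << 20) | minor, so
 * bd_dev & ORIGIN_MASK is simply the low 8 bits of the minor number;
 * e.g. minor 7 lands in bucket 7, and two origins only collide if their
 * minors differ by a multiple of 256.
 */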
278
279 static struct origin *__lookup_origin(struct block_device *origin)
280 {
281         struct list_head *ol;
282         struct origin *o;
283
284         ol = &_origins[origin_hash(origin)];
285         list_for_each_entry (o, ol, hash_list)
286                 if (bdev_equal(o->bdev, origin))
287                         return o;
288
289         return NULL;
290 }
291
292 static void __insert_origin(struct origin *o)
293 {
294         struct list_head *sl = &_origins[origin_hash(o->bdev)];
295         list_add_tail(&o->hash_list, sl);
296 }
297
298 /*
299  * _origins_lock must be held when calling this function.
300  * Returns number of snapshots registered using the supplied cow device, plus:
301  * snap_src - a snapshot suitable for use as a source of exception handover
302  * snap_dest - a snapshot capable of receiving exception handover.
303  *
304  * Possible return values and states:
305  *   0: NULL, NULL  - first new snapshot
306  *   1: snap_src, NULL - normal snapshot
307  *   2: snap_src, snap_dest  - waiting for handover
308  *   2: snap_src, NULL - handed over, waiting for old to be deleted
309  *   1: NULL, snap_dest - source got destroyed without handover
310  */
311 static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
312                                         struct dm_snapshot **snap_src,
313                                         struct dm_snapshot **snap_dest)
314 {
315         struct dm_snapshot *s;
316         struct origin *o;
317         int count = 0;
318         int active;
319
320         o = __lookup_origin(snap->origin->bdev);
321         if (!o)
322                 goto out;
323
324         list_for_each_entry(s, &o->snapshots, list) {
325                 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
326                         continue;
327
328                 down_read(&s->lock);
329                 active = s->active;
330                 up_read(&s->lock);
331
332                 if (active) {
333                         if (snap_src)
334                                 *snap_src = s;
335                 } else if (snap_dest)
336                         *snap_dest = s;
337
338                 count++;
339         }
340
341 out:
342         return count;
343 }
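#if 0
/*
 * Illustrative sketch, not built: how a caller (holding _origins_lock)
 * might interpret the count and out-parameters above.  The helper name
 * is hypothetical.
 */
static int __example_handover_pending(struct dm_snapshot *snap)
{
        struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
        int count;

        count = __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest);

        /* Two snapshots share this cow and one is inactive: handover pending */
        return (count == 2) && snap_src && snap_dest;
}
#endif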
344
345 /*
346  * On success, returns 1 if this snapshot is a handover destination,
347  * otherwise returns 0.
348  */
349 static int __validate_exception_handover(struct dm_snapshot *snap)
350 {
351         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
352
353         /* Does snapshot need exceptions handed over to it? */
354         if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest) == 2) ||
355             snap_dest) {
356                 snap->ti->error = "Snapshot cow pairing for exception "
357                                   "table handover failed";
358                 return -EINVAL;
359         }
360
361         /*
362          * If no snap_src was found, snap cannot become a handover
363          * destination.
364          */
365         if (!snap_src)
366                 return 0;
367
368         return 1;
369 }
370
371 static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
372 {
373         struct dm_snapshot *l;
374
375         /* Sort the list according to chunk size, largest-first smallest-last */
376         list_for_each_entry(l, &o->snapshots, list)
377                 if (l->store->chunk_size < s->store->chunk_size)
378                         break;
379         list_add_tail(&s->list, &l->list);
380 }
381
382 /*
383  * Make a note of the snapshot and its origin so we can look it
384  * up when the origin has a write on it.
385  *
386  * Also validate snapshot exception store handovers.
387  * On success, returns 1 if this registration is a handover destination,
388  * otherwise returns 0.
389  */
390 static int register_snapshot(struct dm_snapshot *snap)
391 {
392         struct origin *o, *new_o = NULL;
393         struct block_device *bdev = snap->origin->bdev;
394         int r = 0;
395
396         new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
397         if (!new_o)
398                 return -ENOMEM;
399
400         down_write(&_origins_lock);
401
402         r = __validate_exception_handover(snap);
403         if (r < 0) {
404                 kfree(new_o);
405                 goto out;
406         }
407
408         o = __lookup_origin(bdev);
409         if (o)
410                 kfree(new_o);
411         else {
412                 /* New origin */
413                 o = new_o;
414
415                 /* Initialise the struct */
416                 INIT_LIST_HEAD(&o->snapshots);
417                 o->bdev = bdev;
418
419                 __insert_origin(o);
420         }
421
422         __insert_snapshot(o, snap);
423
424 out:
425         up_write(&_origins_lock);
426
427         return r;
428 }
429
430 /*
431  * Move snapshot to correct place in list according to chunk size.
432  */
433 static void reregister_snapshot(struct dm_snapshot *s)
434 {
435         struct block_device *bdev = s->origin->bdev;
436
437         down_write(&_origins_lock);
438
439         list_del(&s->list);
440         __insert_snapshot(__lookup_origin(bdev), s);
441
442         up_write(&_origins_lock);
443 }
444
445 static void unregister_snapshot(struct dm_snapshot *s)
446 {
447         struct origin *o;
448
449         down_write(&_origins_lock);
450         o = __lookup_origin(s->origin->bdev);
451
452         list_del(&s->list);
453         if (o && list_empty(&o->snapshots)) {
454                 list_del(&o->hash_list);
455                 kfree(o);
456         }
457
458         up_write(&_origins_lock);
459 }
460
461 /*
462  * Implementation of the exception hash tables.
463  * The lowest hash_shift bits of the chunk number are ignored, allowing
464  * some consecutive chunks to be grouped together.
465  */
466 static int dm_exception_table_init(struct dm_exception_table *et,
467                                    uint32_t size, unsigned hash_shift)
468 {
469         unsigned int i;
470
471         et->hash_shift = hash_shift;
472         et->hash_mask = size - 1;
473         et->table = dm_vcalloc(size, sizeof(struct list_head));
474         if (!et->table)
475                 return -ENOMEM;
476
477         for (i = 0; i < size; i++)
478                 INIT_LIST_HEAD(et->table + i);
479
480         return 0;
481 }
482
483 static void dm_exception_table_exit(struct dm_exception_table *et,
484                                     struct kmem_cache *mem)
485 {
486         struct list_head *slot;
487         struct dm_exception *ex, *next;
488         int i, size;
489
490         size = et->hash_mask + 1;
491         for (i = 0; i < size; i++) {
492                 slot = et->table + i;
493
494                 list_for_each_entry_safe (ex, next, slot, hash_list)
495                         kmem_cache_free(mem, ex);
496         }
497
498         vfree(et->table);
499 }
500
501 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
502 {
503         return (chunk >> et->hash_shift) & et->hash_mask;
504 }
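/*
 * For example, the completed table is created with
 * hash_shift == DM_CHUNK_CONSECUTIVE_BITS (see init_hash_tables() below),
 * so every chunk that could belong to one consecutive run hashes to the
 * same bucket; the pending table uses hash_shift == 0 and hashes each
 * chunk individually.
 */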
505
506 static void dm_remove_exception(struct dm_exception *e)
507 {
508         list_del(&e->hash_list);
509 }
510
511 /*
512  * Return the exception data for a chunk, or NULL if the chunk is
513  * not remapped.
514  */
515 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
516                                                 chunk_t chunk)
517 {
518         struct list_head *slot;
519         struct dm_exception *e;
520
521         slot = &et->table[exception_hash(et, chunk)];
522         list_for_each_entry (e, slot, hash_list)
523                 if (chunk >= e->old_chunk &&
524                     chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
525                         return e;
526
527         return NULL;
528 }
529
530 static struct dm_exception *alloc_completed_exception(void)
531 {
532         struct dm_exception *e;
533
534         e = kmem_cache_alloc(exception_cache, GFP_NOIO);
535         if (!e)
536                 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
537
538         return e;
539 }
540
541 static void free_completed_exception(struct dm_exception *e)
542 {
543         kmem_cache_free(exception_cache, e);
544 }
545
546 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
547 {
548         struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
549                                                              GFP_NOIO);
550
551         atomic_inc(&s->pending_exceptions_count);
552         pe->snap = s;
553
554         return pe;
555 }
556
557 static void free_pending_exception(struct dm_snap_pending_exception *pe)
558 {
559         struct dm_snapshot *s = pe->snap;
560
561         mempool_free(pe, s->pending_pool);
562         smp_mb__before_atomic_dec();
563         atomic_dec(&s->pending_exceptions_count);
564 }
565
566 static void dm_insert_exception(struct dm_exception_table *eh,
567                                 struct dm_exception *new_e)
568 {
569         struct list_head *l;
570         struct dm_exception *e = NULL;
571
572         l = &eh->table[exception_hash(eh, new_e->old_chunk)];
573
574         /* Add immediately if this table doesn't support consecutive chunks */
575         if (!eh->hash_shift)
576                 goto out;
577
578         /* List is ordered by old_chunk */
579         list_for_each_entry_reverse(e, l, hash_list) {
580                 /* Insert after an existing chunk? */
581                 if (new_e->old_chunk == (e->old_chunk +
582                                          dm_consecutive_chunk_count(e) + 1) &&
583                     new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
584                                          dm_consecutive_chunk_count(e) + 1)) {
585                         dm_consecutive_chunk_count_inc(e);
586                         free_completed_exception(new_e);
587                         return;
588                 }
589
590                 /* Insert before an existing chunk? */
591                 if (new_e->old_chunk == (e->old_chunk - 1) &&
592                     new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
593                         dm_consecutive_chunk_count_inc(e);
594                         e->old_chunk--;
595                         e->new_chunk--;
596                         free_completed_exception(new_e);
597                         return;
598                 }
599
600                 if (new_e->old_chunk > e->old_chunk)
601                         break;
602         }
603
604 out:
605         list_add(&new_e->hash_list, e ? &e->hash_list : l);
606 }
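/*
 * Worked example: if the table holds old_chunk 10 -> new_chunk 20 with a
 * consecutive count of 0, inserting 11 -> 21 hits the "insert after" case
 * and only bumps the count (the entry now covers 10-11 -> 20-21), while
 * inserting 9 -> 19 hits the "insert before" case and slides the entry
 * back to start at 9 -> 19.
 */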
607
608 /*
609  * Callback used by the exception stores to load exceptions when
610  * initialising.
611  */
612 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
613 {
614         struct dm_snapshot *s = context;
615         struct dm_exception *e;
616
617         e = alloc_completed_exception();
618         if (!e)
619                 return -ENOMEM;
620
621         e->old_chunk = old;
622
623         /* Consecutive_count is implicitly initialised to zero */
624         e->new_chunk = new;
625
626         dm_insert_exception(&s->complete, e);
627
628         return 0;
629 }
630
631 #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
632
633 /*
634  * Return a minimum chunk size of all snapshots that have the specified origin.
635  * Return zero if the origin has no snapshots.
636  */
637 static sector_t __minimum_chunk_size(struct origin *o)
638 {
639         struct dm_snapshot *snap;
640         unsigned chunk_size = 0;
641
642         if (o)
643                 list_for_each_entry(snap, &o->snapshots, list)
644                         chunk_size = min_not_zero(chunk_size,
645                                                   snap->store->chunk_size);
646
647         return chunk_size;
648 }
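/*
 * Example: min_not_zero(0, 8) == 8 and min_not_zero(16, 8) == 8, so an
 * origin with snapshots using 16- and 8-sector chunks reports 8, and the
 * function returns 0 only when the origin has no snapshots at all.
 */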
649
650 /*
651  * Hard coded magic.
652  */
653 static int calc_max_buckets(void)
654 {
655         /* use a fixed size of 2MB */
656         unsigned long mem = 2 * 1024 * 1024;
657         mem /= sizeof(struct list_head);
658
659         return mem;
660 }
661
662 /*
663  * Allocate room for a suitable hash table.
664  */
665 static int init_hash_tables(struct dm_snapshot *s)
666 {
667         sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
668
669         /*
670          * Calculate based on the size of the original volume or
671          * the COW volume...
672          */
673         cow_dev_size = get_dev_size(s->cow->bdev);
674         origin_dev_size = get_dev_size(s->origin->bdev);
675         max_buckets = calc_max_buckets();
676
677         hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
678         hash_size = min(hash_size, max_buckets);
679
680         if (hash_size < 64)
681                 hash_size = 64;
682         hash_size = rounddown_pow_of_two(hash_size);
683         if (dm_exception_table_init(&s->complete, hash_size,
684                                     DM_CHUNK_CONSECUTIVE_BITS))
685                 return -ENOMEM;
686
687         /*
688          * Allocate hash table for in-flight exceptions
689          * Make this smaller than the real hash table
690          */
691         hash_size >>= 3;
692         if (hash_size < 64)
693                 hash_size = 64;
694
695         if (dm_exception_table_init(&s->pending, hash_size, 0)) {
696                 dm_exception_table_exit(&s->complete, exception_cache);
697                 return -ENOMEM;
698         }
699
700         return 0;
701 }
702
703 /*
704  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
705  */
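/*
 * Illustrative table line (device names are examples only):
 *   0 2097152 snapshot /dev/vg/base /dev/vg/base-cow P 8
 * i.e. a persistent ('P') snapshot of /dev/vg/base backed by
 * /dev/vg/base-cow with an 8-sector (4KiB) chunk size; 'N' selects a
 * transient store.  The snapshot-merge target takes the same four
 * arguments.
 */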
706 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
707 {
708         struct dm_snapshot *s;
709         int i;
710         int r = -EINVAL;
711         char *origin_path, *cow_path;
712         unsigned args_used, num_flush_requests = 1;
713         fmode_t origin_mode = FMODE_READ;
714
715         if (argc != 4) {
716                 ti->error = "requires exactly 4 arguments";
717                 r = -EINVAL;
718                 goto bad;
719         }
720
721         if (dm_target_is_snapshot_merge(ti)) {
722                 num_flush_requests = 2;
723                 origin_mode = FMODE_WRITE;
724         }
725
726         origin_path = argv[0];
727         argv++;
728         argc--;
729
730         s = kmalloc(sizeof(*s), GFP_KERNEL);
731         if (!s) {
732                 ti->error = "Cannot allocate snapshot context private "
733                     "structure";
734                 r = -ENOMEM;
735                 goto bad;
736         }
737
738         cow_path = argv[0];
739         argv++;
740         argc--;
741
742         r = dm_get_device(ti, cow_path, 0, 0,
743                           FMODE_READ | FMODE_WRITE, &s->cow);
744         if (r) {
745                 ti->error = "Cannot get COW device";
746                 goto bad_cow;
747         }
748
749         r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
750         if (r) {
751                 ti->error = "Couldn't create exception store";
752                 r = -EINVAL;
753                 goto bad_store;
754         }
755
756         argv += args_used;
757         argc -= args_used;
758
759         r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin);
760         if (r) {
761                 ti->error = "Cannot get origin device";
762                 goto bad_origin;
763         }
764
765         s->ti = ti;
766         s->valid = 1;
767         s->active = 0;
768         s->suspended = 0;
769         atomic_set(&s->pending_exceptions_count, 0);
770         init_rwsem(&s->lock);
771         INIT_LIST_HEAD(&s->list);
772         spin_lock_init(&s->pe_lock);
773
774         /* Allocate hash table for COW data */
775         if (init_hash_tables(s)) {
776                 ti->error = "Unable to allocate hash table space";
777                 r = -ENOMEM;
778                 goto bad_hash_tables;
779         }
780
781         r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
782         if (r) {
783                 ti->error = "Could not create kcopyd client";
784                 goto bad_kcopyd;
785         }
786
787         s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
788         if (!s->pending_pool) {
789                 ti->error = "Could not allocate mempool for pending exceptions";
790                 goto bad_pending_pool;
791         }
792
793         s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
794                                                          tracked_chunk_cache);
795         if (!s->tracked_chunk_pool) {
796                 ti->error = "Could not allocate tracked_chunk mempool for "
797                             "tracking reads";
798                 goto bad_tracked_chunk_pool;
799         }
800
801         for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
802                 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
803
804         spin_lock_init(&s->tracked_chunk_lock);
805
806         bio_list_init(&s->queued_bios);
807         INIT_WORK(&s->queued_bios_work, flush_queued_bios);
808
809         ti->private = s;
810         ti->num_flush_requests = num_flush_requests;
811
812         /* Add snapshot to the list of snapshots for this origin */
813         /* Exceptions aren't triggered till snapshot_resume() is called */
814         r = register_snapshot(s);
815         if (r == -ENOMEM) {
816                 ti->error = "Snapshot origin struct allocation failed";
817                 goto bad_load_and_register;
818         } else if (r < 0) {
819                 /* invalid handover, register_snapshot has set ti->error */
820                 goto bad_load_and_register;
821         }
822
823         /*
824          * Metadata must only be loaded into one table at once, so skip this
825          * if metadata will be handed over during resume.
826          * Chunk size will be set during the handover - set it to zero to
827          * ensure it's ignored.
828          */
829         if (r > 0) {
830                 s->store->chunk_size = 0;
831                 return 0;
832         }
833
834         r = s->store->type->read_metadata(s->store, dm_add_exception,
835                                           (void *)s);
836         if (r < 0) {
837                 ti->error = "Failed to read snapshot metadata";
838                 goto bad_read_metadata;
839         } else if (r > 0) {
840                 s->valid = 0;
841                 DMWARN("Snapshot is marked invalid.");
842         }
843
844         if (!s->store->chunk_size) {
845                 ti->error = "Chunk size not set";
846                 goto bad_read_metadata;
847         }
848         ti->split_io = s->store->chunk_size;
849
850         return 0;
851
852 bad_read_metadata:
853         unregister_snapshot(s);
854
855 bad_load_and_register:
856         mempool_destroy(s->tracked_chunk_pool);
857
858 bad_tracked_chunk_pool:
859         mempool_destroy(s->pending_pool);
860
861 bad_pending_pool:
862         dm_kcopyd_client_destroy(s->kcopyd_client);
863
864 bad_kcopyd:
865         dm_exception_table_exit(&s->pending, pending_cache);
866         dm_exception_table_exit(&s->complete, exception_cache);
867
868 bad_hash_tables:
869         dm_put_device(ti, s->origin);
870
871 bad_origin:
872         dm_exception_store_destroy(s->store);
873
874 bad_store:
875         dm_put_device(ti, s->cow);
876
877 bad_cow:
878         kfree(s);
879
880 bad:
881         return r;
882 }
883
884 static void __free_exceptions(struct dm_snapshot *s)
885 {
886         dm_kcopyd_client_destroy(s->kcopyd_client);
887         s->kcopyd_client = NULL;
888
889         dm_exception_table_exit(&s->pending, pending_cache);
890         dm_exception_table_exit(&s->complete, exception_cache);
891 }
892
893 static void __handover_exceptions(struct dm_snapshot *snap_src,
894                                   struct dm_snapshot *snap_dest)
895 {
896         union {
897                 struct dm_exception_table table_swap;
898                 struct dm_exception_store *store_swap;
899         } u;
900
901         /*
902          * Swap all snapshot context information between the two instances.
903          */
904         u.table_swap = snap_dest->complete;
905         snap_dest->complete = snap_src->complete;
906         snap_src->complete = u.table_swap;
907
908         u.store_swap = snap_dest->store;
909         snap_dest->store = snap_src->store;
910         snap_src->store = u.store_swap;
911
912         snap_dest->store->snap = snap_dest;
913         snap_src->store->snap = snap_src;
914
915         snap_dest->ti->split_io = snap_dest->store->chunk_size;
916         snap_dest->valid = snap_src->valid;
917
918         /*
919          * Set source invalid to ensure it receives no further I/O.
920          */
921         snap_src->valid = 0;
922 }
923
924 static void snapshot_dtr(struct dm_target *ti)
925 {
926 #ifdef CONFIG_DM_DEBUG
927         int i;
928 #endif
929         struct dm_snapshot *s = ti->private;
930         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
931
932         flush_workqueue(ksnapd);
933
934         down_read(&_origins_lock);
935         /* Check whether exception handover must be cancelled */
936         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
937         if (snap_src && snap_dest && (s == snap_src)) {
938                 down_write(&snap_dest->lock);
939                 snap_dest->valid = 0;
940                 up_write(&snap_dest->lock);
941                 DMERR("Cancelling snapshot handover.");
942         }
943         up_read(&_origins_lock);
944
945         /* Prevent further origin writes from using this snapshot. */
946         /* After this returns there can be no new kcopyd jobs. */
947         unregister_snapshot(s);
948
949         while (atomic_read(&s->pending_exceptions_count))
950                 msleep(1);
951         /*
952          * Ensure instructions in mempool_destroy aren't reordered
953          * before atomic_read.
954          */
955         smp_mb();
956
957 #ifdef CONFIG_DM_DEBUG
958         for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
959                 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
960 #endif
961
962         mempool_destroy(s->tracked_chunk_pool);
963
964         __free_exceptions(s);
965
966         mempool_destroy(s->pending_pool);
967
968         dm_put_device(ti, s->origin);
969
970         dm_exception_store_destroy(s->store);
971
972         dm_put_device(ti, s->cow);
973
974         kfree(s);
975 }
976
977 /*
978  * Flush a list of buffers.
979  */
980 static void flush_bios(struct bio *bio)
981 {
982         struct bio *n;
983
984         while (bio) {
985                 n = bio->bi_next;
986                 bio->bi_next = NULL;
987                 generic_make_request(bio);
988                 bio = n;
989         }
990 }
991
992 static void flush_queued_bios(struct work_struct *work)
993 {
994         struct dm_snapshot *s =
995                 container_of(work, struct dm_snapshot, queued_bios_work);
996         struct bio *queued_bios;
997         unsigned long flags;
998
999         spin_lock_irqsave(&s->pe_lock, flags);
1000         queued_bios = bio_list_get(&s->queued_bios);
1001         spin_unlock_irqrestore(&s->pe_lock, flags);
1002
1003         flush_bios(queued_bios);
1004 }
1005
1006 static int do_origin(struct dm_dev *origin, struct bio *bio);
1007
1008 /*
1009  * Retry a list of origin bios by re-issuing each one through do_origin().
1010  */
1011 static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1012 {
1013         struct bio *n;
1014         int r;
1015
1016         while (bio) {
1017                 n = bio->bi_next;
1018                 bio->bi_next = NULL;
1019                 r = do_origin(s->origin, bio);
1020                 if (r == DM_MAPIO_REMAPPED)
1021                         generic_make_request(bio);
1022                 bio = n;
1023         }
1024 }
1025
1026 /*
1027  * Error a list of buffers.
1028  */
1029 static void error_bios(struct bio *bio)
1030 {
1031         struct bio *n;
1032
1033         while (bio) {
1034                 n = bio->bi_next;
1035                 bio->bi_next = NULL;
1036                 bio_io_error(bio);
1037                 bio = n;
1038         }
1039 }
1040
1041 static void __invalidate_snapshot(struct dm_snapshot *s, int err)
1042 {
1043         if (!s->valid)
1044                 return;
1045
1046         if (err == -EIO)
1047                 DMERR("Invalidating snapshot: Error reading/writing.");
1048         else if (err == -ENOMEM)
1049                 DMERR("Invalidating snapshot: Unable to allocate exception.");
1050
1051         if (s->store->type->drop_snapshot)
1052                 s->store->type->drop_snapshot(s->store);
1053
1054         s->valid = 0;
1055
1056         dm_table_event(s->ti->table);
1057 }
1058
1059 static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1060 {
1061         struct dm_exception *e;
1062         struct dm_snapshot *s = pe->snap;
1063         struct bio *origin_bios = NULL;
1064         struct bio *snapshot_bios = NULL;
1065         int error = 0;
1066
1067         if (!success) {
1068                 /* Read/write error - snapshot is unusable */
1069                 down_write(&s->lock);
1070                 __invalidate_snapshot(s, -EIO);
1071                 error = 1;
1072                 goto out;
1073         }
1074
1075         e = alloc_completed_exception();
1076         if (!e) {
1077                 down_write(&s->lock);
1078                 __invalidate_snapshot(s, -ENOMEM);
1079                 error = 1;
1080                 goto out;
1081         }
1082         *e = pe->e;
1083
1084         down_write(&s->lock);
1085         if (!s->valid) {
1086                 free_completed_exception(e);
1087                 error = 1;
1088                 goto out;
1089         }
1090
1091         /* Check for conflicting reads */
1092         __check_for_conflicting_io(s, pe->e.old_chunk);
1093
1094         /*
1095          * Add a proper exception, and remove the
1096          * in-flight exception from the list.
1097          */
1098         dm_insert_exception(&s->complete, e);
1099
1100  out:
1101         dm_remove_exception(&pe->e);
1102         snapshot_bios = bio_list_get(&pe->snapshot_bios);
1103         origin_bios = bio_list_get(&pe->origin_bios);
1104         free_pending_exception(pe);
1105
1106         up_write(&s->lock);
1107
1108         /* Submit any pending write bios */
1109         if (error)
1110                 error_bios(snapshot_bios);
1111         else
1112                 flush_bios(snapshot_bios);
1113
1114         retry_origin_bios(s, origin_bios);
1115 }
1116
1117 static void commit_callback(void *context, int success)
1118 {
1119         struct dm_snap_pending_exception *pe = context;
1120
1121         pending_complete(pe, success);
1122 }
1123
1124 /*
1125  * Called when the copy I/O has finished.  kcopyd actually runs
1126  * this code so don't block.
1127  */
1128 static void copy_callback(int read_err, unsigned long write_err, void *context)
1129 {
1130         struct dm_snap_pending_exception *pe = context;
1131         struct dm_snapshot *s = pe->snap;
1132
1133         if (read_err || write_err)
1134                 pending_complete(pe, 0);
1135
1136         else
1137                 /* Update the metadata if we are persistent */
1138                 s->store->type->commit_exception(s->store, &pe->e,
1139                                                  commit_callback, pe);
1140 }
1141
1142 /*
1143  * Dispatches the copy operation to kcopyd.
1144  */
1145 static void start_copy(struct dm_snap_pending_exception *pe)
1146 {
1147         struct dm_snapshot *s = pe->snap;
1148         struct dm_io_region src, dest;
1149         struct block_device *bdev = s->origin->bdev;
1150         sector_t dev_size;
1151
1152         dev_size = get_dev_size(bdev);
1153
1154         src.bdev = bdev;
1155         src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
1156         src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
1157
1158         dest.bdev = s->cow->bdev;
1159         dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
1160         dest.count = src.count;
1161
1162         /* Hand over to kcopyd */
1163         dm_kcopyd_copy(s->kcopyd_client,
1164                     &src, 1, &dest, 0, copy_callback, pe);
1165 }
1166
1167 static struct dm_snap_pending_exception *
1168 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
1169 {
1170         struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
1171
1172         if (!e)
1173                 return NULL;
1174
1175         return container_of(e, struct dm_snap_pending_exception, e);
1176 }
1177
1178 /*
1179  * Looks to see if this snapshot already has a pending exception
1180  * for this chunk, otherwise it allocates a new one and inserts
1181  * it into the pending table.
1182  *
1183  * NOTE: a write lock must be held on snap->lock before calling
1184  * this.
1185  */
1186 static struct dm_snap_pending_exception *
1187 __find_pending_exception(struct dm_snapshot *s,
1188                          struct dm_snap_pending_exception *pe, chunk_t chunk)
1189 {
1190         struct dm_snap_pending_exception *pe2;
1191
1192         pe2 = __lookup_pending_exception(s, chunk);
1193         if (pe2) {
1194                 free_pending_exception(pe);
1195                 return pe2;
1196         }
1197
1198         pe->e.old_chunk = chunk;
1199         bio_list_init(&pe->origin_bios);
1200         bio_list_init(&pe->snapshot_bios);
1201         pe->started = 0;
1202
1203         if (s->store->type->prepare_exception(s->store, &pe->e)) {
1204                 free_pending_exception(pe);
1205                 return NULL;
1206         }
1207
1208         dm_insert_exception(&s->pending, &pe->e);
1209
1210         return pe;
1211 }
1212
1213 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1214                             struct bio *bio, chunk_t chunk)
1215 {
1216         bio->bi_bdev = s->cow->bdev;
1217         bio->bi_sector = chunk_to_sector(s->store,
1218                                          dm_chunk_number(e->new_chunk) +
1219                                          (chunk - e->old_chunk)) +
1220                                          (bio->bi_sector &
1221                                           s->store->chunk_mask);
1222 }
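/*
 * Worked example: with 8-sector chunks (chunk_shift == 3, chunk_mask == 7)
 * and an exception remapping old_chunk 5 to new_chunk 9, a bio at sector 43
 * (chunk 5, offset 3) is redirected to COW sector (9 << 3) + 3 == 75.
 */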
1223
1224 static int snapshot_map(struct dm_target *ti, struct bio *bio,
1225                         union map_info *map_context)
1226 {
1227         struct dm_exception *e;
1228         struct dm_snapshot *s = ti->private;
1229         int r = DM_MAPIO_REMAPPED;
1230         chunk_t chunk;
1231         struct dm_snap_pending_exception *pe = NULL;
1232
1233         if (unlikely(bio_empty_barrier(bio))) {
1234                 bio->bi_bdev = s->cow->bdev;
1235                 return DM_MAPIO_REMAPPED;
1236         }
1237
1238         chunk = sector_to_chunk(s->store, bio->bi_sector);
1239
1240         /* Full snapshots are not usable */
1241         /* To get here the table must be live so s->active is always set. */
1242         if (!s->valid)
1243                 return -EIO;
1244
1245         /* FIXME: should only take write lock if we need
1246          * to copy an exception */
1247         down_write(&s->lock);
1248
1249         if (!s->valid) {
1250                 r = -EIO;
1251                 goto out_unlock;
1252         }
1253
1254         /* If the block is already remapped - use that, else remap it */
1255         e = dm_lookup_exception(&s->complete, chunk);
1256         if (e) {
1257                 remap_exception(s, e, bio, chunk);
1258                 goto out_unlock;
1259         }
1260
1261         /*
1262          * Write to snapshot - higher level takes care of RW/RO
1263          * flags so we should only get this if we are
1264          * writeable.
1265          */
1266         if (bio_rw(bio) == WRITE) {
1267                 pe = __lookup_pending_exception(s, chunk);
1268                 if (!pe) {
1269                         up_write(&s->lock);
1270                         pe = alloc_pending_exception(s);
1271                         down_write(&s->lock);
1272
1273                         if (!s->valid) {
1274                                 free_pending_exception(pe);
1275                                 r = -EIO;
1276                                 goto out_unlock;
1277                         }
1278
1279                         e = dm_lookup_exception(&s->complete, chunk);
1280                         if (e) {
1281                                 free_pending_exception(pe);
1282                                 remap_exception(s, e, bio, chunk);
1283                                 goto out_unlock;
1284                         }
1285
1286                         pe = __find_pending_exception(s, pe, chunk);
1287                         if (!pe) {
1288                                 __invalidate_snapshot(s, -ENOMEM);
1289                                 r = -EIO;
1290                                 goto out_unlock;
1291                         }
1292                 }
1293
1294                 remap_exception(s, &pe->e, bio, chunk);
1295                 bio_list_add(&pe->snapshot_bios, bio);
1296
1297                 r = DM_MAPIO_SUBMITTED;
1298
1299                 if (!pe->started) {
1300                         /* this is protected by snap->lock */
1301                         pe->started = 1;
1302                         up_write(&s->lock);
1303                         start_copy(pe);
1304                         goto out;
1305                 }
1306         } else {
1307                 bio->bi_bdev = s->origin->bdev;
1308                 map_context->ptr = track_chunk(s, chunk);
1309         }
1310
1311  out_unlock:
1312         up_write(&s->lock);
1313  out:
1314         return r;
1315 }
1316
1317 /*
1318  * A snapshot-merge target behaves like a combination of a snapshot
1319  * target and a snapshot-origin target.  It only generates new
1320  * exceptions in other snapshots and not in the one that is being
1321  * merged.
1322  *
1323  * For each chunk, if there is an existing exception, it is used to
1324  * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
1325  * which in turn might generate exceptions in other snapshots.
1326  */
1327 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1328                               union map_info *map_context)
1329 {
1330         struct dm_exception *e;
1331         struct dm_snapshot *s = ti->private;
1332         int r = DM_MAPIO_REMAPPED;
1333         chunk_t chunk;
1334
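        /*
         * The merge target registers num_flush_requests == 2 in
         * snapshot_ctr(), so an empty barrier arrives here twice: flush
         * request 0 is sent to the origin and request 1 to the COW device,
         * ensuring both get flushed.
         */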
1335         if (unlikely(bio_empty_barrier(bio))) {
1336                 if (!map_context->flush_request)
1337                         bio->bi_bdev = s->origin->bdev;
1338                 else
1339                         bio->bi_bdev = s->cow->bdev;
1340                 map_context->ptr = NULL;
1341                 return DM_MAPIO_REMAPPED;
1342         }
1343
1344         chunk = sector_to_chunk(s->store, bio->bi_sector);
1345
1346         down_read(&s->lock);
1347
1348         /* Full snapshots are not usable */
1349         if (!s->valid) {
1350                 r = -EIO;
1351                 goto out_unlock;
1352         }
1353
1354         /* If the block is already remapped - use that */
1355         e = dm_lookup_exception(&s->complete, chunk);
1356         if (e) {
1357                 remap_exception(s, e, bio, chunk);
1358                 goto out_unlock;
1359         }
1360
1361         bio->bi_bdev = s->origin->bdev;
1362
1363         if (bio_rw(bio) == WRITE) {
1364                 up_read(&s->lock);
1365                 return do_origin(s->origin, bio);
1366         }
1367
1368 out_unlock:
1369         up_read(&s->lock);
1370
1371         return r;
1372 }
1373
1374 static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1375                            int error, union map_info *map_context)
1376 {
1377         struct dm_snapshot *s = ti->private;
1378         struct dm_snap_tracked_chunk *c = map_context->ptr;
1379
1380         if (c)
1381                 stop_tracking_chunk(s, c);
1382
1383         return 0;
1384 }
1385
1386 static void snapshot_postsuspend(struct dm_target *ti)
1387 {
1388         struct dm_snapshot *s = ti->private;
1389
1390         down_write(&s->lock);
1391         s->suspended = 1;
1392         up_write(&s->lock);
1393 }
1394
1395 static int snapshot_preresume(struct dm_target *ti)
1396 {
1397         int r = 0;
1398         struct dm_snapshot *s = ti->private;
1399         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1400
1401         down_read(&_origins_lock);
1402         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
1403         if (snap_src && snap_dest) {
1404                 down_read(&snap_src->lock);
1405                 if (s == snap_src) {
1406                         DMERR("Unable to resume snapshot source until "
1407                               "handover completes.");
1408                         r = -EINVAL;
1409                 } else if (!snap_src->suspended) {
1410                         DMERR("Unable to perform snapshot handover until "
1411                               "source is suspended.");
1412                         r = -EINVAL;
1413                 }
1414                 up_read(&snap_src->lock);
1415         }
1416         up_read(&_origins_lock);
1417
1418         return r;
1419 }
1420
1421 static void snapshot_resume(struct dm_target *ti)
1422 {
1423         struct dm_snapshot *s = ti->private;
1424         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1425
1426         down_read(&_origins_lock);
1427         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
1428         if (snap_src && snap_dest) {
1429                 down_write(&snap_src->lock);
1430                 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
1431                 __handover_exceptions(snap_src, snap_dest);
1432                 up_write(&snap_dest->lock);
1433                 up_write(&snap_src->lock);
1434         }
1435         up_read(&_origins_lock);
1436
1437         /* Now we have correct chunk size, reregister */
1438         reregister_snapshot(s);
1439
1440         down_write(&s->lock);
1441         s->active = 1;
1442         s->suspended = 0;
1443         up_write(&s->lock);
1444 }
1445
1446 static int snapshot_status(struct dm_target *ti, status_type_t type,
1447                            char *result, unsigned int maxlen)
1448 {
1449         unsigned sz = 0;
1450         struct dm_snapshot *snap = ti->private;
1451
1452         switch (type) {
1453         case STATUSTYPE_INFO:
1454
1455                 down_write(&snap->lock);
1456
1457                 if (!snap->valid)
1458                         DMEMIT("Invalid");
1459                 else {
1460                         if (snap->store->type->usage) {
1461                                 sector_t total_sectors, sectors_allocated,
1462                                          metadata_sectors;
1463                                 snap->store->type->usage(snap->store,
1464                                                          &total_sectors,
1465                                                          &sectors_allocated,
1466                                                          &metadata_sectors);
1467                                 DMEMIT("%llu/%llu %llu",
1468                                        (unsigned long long)sectors_allocated,
1469                                        (unsigned long long)total_sectors,
1470                                        (unsigned long long)metadata_sectors);
1471                         }
1472                         else
1473                                 DMEMIT("Unknown");
1474                 }
1475
1476                 up_write(&snap->lock);
1477
1478                 break;
1479
1480         case STATUSTYPE_TABLE:
1481                 /*
1482                  * kdevname returns a static pointer so we need
1483                  * to make private copies if the output is to
1484                  * make sense.
1485                  */
1486                 DMEMIT("%s %s", snap->origin->name, snap->cow->name);
1487                 snap->store->type->status(snap->store, type, result + sz,
1488                                           maxlen - sz);
1489                 break;
1490         }
1491
1492         return 0;
1493 }
1494
1495 static int snapshot_iterate_devices(struct dm_target *ti,
1496                                     iterate_devices_callout_fn fn, void *data)
1497 {
1498         struct dm_snapshot *snap = ti->private;
1499
1500         return fn(ti, snap->origin, 0, ti->len, data);
1501 }
1502
1503
1504 /*-----------------------------------------------------------------
1505  * Origin methods
1506  *---------------------------------------------------------------*/
1507
1508 /*
1509  * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
1510  * supplied bio was ignored.  The caller may submit it immediately.
1511  * (No remapping actually occurs as the origin is always a direct linear
1512  * map.)
1513  *
1514  * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
1515  * and any supplied bio is added to a list to be submitted once all
1516  * the necessary exceptions exist.
1517  */
1518 static int __origin_write(struct list_head *snapshots, sector_t sector,
1519                           struct bio *bio)
1520 {
1521         int r = DM_MAPIO_REMAPPED;
1522         struct dm_snapshot *snap;
1523         struct dm_exception *e;
1524         struct dm_snap_pending_exception *pe;
1525         struct dm_snap_pending_exception *pe_to_start_now = NULL;
1526         struct dm_snap_pending_exception *pe_to_start_last = NULL;
1527         chunk_t chunk;
1528
1529         /* Do all the snapshots on this origin */
1530         list_for_each_entry (snap, snapshots, list) {
1531                 /*
1532                  * Don't make new exceptions in a merging snapshot
1533                  * because it has effectively been deleted
1534                  */
1535                 if (dm_target_is_snapshot_merge(snap->ti))
1536                         continue;
1537
1538                 down_write(&snap->lock);
1539
1540                 /* Only deal with valid and active snapshots */
1541                 if (!snap->valid || !snap->active)
1542                         goto next_snapshot;
1543
1544                 /* Nothing to do if writing beyond end of snapshot */
1545                 if (sector >= dm_table_get_size(snap->ti->table))
1546                         goto next_snapshot;
1547
1548                 /*
1549                  * Remember, different snapshots can have
1550                  * different chunk sizes.
1551                  */
1552                 chunk = sector_to_chunk(snap->store, sector);
1553
1554                 /*
1555                  * Check exception table to see if block
1556                  * is already remapped in this snapshot
1557                  * and trigger an exception if not.
1558                  */
1559                 e = dm_lookup_exception(&snap->complete, chunk);
1560                 if (e)
1561                         goto next_snapshot;
1562
1563                 pe = __lookup_pending_exception(snap, chunk);
1564                 if (!pe) {
1565                         up_write(&snap->lock);
1566                         pe = alloc_pending_exception(snap);
1567                         down_write(&snap->lock);
1568
1569                         if (!snap->valid) {
1570                                 free_pending_exception(pe);
1571                                 goto next_snapshot;
1572                         }
1573
1574                         e = dm_lookup_exception(&snap->complete, chunk);
1575                         if (e) {
1576                                 free_pending_exception(pe);
1577                                 goto next_snapshot;
1578                         }
1579
1580                         pe = __find_pending_exception(snap, pe, chunk);
1581                         if (!pe) {
1582                                 __invalidate_snapshot(snap, -ENOMEM);
1583                                 goto next_snapshot;
1584                         }
1585                 }
1586
1587                 r = DM_MAPIO_SUBMITTED;
1588
1589                 /*
1590                  * If an origin bio was supplied, queue it to wait for the
1591                  * completion of this exception, and start this one last,
1592                  * at the end of the function.
1593                  */
1594                 if (bio) {
1595                         bio_list_add(&pe->origin_bios, bio);
1596                         bio = NULL;
1597
1598                         if (!pe->started) {
1599                                 pe->started = 1;
1600                                 pe_to_start_last = pe;
1601                         }
1602                 }
1603
1604                 if (!pe->started) {
1605                         pe->started = 1;
1606                         pe_to_start_now = pe;
1607                 }
1608
1609  next_snapshot:
1610                 up_write(&snap->lock);
1611
1612                 if (pe_to_start_now) {
1613                         start_copy(pe_to_start_now);
1614                         pe_to_start_now = NULL;
1615                 }
1616         }
1617
1618         /*
1619          * Submit the exception against which the bio is queued last,
1620          * to give the other exceptions a head start.
1621          */
1622         if (pe_to_start_last)
1623                 start_copy(pe_to_start_last);
1624
1625         return r;
1626 }
1627
1628 /*
1629  * Called on a write from the origin driver.
1630  */
1631 static int do_origin(struct dm_dev *origin, struct bio *bio)
1632 {
1633         struct origin *o;
1634         int r = DM_MAPIO_REMAPPED;
1635
1636         down_read(&_origins_lock);
1637         o = __lookup_origin(origin->bdev);
1638         if (o)
1639                 r = __origin_write(&o->snapshots, bio->bi_sector, bio);
1640         up_read(&_origins_lock);
1641
1642         return r;
1643 }
1644
1645 /*
1646  * Origin: maps a linear range of a device, with hooks for snapshotting.
1647  */
1648
1649 /*
1650  * Construct an origin mapping: <dev_path>
1651  * The context for an origin is merely a 'struct dm_dev *'
1652  * pointing to the real device.
1653  */
1654 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1655 {
1656         int r;
1657         struct dm_dev *dev;
1658
1659         if (argc != 1) {
1660                 ti->error = "origin: incorrect number of arguments";
1661                 return -EINVAL;
1662         }
1663
1664         r = dm_get_device(ti, argv[0], 0, ti->len,
1665                           dm_table_get_mode(ti->table), &dev);
1666         if (r) {
1667                 ti->error = "Cannot get target device";
1668                 return r;
1669         }
1670
1671         ti->private = dev;
1672         ti->num_flush_requests = 1;
1673
1674         return 0;
1675 }
1676
1677 static void origin_dtr(struct dm_target *ti)
1678 {
1679         struct dm_dev *dev = ti->private;
1680         dm_put_device(ti, dev);
1681 }
1682
1683 static int origin_map(struct dm_target *ti, struct bio *bio,
1684                       union map_info *map_context)
1685 {
1686         struct dm_dev *dev = ti->private;
1687         bio->bi_bdev = dev->bdev;
1688
1689         if (unlikely(bio_empty_barrier(bio)))
1690                 return DM_MAPIO_REMAPPED;
1691
1692         /* Only tell snapshots if this is a write */
1693         return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1694 }
1695
1696 /*
1697  * Set the target "split_io" field to the minimum of all the snapshots'
1698  * chunk sizes.
1699  */
1700 static void origin_resume(struct dm_target *ti)
1701 {
1702         struct dm_dev *dev = ti->private;
1703
1704         down_read(&_origins_lock);
1705
1706         ti->split_io = __minimum_chunk_size(__lookup_origin(dev->bdev));
1707
1708         up_read(&_origins_lock);
1709 }
1710
1711 static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1712                          unsigned int maxlen)
1713 {
1714         struct dm_dev *dev = ti->private;
1715
1716         switch (type) {
1717         case STATUSTYPE_INFO:
1718                 result[0] = '\0';
1719                 break;
1720
1721         case STATUSTYPE_TABLE:
1722                 snprintf(result, maxlen, "%s", dev->name);
1723                 break;
1724         }
1725
1726         return 0;
1727 }
1728
1729 static int origin_iterate_devices(struct dm_target *ti,
1730                                   iterate_devices_callout_fn fn, void *data)
1731 {
1732         struct dm_dev *dev = ti->private;
1733
1734         return fn(ti, dev, 0, ti->len, data);
1735 }
1736
1737 static struct target_type origin_target = {
1738         .name    = "snapshot-origin",
1739         .version = {1, 7, 0},
1740         .module  = THIS_MODULE,
1741         .ctr     = origin_ctr,
1742         .dtr     = origin_dtr,
1743         .map     = origin_map,
1744         .resume  = origin_resume,
1745         .status  = origin_status,
1746         .iterate_devices = origin_iterate_devices,
1747 };
1748
1749 static struct target_type snapshot_target = {
1750         .name    = "snapshot",
1751         .version = {1, 9, 0},
1752         .module  = THIS_MODULE,
1753         .ctr     = snapshot_ctr,
1754         .dtr     = snapshot_dtr,
1755         .map     = snapshot_map,
1756         .end_io  = snapshot_end_io,
1757         .postsuspend = snapshot_postsuspend,
1758         .preresume  = snapshot_preresume,
1759         .resume  = snapshot_resume,
1760         .status  = snapshot_status,
1761         .iterate_devices = snapshot_iterate_devices,
1762 };
1763
1764 static struct target_type merge_target = {
1765         .name    = dm_snapshot_merge_target_name,
1766         .version = {1, 0, 0},
1767         .module  = THIS_MODULE,
1768         .ctr     = snapshot_ctr,
1769         .dtr     = snapshot_dtr,
1770         .map     = snapshot_merge_map,
1771         .end_io  = snapshot_end_io,
1772         .postsuspend = snapshot_postsuspend,
1773         .preresume  = snapshot_preresume,
1774         .resume  = snapshot_resume,
1775         .status  = snapshot_status,
1776         .iterate_devices = snapshot_iterate_devices,
1777 };
1778
1779 static int __init dm_snapshot_init(void)
1780 {
1781         int r;
1782
1783         r = dm_exception_store_init();
1784         if (r) {
1785                 DMERR("Failed to initialize exception stores");
1786                 return r;
1787         }
1788
1789         r = dm_register_target(&snapshot_target);
1790         if (r < 0) {
1791                 DMERR("snapshot target register failed %d", r);
1792                 goto bad_register_snapshot_target;
1793         }
1794
1795         r = dm_register_target(&origin_target);
1796         if (r < 0) {
1797                 DMERR("Origin target register failed %d", r);
1798                 goto bad_register_origin_target;
1799         }
1800
1801         r = dm_register_target(&merge_target);
1802         if (r < 0) {
1803                 DMERR("Merge target register failed %d", r);
1804                 goto bad_register_merge_target;
1805         }
1806
1807         r = init_origin_hash();
1808         if (r) {
1809                 DMERR("init_origin_hash failed.");
1810                 goto bad_origin_hash;
1811         }
1812
1813         exception_cache = KMEM_CACHE(dm_exception, 0);
1814         if (!exception_cache) {
1815                 DMERR("Couldn't create exception cache.");
1816                 r = -ENOMEM;
1817                 goto bad_exception_cache;
1818         }
1819
1820         pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
1821         if (!pending_cache) {
1822                 DMERR("Couldn't create pending cache.");
1823                 r = -ENOMEM;
1824                 goto bad_pending_cache;
1825         }
1826
1827         tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
1828         if (!tracked_chunk_cache) {
1829                 DMERR("Couldn't create cache to track chunks in use.");
1830                 r = -ENOMEM;
1831                 goto bad_tracked_chunk_cache;
1832         }
1833
1834         ksnapd = create_singlethread_workqueue("ksnapd");
1835         if (!ksnapd) {
1836                 DMERR("Failed to create ksnapd workqueue.");
1837                 r = -ENOMEM;
1838                 goto bad_pending_pool;
1839         }
1840
1841         return 0;
1842
1843 bad_pending_pool:
1844         kmem_cache_destroy(tracked_chunk_cache);
1845 bad_tracked_chunk_cache:
1846         kmem_cache_destroy(pending_cache);
1847 bad_pending_cache:
1848         kmem_cache_destroy(exception_cache);
1849 bad_exception_cache:
1850         exit_origin_hash();
1851 bad_origin_hash:
1852         dm_unregister_target(&merge_target);
1853 bad_register_merge_target:
1854         dm_unregister_target(&origin_target);
1855 bad_register_origin_target:
1856         dm_unregister_target(&snapshot_target);
1857 bad_register_snapshot_target:
1858         dm_exception_store_exit();
1859
1860         return r;
1861 }
1862
1863 static void __exit dm_snapshot_exit(void)
1864 {
1865         destroy_workqueue(ksnapd);
1866
1867         dm_unregister_target(&snapshot_target);
1868         dm_unregister_target(&origin_target);
1869         dm_unregister_target(&merge_target);
1870
1871         exit_origin_hash();
1872         kmem_cache_destroy(pending_cache);
1873         kmem_cache_destroy(exception_cache);
1874         kmem_cache_destroy(tracked_chunk_cache);
1875
1876         dm_exception_store_exit();
1877 }
1878
1879 /* Module hooks */
1880 module_init(dm_snapshot_init);
1881 module_exit(dm_snapshot_exit);
1882
1883 MODULE_DESCRIPTION(DM_NAME " snapshot target");
1884 MODULE_AUTHOR("Joe Thornber");
1885 MODULE_LICENSE("GPL");