linux-imx.git: fs/btrfs/relocation.c
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32 #include "free-space-cache.h"
33 #include "inode-map.h"
34
35 /*
36  * backref_node, mapping_node and tree_block start with this
37  */
38 struct tree_entry {
39         struct rb_node rb_node;
40         u64 bytenr;
41 };
42
43 /*
44  * represents a tree block in the backref cache
45  */
46 struct backref_node {
47         struct rb_node rb_node;
48         u64 bytenr;
49
50         u64 new_bytenr;
51         /* objectid of the tree block owner; may not be up to date */
52         u64 owner;
53         /* link to pending, changed or detached list */
54         struct list_head list;
55         /* list of upper level blocks that reference this block */
56         struct list_head upper;
57         /* list of child blocks in the cache */
58         struct list_head lower;
59         /* NULL if this node is not tree root */
60         struct btrfs_root *root;
61         /* extent buffer obtained by COWing the block */
62         struct extent_buffer *eb;
63         /* level of tree block */
64         unsigned int level:8;
65         /* is the block in a non-reference-counted tree */
66         unsigned int cowonly:1;
67         /* 1 if no child node in the cache */
68         unsigned int lowest:1;
69         /* is the extent buffer locked */
70         unsigned int locked:1;
71         /* has the block been processed */
72         unsigned int processed:1;
73         /* have backrefs of this block been checked */
74         unsigned int checked:1;
75         /*
76          * 1 if corresponding block has been cowed but some upper
77          * level block pointers may not point to the new location
78          */
79         unsigned int pending:1;
80         /*
81          * 1 if the backref node isn't connected to any other
82          * backref node.
83          */
84         unsigned int detached:1;
85 };
86
87 /*
88  * represents a block pointer in the backref cache
89  */
90 struct backref_edge {
91         struct list_head list[2];
92         struct backref_node *node[2];
93 };
94
95 #define LOWER   0
96 #define UPPER   1
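/*
 * Illustrative sketch (derived from the code further down in this file, the
 * variable names here are only examples): an edge links a child block (LOWER)
 * to a parent block (UPPER). list[LOWER] is threaded onto the child's ->upper
 * list and list[UPPER] onto the parent's ->lower list, roughly:
 *
 *	edge->node[LOWER] = child;
 *	edge->node[UPPER] = parent;
 *	list_add_tail(&edge->list[LOWER], &child->upper);
 *	list_add_tail(&edge->list[UPPER], &parent->lower);
 */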
97
98 struct backref_cache {
99         /* red black tree of all backref nodes in the cache */
100         struct rb_root rb_root;
101         /* for passing backref nodes to btrfs_reloc_cow_block */
102         struct backref_node *path[BTRFS_MAX_LEVEL];
103         /*
104          * list of blocks that have been cowed but some block
105          * pointers in upper level blocks may not reflect the
106          * new location
107          */
108         struct list_head pending[BTRFS_MAX_LEVEL];
109         /* list of backref nodes with no child node */
110         struct list_head leaves;
111         /* list of blocks that have been cowed in current transaction */
112         struct list_head changed;
113         /* list of detached backref nodes */
114         struct list_head detached;
115
116         u64 last_trans;
117
118         int nr_nodes;
119         int nr_edges;
120 };
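/*
 * Descriptive note, based on the helpers below: nr_nodes and nr_edges count
 * outstanding allocations made by alloc_backref_node() and
 * alloc_backref_edge(); backref_cache_cleanup() checks that both have dropped
 * back to zero when the cache is torn down.
 */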
121
122 /*
123  * map address of tree root to tree
124  */
125 struct mapping_node {
126         struct rb_node rb_node;
127         u64 bytenr;
128         void *data;
129 };
130
131 struct mapping_tree {
132         struct rb_root rb_root;
133         spinlock_t lock;
134 };
135
136 /*
137  * represents a tree block to be processed
138  */
139 struct tree_block {
140         struct rb_node rb_node;
141         u64 bytenr;
142         struct btrfs_key key;
143         unsigned int level:8;
144         unsigned int key_ready:1;
145 };
146
147 #define MAX_EXTENTS 128
148
149 struct file_extent_cluster {
150         u64 start;
151         u64 end;
152         u64 boundary[MAX_EXTENTS];
153         unsigned int nr;
154 };
155
156 struct reloc_control {
157         /* block group to relocate */
158         struct btrfs_block_group_cache *block_group;
159         /* extent tree */
160         struct btrfs_root *extent_root;
161         /* inode for moving data */
162         struct inode *data_inode;
163
164         struct btrfs_block_rsv *block_rsv;
165
166         struct backref_cache backref_cache;
167
168         struct file_extent_cluster cluster;
169         /* tree blocks that have been processed */
170         struct extent_io_tree processed_blocks;
171         /* map start of tree root to corresponding reloc tree */
172         struct mapping_tree reloc_root_tree;
173         /* list of reloc trees */
174         struct list_head reloc_roots;
175         /* size of metadata reservation for merging reloc trees */
176         u64 merging_rsv_size;
177         /* size of relocated tree nodes */
178         u64 nodes_relocated;
179
180         u64 search_start;
181         u64 extents_found;
182
183         unsigned int stage:8;
184         unsigned int create_reloc_tree:1;
185         unsigned int merge_reloc_tree:1;
186         unsigned int found_file_extent:1;
187         unsigned int commit_transaction:1;
188 };
189
190 /* stages of data relocation */
191 #define MOVE_DATA_EXTENTS       0
192 #define UPDATE_DATA_PTRS        1
193
194 static void remove_backref_node(struct backref_cache *cache,
195                                 struct backref_node *node);
196 static void __mark_block_processed(struct reloc_control *rc,
197                                    struct backref_node *node);
198
199 static void mapping_tree_init(struct mapping_tree *tree)
200 {
201         tree->rb_root = RB_ROOT;
202         spin_lock_init(&tree->lock);
203 }
204
205 static void backref_cache_init(struct backref_cache *cache)
206 {
207         int i;
208         cache->rb_root = RB_ROOT;
209         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
210                 INIT_LIST_HEAD(&cache->pending[i]);
211         INIT_LIST_HEAD(&cache->changed);
212         INIT_LIST_HEAD(&cache->detached);
213         INIT_LIST_HEAD(&cache->leaves);
214 }
215
216 static void backref_cache_cleanup(struct backref_cache *cache)
217 {
218         struct backref_node *node;
219         int i;
220
221         while (!list_empty(&cache->detached)) {
222                 node = list_entry(cache->detached.next,
223                                   struct backref_node, list);
224                 remove_backref_node(cache, node);
225         }
226
227         while (!list_empty(&cache->leaves)) {
228                 node = list_entry(cache->leaves.next,
229                                   struct backref_node, lower);
230                 remove_backref_node(cache, node);
231         }
232
233         cache->last_trans = 0;
234
235         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236                 BUG_ON(!list_empty(&cache->pending[i]));
237         BUG_ON(!list_empty(&cache->changed));
238         BUG_ON(!list_empty(&cache->detached));
239         BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240         BUG_ON(cache->nr_nodes);
241         BUG_ON(cache->nr_edges);
242 }
243
244 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245 {
246         struct backref_node *node;
247
248         node = kzalloc(sizeof(*node), GFP_NOFS);
249         if (node) {
250                 INIT_LIST_HEAD(&node->list);
251                 INIT_LIST_HEAD(&node->upper);
252                 INIT_LIST_HEAD(&node->lower);
253                 RB_CLEAR_NODE(&node->rb_node);
254                 cache->nr_nodes++;
255         }
256         return node;
257 }
258
259 static void free_backref_node(struct backref_cache *cache,
260                               struct backref_node *node)
261 {
262         if (node) {
263                 cache->nr_nodes--;
264                 kfree(node);
265         }
266 }
267
268 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269 {
270         struct backref_edge *edge;
271
272         edge = kzalloc(sizeof(*edge), GFP_NOFS);
273         if (edge)
274                 cache->nr_edges++;
275         return edge;
276 }
277
278 static void free_backref_edge(struct backref_cache *cache,
279                               struct backref_edge *edge)
280 {
281         if (edge) {
282                 cache->nr_edges--;
283                 kfree(edge);
284         }
285 }
286
287 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
288                                    struct rb_node *node)
289 {
290         struct rb_node **p = &root->rb_node;
291         struct rb_node *parent = NULL;
292         struct tree_entry *entry;
293
294         while (*p) {
295                 parent = *p;
296                 entry = rb_entry(parent, struct tree_entry, rb_node);
297
298                 if (bytenr < entry->bytenr)
299                         p = &(*p)->rb_left;
300                 else if (bytenr > entry->bytenr)
301                         p = &(*p)->rb_right;
302                 else
303                         return parent;
304         }
305
306         rb_link_node(node, parent, p);
307         rb_insert_color(node, root);
308         return NULL;
309 }
310
311 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
312 {
313         struct rb_node *n = root->rb_node;
314         struct tree_entry *entry;
315
316         while (n) {
317                 entry = rb_entry(n, struct tree_entry, rb_node);
318
319                 if (bytenr < entry->bytenr)
320                         n = n->rb_left;
321                 else if (bytenr > entry->bytenr)
322                         n = n->rb_right;
323                 else
324                         return n;
325         }
326         return NULL;
327 }
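/*
 * Illustrative usage sketch: both helpers above key the rb-tree by bytenr
 * through the common 'struct tree_entry' prefix, roughly:
 *
 *	rb_node = tree_search(&cache->rb_root, bytenr);
 *	if (rb_node)
 *		node = rb_entry(rb_node, struct backref_node, rb_node);
 *
 * tree_insert() returns the colliding rb_node instead of inserting when an
 * entry with the same bytenr already exists; callers treat that as a fatal
 * inconsistency (see backref_tree_panic() below).
 */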
328
329 void backref_tree_panic(struct rb_node *rb_node, int errno,
330                                           u64 bytenr)
331 {
332
333         struct btrfs_fs_info *fs_info = NULL;
334         struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
335                                               rb_node);
336         if (bnode->root)
337                 fs_info = bnode->root->fs_info;
338         btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
339                     "found at offset %llu\n", (unsigned long long)bytenr);
340 }
341
342 /*
343  * walk up backref nodes until reaching the node that represents the tree root
344  */
345 static struct backref_node *walk_up_backref(struct backref_node *node,
346                                             struct backref_edge *edges[],
347                                             int *index)
348 {
349         struct backref_edge *edge;
350         int idx = *index;
351
352         while (!list_empty(&node->upper)) {
353                 edge = list_entry(node->upper.next,
354                                   struct backref_edge, list[LOWER]);
355                 edges[idx++] = edge;
356                 node = edge->node[UPPER];
357         }
358         BUG_ON(node->detached);
359         *index = idx;
360         return node;
361 }
362
363 /*
364  * walk down backref nodes to find start of next reference path
365  */
366 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
367                                               int *index)
368 {
369         struct backref_edge *edge;
370         struct backref_node *lower;
371         int idx = *index;
372
373         while (idx > 0) {
374                 edge = edges[idx - 1];
375                 lower = edge->node[LOWER];
376                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
377                         idx--;
378                         continue;
379                 }
380                 edge = list_entry(edge->list[LOWER].next,
381                                   struct backref_edge, list[LOWER]);
382                 edges[idx - 1] = edge;
383                 *index = idx;
384                 return edge->node[UPPER];
385         }
386         *index = 0;
387         return NULL;
388 }
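/*
 * Sketch of the traversal pattern these two helpers support, based on how
 * callers further down in this file (outside this excerpt) use them; the
 * variable names are illustrative:
 *
 *	struct backref_edge *edges[BTRFS_MAX_LEVEL];
 *	struct backref_node *next = node;
 *	int index = 0;
 *
 *	while (1) {
 *		next = walk_up_backref(next, edges, &index);
 *		... 'next' now represents a tree root, process it ...
 *		next = walk_down_backref(edges, &index);
 *		if (!next)
 *			break;
 *	}
 */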
389
390 static void unlock_node_buffer(struct backref_node *node)
391 {
392         if (node->locked) {
393                 btrfs_tree_unlock(node->eb);
394                 node->locked = 0;
395         }
396 }
397
398 static void drop_node_buffer(struct backref_node *node)
399 {
400         if (node->eb) {
401                 unlock_node_buffer(node);
402                 free_extent_buffer(node->eb);
403                 node->eb = NULL;
404         }
405 }
406
407 static void drop_backref_node(struct backref_cache *tree,
408                               struct backref_node *node)
409 {
410         BUG_ON(!list_empty(&node->upper));
411
412         drop_node_buffer(node);
413         list_del(&node->list);
414         list_del(&node->lower);
415         if (!RB_EMPTY_NODE(&node->rb_node))
416                 rb_erase(&node->rb_node, &tree->rb_root);
417         free_backref_node(tree, node);
418 }
419
420 /*
421  * remove a backref node from the backref cache
422  */
423 static void remove_backref_node(struct backref_cache *cache,
424                                 struct backref_node *node)
425 {
426         struct backref_node *upper;
427         struct backref_edge *edge;
428
429         if (!node)
430                 return;
431
432         BUG_ON(!node->lowest && !node->detached);
433         while (!list_empty(&node->upper)) {
434                 edge = list_entry(node->upper.next, struct backref_edge,
435                                   list[LOWER]);
436                 upper = edge->node[UPPER];
437                 list_del(&edge->list[LOWER]);
438                 list_del(&edge->list[UPPER]);
439                 free_backref_edge(cache, edge);
440
441                 if (RB_EMPTY_NODE(&upper->rb_node)) {
442                         BUG_ON(!list_empty(&node->upper));
443                         drop_backref_node(cache, node);
444                         node = upper;
445                         node->lowest = 1;
446                         continue;
447                 }
448                 /*
449                  * add the node to the leaf node list if no other
450                  * child block is cached.
451                  */
452                 if (list_empty(&upper->lower)) {
453                         list_add_tail(&upper->lower, &cache->leaves);
454                         upper->lowest = 1;
455                 }
456         }
457
458         drop_backref_node(cache, node);
459 }
460
461 static void update_backref_node(struct backref_cache *cache,
462                                 struct backref_node *node, u64 bytenr)
463 {
464         struct rb_node *rb_node;
465         rb_erase(&node->rb_node, &cache->rb_root);
466         node->bytenr = bytenr;
467         rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
468         if (rb_node)
469                 backref_tree_panic(rb_node, -EEXIST, bytenr);
470 }
471
472 /*
473  * update backref cache after a transaction commit
474  */
475 static int update_backref_cache(struct btrfs_trans_handle *trans,
476                                 struct backref_cache *cache)
477 {
478         struct backref_node *node;
479         int level = 0;
480
481         if (cache->last_trans == 0) {
482                 cache->last_trans = trans->transid;
483                 return 0;
484         }
485
486         if (cache->last_trans == trans->transid)
487                 return 0;
488
489         /*
490          * detached nodes are used to avoid unnecessary backref
491          * lookup. a transaction commit changes the extent tree,
492          * so the detached nodes are no longer useful.
493          */
494         while (!list_empty(&cache->detached)) {
495                 node = list_entry(cache->detached.next,
496                                   struct backref_node, list);
497                 remove_backref_node(cache, node);
498         }
499
500         while (!list_empty(&cache->changed)) {
501                 node = list_entry(cache->changed.next,
502                                   struct backref_node, list);
503                 list_del_init(&node->list);
504                 BUG_ON(node->pending);
505                 update_backref_node(cache, node, node->new_bytenr);
506         }
507
508         /*
509          * some nodes can be left in the pending list if there were
510          * errors while processing the pending nodes.
511          */
512         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
513                 list_for_each_entry(node, &cache->pending[level], list) {
514                         BUG_ON(!node->pending);
515                         if (node->bytenr == node->new_bytenr)
516                                 continue;
517                         update_backref_node(cache, node, node->new_bytenr);
518                 }
519         }
520
521         cache->last_trans = 0;
522         return 1;
523 }
524
525
526 static int should_ignore_root(struct btrfs_root *root)
527 {
528         struct btrfs_root *reloc_root;
529
530         if (!root->ref_cows)
531                 return 0;
532
533         reloc_root = root->reloc_root;
534         if (!reloc_root)
535                 return 0;
536
537         if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
538             root->fs_info->running_transaction->transid - 1)
539                 return 0;
540         /*
541          * if there is a reloc tree and it was created in a previous
542          * transaction, backref lookup can find the reloc tree,
543          * so the backref node for the fs tree root is useless for
544          * relocation.
545          */
546         return 1;
547 }
548 /*
549  * find reloc tree by address of tree root
550  */
551 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
552                                           u64 bytenr)
553 {
554         struct rb_node *rb_node;
555         struct mapping_node *node;
556         struct btrfs_root *root = NULL;
557
558         spin_lock(&rc->reloc_root_tree.lock);
559         rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
560         if (rb_node) {
561                 node = rb_entry(rb_node, struct mapping_node, rb_node);
562                 root = (struct btrfs_root *)node->data;
563         }
564         spin_unlock(&rc->reloc_root_tree.lock);
565         return root;
566 }
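/*
 * Descriptive note: the mapping searched here is inserted by
 * __add_reloc_root() and re-keyed or deleted by __update_reloc_root()
 * further down in this file when a reloc root's tree node changes or the
 * mapping is no longer needed.
 */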
567
568 static int is_cowonly_root(u64 root_objectid)
569 {
570         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
571             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
572             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
573             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
574             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
575             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
576                 return 1;
577         return 0;
578 }
579
580 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
581                                         u64 root_objectid)
582 {
583         struct btrfs_key key;
584
585         key.objectid = root_objectid;
586         key.type = BTRFS_ROOT_ITEM_KEY;
587         if (is_cowonly_root(root_objectid))
588                 key.offset = 0;
589         else
590                 key.offset = (u64)-1;
591
592         return btrfs_read_fs_root_no_name(fs_info, &key);
593 }
594
595 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
596 static noinline_for_stack
597 struct btrfs_root *find_tree_root(struct reloc_control *rc,
598                                   struct extent_buffer *leaf,
599                                   struct btrfs_extent_ref_v0 *ref0)
600 {
601         struct btrfs_root *root;
602         u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
603         u64 generation = btrfs_ref_generation_v0(leaf, ref0);
604
605         BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
606
607         root = read_fs_root(rc->extent_root->fs_info, root_objectid);
608         BUG_ON(IS_ERR(root));
609
610         if (root->ref_cows &&
611             generation != btrfs_root_generation(&root->root_item))
612                 return NULL;
613
614         return root;
615 }
616 #endif
617
618 static noinline_for_stack
619 int find_inline_backref(struct extent_buffer *leaf, int slot,
620                         unsigned long *ptr, unsigned long *end)
621 {
622         struct btrfs_key key;
623         struct btrfs_extent_item *ei;
624         struct btrfs_tree_block_info *bi;
625         u32 item_size;
626
627         btrfs_item_key_to_cpu(leaf, &key, slot);
628
629         item_size = btrfs_item_size_nr(leaf, slot);
630 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
631         if (item_size < sizeof(*ei)) {
632                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
633                 return 1;
634         }
635 #endif
636         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
637         WARN_ON(!(btrfs_extent_flags(leaf, ei) &
638                   BTRFS_EXTENT_FLAG_TREE_BLOCK));
639
640         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
641             item_size <= sizeof(*ei) + sizeof(*bi)) {
642                 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
643                 return 1;
644         }
645
646         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
647                 bi = (struct btrfs_tree_block_info *)(ei + 1);
648                 *ptr = (unsigned long)(bi + 1);
649         } else {
650                 *ptr = (unsigned long)(ei + 1);
651         }
652         *end = (unsigned long)ei + item_size;
653         return 0;
654 }
655
656 /*
657  * build backref tree for a given tree block. the root of the backref tree
658  * corresponds to the tree block, and the leaves of the backref tree
659  * correspond to the roots of b-trees that reference the tree block.
660  *
661  * the basic idea of this function is to check backrefs of a given block
662  * to find upper level blocks that reference the block, and then check
663  * backrefs of these upper level blocks recursively. the recursion stops
664  * when the tree root is reached or backrefs for the block are cached.
665  *
666  * NOTE: if we find backrefs for a block are cached, we know backrefs
667  * for all upper level blocks that directly/indirectly reference the
668  * block are also cached.
669  */
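/*
 * High-level sketch of the loop below, for orientation only:
 *
 *	cur = backref node for 'bytenr';
 * again:
 *	find all backref items for cur->bytenr in the extent tree;
 *	for each backref:
 *		if it identifies a tree root, record cur->root and stop
 *		following this path; otherwise find or create the upper
 *		level backref node, link it to cur with an edge, and queue
 *		unchecked upper nodes on a pending list;
 *	if the pending list is not empty, take the next queued node as
 *	'cur' and goto again;
 *	finally insert the finished, non-cowonly nodes into cache->rb_root,
 *	splice the edges onto the nodes' upper/lower lists, and detach or
 *	free the nodes found to be useless.
 */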
670 static noinline_for_stack
671 struct backref_node *build_backref_tree(struct reloc_control *rc,
672                                         struct btrfs_key *node_key,
673                                         int level, u64 bytenr)
674 {
675         struct backref_cache *cache = &rc->backref_cache;
676         struct btrfs_path *path1;
677         struct btrfs_path *path2;
678         struct extent_buffer *eb;
679         struct btrfs_root *root;
680         struct backref_node *cur;
681         struct backref_node *upper;
682         struct backref_node *lower;
683         struct backref_node *node = NULL;
684         struct backref_node *exist = NULL;
685         struct backref_edge *edge;
686         struct rb_node *rb_node;
687         struct btrfs_key key;
688         unsigned long end;
689         unsigned long ptr;
690         LIST_HEAD(list);
691         LIST_HEAD(useless);
692         int cowonly;
693         int ret;
694         int err = 0;
695
696         path1 = btrfs_alloc_path();
697         path2 = btrfs_alloc_path();
698         if (!path1 || !path2) {
699                 err = -ENOMEM;
700                 goto out;
701         }
702         path1->reada = 1;
703         path2->reada = 2;
704
705         node = alloc_backref_node(cache);
706         if (!node) {
707                 err = -ENOMEM;
708                 goto out;
709         }
710
711         node->bytenr = bytenr;
712         node->level = level;
713         node->lowest = 1;
714         cur = node;
715 again:
716         end = 0;
717         ptr = 0;
718         key.objectid = cur->bytenr;
719         key.type = BTRFS_METADATA_ITEM_KEY;
720         key.offset = (u64)-1;
721
722         path1->search_commit_root = 1;
723         path1->skip_locking = 1;
724         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
725                                 0, 0);
726         if (ret < 0) {
727                 err = ret;
728                 goto out;
729         }
730         BUG_ON(!ret || !path1->slots[0]);
731
732         path1->slots[0]--;
733
734         WARN_ON(cur->checked);
735         if (!list_empty(&cur->upper)) {
736                 /*
737                  * the backref was added previously when processing
738                  * backref of type BTRFS_TREE_BLOCK_REF_KEY
739                  */
740                 BUG_ON(!list_is_singular(&cur->upper));
741                 edge = list_entry(cur->upper.next, struct backref_edge,
742                                   list[LOWER]);
743                 BUG_ON(!list_empty(&edge->list[UPPER]));
744                 exist = edge->node[UPPER];
745                 /*
746                  * add the upper level block to the pending list if we
747                  * need to check its backrefs
748                  */
749                 if (!exist->checked)
750                         list_add_tail(&edge->list[UPPER], &list);
751         } else {
752                 exist = NULL;
753         }
754
755         while (1) {
756                 cond_resched();
757                 eb = path1->nodes[0];
758
759                 if (ptr >= end) {
760                         if (path1->slots[0] >= btrfs_header_nritems(eb)) {
761                                 ret = btrfs_next_leaf(rc->extent_root, path1);
762                                 if (ret < 0) {
763                                         err = ret;
764                                         goto out;
765                                 }
766                                 if (ret > 0)
767                                         break;
768                                 eb = path1->nodes[0];
769                         }
770
771                         btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
772                         if (key.objectid != cur->bytenr) {
773                                 WARN_ON(exist);
774                                 break;
775                         }
776
777                         if (key.type == BTRFS_EXTENT_ITEM_KEY ||
778                             key.type == BTRFS_METADATA_ITEM_KEY) {
779                                 ret = find_inline_backref(eb, path1->slots[0],
780                                                           &ptr, &end);
781                                 if (ret)
782                                         goto next;
783                         }
784                 }
785
786                 if (ptr < end) {
787                         /* update key for inline back ref */
788                         struct btrfs_extent_inline_ref *iref;
789                         iref = (struct btrfs_extent_inline_ref *)ptr;
790                         key.type = btrfs_extent_inline_ref_type(eb, iref);
791                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
792                         WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
793                                 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
794                 }
795
796                 if (exist &&
797                     ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
798                       exist->owner == key.offset) ||
799                      (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
800                       exist->bytenr == key.offset))) {
801                         exist = NULL;
802                         goto next;
803                 }
804
805 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
806                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
807                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
808                         if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
809                                 struct btrfs_extent_ref_v0 *ref0;
810                                 ref0 = btrfs_item_ptr(eb, path1->slots[0],
811                                                 struct btrfs_extent_ref_v0);
812                                 if (key.objectid == key.offset) {
813                                         root = find_tree_root(rc, eb, ref0);
814                                         if (root && !should_ignore_root(root))
815                                                 cur->root = root;
816                                         else
817                                                 list_add(&cur->list, &useless);
818                                         break;
819                                 }
820                                 if (is_cowonly_root(btrfs_ref_root_v0(eb,
821                                                                       ref0)))
822                                         cur->cowonly = 1;
823                         }
824 #else
825                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
826                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
827 #endif
828                         if (key.objectid == key.offset) {
829                                 /*
830                                  * only root blocks of reloc trees use
831                                  * backrefs of this type.
832                                  */
833                                 root = find_reloc_root(rc, cur->bytenr);
834                                 BUG_ON(!root);
835                                 cur->root = root;
836                                 break;
837                         }
838
839                         edge = alloc_backref_edge(cache);
840                         if (!edge) {
841                                 err = -ENOMEM;
842                                 goto out;
843                         }
844                         rb_node = tree_search(&cache->rb_root, key.offset);
845                         if (!rb_node) {
846                                 upper = alloc_backref_node(cache);
847                                 if (!upper) {
848                                         free_backref_edge(cache, edge);
849                                         err = -ENOMEM;
850                                         goto out;
851                                 }
852                                 upper->bytenr = key.offset;
853                                 upper->level = cur->level + 1;
854                                 /*
855                                  *  backrefs for the upper level block aren't
856                                  *  cached, add the block to the pending list
857                                  */
858                                 list_add_tail(&edge->list[UPPER], &list);
859                         } else {
860                                 upper = rb_entry(rb_node, struct backref_node,
861                                                  rb_node);
862                                 BUG_ON(!upper->checked);
863                                 INIT_LIST_HEAD(&edge->list[UPPER]);
864                         }
865                         list_add_tail(&edge->list[LOWER], &cur->upper);
866                         edge->node[LOWER] = cur;
867                         edge->node[UPPER] = upper;
868
869                         goto next;
870                 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
871                         goto next;
872                 }
873
874                 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
875                 root = read_fs_root(rc->extent_root->fs_info, key.offset);
876                 if (IS_ERR(root)) {
877                         err = PTR_ERR(root);
878                         goto out;
879                 }
880
881                 if (!root->ref_cows)
882                         cur->cowonly = 1;
883
884                 if (btrfs_root_level(&root->root_item) == cur->level) {
885                         /* tree root */
886                         BUG_ON(btrfs_root_bytenr(&root->root_item) !=
887                                cur->bytenr);
888                         if (should_ignore_root(root))
889                                 list_add(&cur->list, &useless);
890                         else
891                                 cur->root = root;
892                         break;
893                 }
894
895                 level = cur->level + 1;
896
897                 /*
898                  * search the tree to find upper level blocks that
899                  * reference the block.
900                  */
901                 path2->search_commit_root = 1;
902                 path2->skip_locking = 1;
903                 path2->lowest_level = level;
904                 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
905                 path2->lowest_level = 0;
906                 if (ret < 0) {
907                         err = ret;
908                         goto out;
909                 }
910                 if (ret > 0 && path2->slots[level] > 0)
911                         path2->slots[level]--;
912
913                 eb = path2->nodes[level];
914                 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
915                         cur->bytenr);
916
917                 lower = cur;
918                 for (; level < BTRFS_MAX_LEVEL; level++) {
919                         if (!path2->nodes[level]) {
920                                 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
921                                        lower->bytenr);
922                                 if (should_ignore_root(root))
923                                         list_add(&lower->list, &useless);
924                                 else
925                                         lower->root = root;
926                                 break;
927                         }
928
929                         edge = alloc_backref_edge(cache);
930                         if (!edge) {
931                                 err = -ENOMEM;
932                                 goto out;
933                         }
934
935                         eb = path2->nodes[level];
936                         rb_node = tree_search(&cache->rb_root, eb->start);
937                         if (!rb_node) {
938                                 upper = alloc_backref_node(cache);
939                                 if (!upper) {
940                                         free_backref_edge(cache, edge);
941                                         err = -ENOMEM;
942                                         goto out;
943                                 }
944                                 upper->bytenr = eb->start;
945                                 upper->owner = btrfs_header_owner(eb);
946                                 upper->level = lower->level + 1;
947                                 if (!root->ref_cows)
948                                         upper->cowonly = 1;
949
950                                 /*
951                                  * if we know the block isn't shared
952                                  * we can avoid checking its backrefs.
953                                  */
954                                 if (btrfs_block_can_be_shared(root, eb))
955                                         upper->checked = 0;
956                                 else
957                                         upper->checked = 1;
958
959                                 /*
960                                  * add the block to the pending list if we
961                                  * need to check its backrefs. only blocks
962                                  * at 'cur->level + 1' are added to the
963                                  * tail of the pending list. this guarantees
964                                  * that we check backrefs from lower level
965                                  * blocks to upper level blocks.
966                                  */
967                                 if (!upper->checked &&
968                                     level == cur->level + 1) {
969                                         list_add_tail(&edge->list[UPPER],
970                                                       &list);
971                                 } else
972                                         INIT_LIST_HEAD(&edge->list[UPPER]);
973                         } else {
974                                 upper = rb_entry(rb_node, struct backref_node,
975                                                  rb_node);
976                                 BUG_ON(!upper->checked);
977                                 INIT_LIST_HEAD(&edge->list[UPPER]);
978                                 if (!upper->owner)
979                                         upper->owner = btrfs_header_owner(eb);
980                         }
981                         list_add_tail(&edge->list[LOWER], &lower->upper);
982                         edge->node[LOWER] = lower;
983                         edge->node[UPPER] = upper;
984
985                         if (rb_node)
986                                 break;
987                         lower = upper;
988                         upper = NULL;
989                 }
990                 btrfs_release_path(path2);
991 next:
992                 if (ptr < end) {
993                         ptr += btrfs_extent_inline_ref_size(key.type);
994                         if (ptr >= end) {
995                                 WARN_ON(ptr > end);
996                                 ptr = 0;
997                                 end = 0;
998                         }
999                 }
1000                 if (ptr >= end)
1001                         path1->slots[0]++;
1002         }
1003         btrfs_release_path(path1);
1004
1005         cur->checked = 1;
1006         WARN_ON(exist);
1007
1008         /* the pending list isn't empty, take the first block to process */
1009         if (!list_empty(&list)) {
1010                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1011                 list_del_init(&edge->list[UPPER]);
1012                 cur = edge->node[UPPER];
1013                 goto again;
1014         }
1015
1016         /*
1017          * everything went well; connect the backref nodes and insert them
1018          * into the cache.
1019          */
1020         BUG_ON(!node->checked);
1021         cowonly = node->cowonly;
1022         if (!cowonly) {
1023                 rb_node = tree_insert(&cache->rb_root, node->bytenr,
1024                                       &node->rb_node);
1025                 if (rb_node)
1026                         backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1027                 list_add_tail(&node->lower, &cache->leaves);
1028         }
1029
1030         list_for_each_entry(edge, &node->upper, list[LOWER])
1031                 list_add_tail(&edge->list[UPPER], &list);
1032
1033         while (!list_empty(&list)) {
1034                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1035                 list_del_init(&edge->list[UPPER]);
1036                 upper = edge->node[UPPER];
1037                 if (upper->detached) {
1038                         list_del(&edge->list[LOWER]);
1039                         lower = edge->node[LOWER];
1040                         free_backref_edge(cache, edge);
1041                         if (list_empty(&lower->upper))
1042                                 list_add(&lower->list, &useless);
1043                         continue;
1044                 }
1045
1046                 if (!RB_EMPTY_NODE(&upper->rb_node)) {
1047                         if (upper->lowest) {
1048                                 list_del_init(&upper->lower);
1049                                 upper->lowest = 0;
1050                         }
1051
1052                         list_add_tail(&edge->list[UPPER], &upper->lower);
1053                         continue;
1054                 }
1055
1056                 BUG_ON(!upper->checked);
1057                 BUG_ON(cowonly != upper->cowonly);
1058                 if (!cowonly) {
1059                         rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1060                                               &upper->rb_node);
1061                         if (rb_node)
1062                                 backref_tree_panic(rb_node, -EEXIST,
1063                                                    upper->bytenr);
1064                 }
1065
1066                 list_add_tail(&edge->list[UPPER], &upper->lower);
1067
1068                 list_for_each_entry(edge, &upper->upper, list[LOWER])
1069                         list_add_tail(&edge->list[UPPER], &list);
1070         }
1071         /*
1072          * process useless backref nodes. backref nodes for tree leaves
1073          * are deleted from the cache. backref nodes for upper level
1074          * tree blocks are left in the cache to avoid unnecessary backref
1075          * lookup.
1076          */
1077         while (!list_empty(&useless)) {
1078                 upper = list_entry(useless.next, struct backref_node, list);
1079                 list_del_init(&upper->list);
1080                 BUG_ON(!list_empty(&upper->upper));
1081                 if (upper == node)
1082                         node = NULL;
1083                 if (upper->lowest) {
1084                         list_del_init(&upper->lower);
1085                         upper->lowest = 0;
1086                 }
1087                 while (!list_empty(&upper->lower)) {
1088                         edge = list_entry(upper->lower.next,
1089                                           struct backref_edge, list[UPPER]);
1090                         list_del(&edge->list[UPPER]);
1091                         list_del(&edge->list[LOWER]);
1092                         lower = edge->node[LOWER];
1093                         free_backref_edge(cache, edge);
1094
1095                         if (list_empty(&lower->upper))
1096                                 list_add(&lower->list, &useless);
1097                 }
1098                 __mark_block_processed(rc, upper);
1099                 if (upper->level > 0) {
1100                         list_add(&upper->list, &cache->detached);
1101                         upper->detached = 1;
1102                 } else {
1103                         rb_erase(&upper->rb_node, &cache->rb_root);
1104                         free_backref_node(cache, upper);
1105                 }
1106         }
1107 out:
1108         btrfs_free_path(path1);
1109         btrfs_free_path(path2);
1110         if (err) {
1111                 while (!list_empty(&useless)) {
1112                         lower = list_entry(useless.next,
1113                                            struct backref_node, upper);
1114                         list_del_init(&lower->upper);
1115                 }
1116                 upper = node;
1117                 INIT_LIST_HEAD(&list);
1118                 while (upper) {
1119                         if (RB_EMPTY_NODE(&upper->rb_node)) {
1120                                 list_splice_tail(&upper->upper, &list);
1121                                 free_backref_node(cache, upper);
1122                         }
1123
1124                         if (list_empty(&list))
1125                                 break;
1126
1127                         edge = list_entry(list.next, struct backref_edge,
1128                                           list[LOWER]);
1129                         list_del(&edge->list[LOWER]);
1130                         upper = edge->node[UPPER];
1131                         free_backref_edge(cache, edge);
1132                 }
1133                 return ERR_PTR(err);
1134         }
1135         BUG_ON(node && node->detached);
1136         return node;
1137 }
1138
1139 /*
1140  * helper to add a backref node for the newly created snapshot.
1141  * the backref node is created by cloning the backref node that
1142  * corresponds to the root of the source tree
1143  */
1144 static int clone_backref_node(struct btrfs_trans_handle *trans,
1145                               struct reloc_control *rc,
1146                               struct btrfs_root *src,
1147                               struct btrfs_root *dest)
1148 {
1149         struct btrfs_root *reloc_root = src->reloc_root;
1150         struct backref_cache *cache = &rc->backref_cache;
1151         struct backref_node *node = NULL;
1152         struct backref_node *new_node;
1153         struct backref_edge *edge;
1154         struct backref_edge *new_edge;
1155         struct rb_node *rb_node;
1156
1157         if (cache->last_trans > 0)
1158                 update_backref_cache(trans, cache);
1159
1160         rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1161         if (rb_node) {
1162                 node = rb_entry(rb_node, struct backref_node, rb_node);
1163                 if (node->detached)
1164                         node = NULL;
1165                 else
1166                         BUG_ON(node->new_bytenr != reloc_root->node->start);
1167         }
1168
1169         if (!node) {
1170                 rb_node = tree_search(&cache->rb_root,
1171                                       reloc_root->commit_root->start);
1172                 if (rb_node) {
1173                         node = rb_entry(rb_node, struct backref_node,
1174                                         rb_node);
1175                         BUG_ON(node->detached);
1176                 }
1177         }
1178
1179         if (!node)
1180                 return 0;
1181
1182         new_node = alloc_backref_node(cache);
1183         if (!new_node)
1184                 return -ENOMEM;
1185
1186         new_node->bytenr = dest->node->start;
1187         new_node->level = node->level;
1188         new_node->lowest = node->lowest;
1189         new_node->checked = 1;
1190         new_node->root = dest;
1191
1192         if (!node->lowest) {
1193                 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1194                         new_edge = alloc_backref_edge(cache);
1195                         if (!new_edge)
1196                                 goto fail;
1197
1198                         new_edge->node[UPPER] = new_node;
1199                         new_edge->node[LOWER] = edge->node[LOWER];
1200                         list_add_tail(&new_edge->list[UPPER],
1201                                       &new_node->lower);
1202                 }
1203         } else {
1204                 list_add_tail(&new_node->lower, &cache->leaves);
1205         }
1206
1207         rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1208                               &new_node->rb_node);
1209         if (rb_node)
1210                 backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1211
1212         if (!new_node->lowest) {
1213                 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1214                         list_add_tail(&new_edge->list[LOWER],
1215                                       &new_edge->node[LOWER]->upper);
1216                 }
1217         }
1218         return 0;
1219 fail:
1220         while (!list_empty(&new_node->lower)) {
1221                 new_edge = list_entry(new_node->lower.next,
1222                                       struct backref_edge, list[UPPER]);
1223                 list_del(&new_edge->list[UPPER]);
1224                 free_backref_edge(cache, new_edge);
1225         }
1226         free_backref_node(cache, new_node);
1227         return -ENOMEM;
1228 }
1229
1230 /*
1231  * helper to add 'address of tree root -> reloc tree' mapping
1232  */
1233 static int __must_check __add_reloc_root(struct btrfs_root *root)
1234 {
1235         struct rb_node *rb_node;
1236         struct mapping_node *node;
1237         struct reloc_control *rc = root->fs_info->reloc_ctl;
1238
1239         node = kmalloc(sizeof(*node), GFP_NOFS);
1240         if (!node)
1241                 return -ENOMEM;
1242
1243         node->bytenr = root->node->start;
1244         node->data = root;
1245
1246         spin_lock(&rc->reloc_root_tree.lock);
1247         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1248                               node->bytenr, &node->rb_node);
1249         spin_unlock(&rc->reloc_root_tree.lock);
1250         if (rb_node) {
1251                 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1252                             "for start=%llu while inserting into relocation "
1253                             "tree\n", node->bytenr);
1254                 kfree(node);
1255                 return -EEXIST;
1256         }
1257
1258         list_add_tail(&root->root_list, &rc->reloc_roots);
1259         return 0;
1260 }
1261
1262 /*
1263  * helper to update/delete the 'address of tree root -> reloc tree'
1264  * mapping
1265  */
1266 static int __update_reloc_root(struct btrfs_root *root, int del)
1267 {
1268         struct rb_node *rb_node;
1269         struct mapping_node *node = NULL;
1270         struct reloc_control *rc = root->fs_info->reloc_ctl;
1271
1272         spin_lock(&rc->reloc_root_tree.lock);
1273         rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1274                               root->commit_root->start);
1275         if (rb_node) {
1276                 node = rb_entry(rb_node, struct mapping_node, rb_node);
1277                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1278         }
1279         spin_unlock(&rc->reloc_root_tree.lock);
1280
1281         if (!node)
1282                 return 0;
1283         BUG_ON((struct btrfs_root *)node->data != root);
1284
1285         if (!del) {
1286                 spin_lock(&rc->reloc_root_tree.lock);
1287                 node->bytenr = root->node->start;
1288                 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1289                                       node->bytenr, &node->rb_node);
1290                 spin_unlock(&rc->reloc_root_tree.lock);
1291                 if (rb_node)
1292                         backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1293         } else {
1294                 spin_lock(&root->fs_info->trans_lock);
1295                 list_del_init(&root->root_list);
1296                 spin_unlock(&root->fs_info->trans_lock);
1297                 kfree(node);
1298         }
1299         return 0;
1300 }
1301
1302 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1303                                         struct btrfs_root *root, u64 objectid)
1304 {
1305         struct btrfs_root *reloc_root;
1306         struct extent_buffer *eb;
1307         struct btrfs_root_item *root_item;
1308         struct btrfs_key root_key;
1309         int ret;
1310
1311         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1312         BUG_ON(!root_item);
1313
1314         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1315         root_key.type = BTRFS_ROOT_ITEM_KEY;
1316         root_key.offset = objectid;
1317
1318         if (root->root_key.objectid == objectid) {
1319                 /* called by btrfs_init_reloc_root */
1320                 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1321                                       BTRFS_TREE_RELOC_OBJECTID);
1322                 BUG_ON(ret);
1323
1324                 btrfs_set_root_last_snapshot(&root->root_item,
1325                                              trans->transid - 1);
1326         } else {
1327                 /*
1328                  * called by btrfs_reloc_post_snapshot_hook.
1329                  * the source tree is a reloc tree; all tree blocks
1330                  * modified after it was created have the RELOC flag
1331                  * set in their headers, so it's OK not to update
1332                  * the 'last_snapshot'.
1333                  */
1334                 ret = btrfs_copy_root(trans, root, root->node, &eb,
1335                                       BTRFS_TREE_RELOC_OBJECTID);
1336                 BUG_ON(ret);
1337         }
1338
1339         memcpy(root_item, &root->root_item, sizeof(*root_item));
1340         btrfs_set_root_bytenr(root_item, eb->start);
1341         btrfs_set_root_level(root_item, btrfs_header_level(eb));
1342         btrfs_set_root_generation(root_item, trans->transid);
1343
1344         if (root->root_key.objectid == objectid) {
1345                 btrfs_set_root_refs(root_item, 0);
1346                 memset(&root_item->drop_progress, 0,
1347                        sizeof(struct btrfs_disk_key));
1348                 root_item->drop_level = 0;
1349         }
1350
1351         btrfs_tree_unlock(eb);
1352         free_extent_buffer(eb);
1353
1354         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
1355                                 &root_key, root_item);
1356         BUG_ON(ret);
1357         kfree(root_item);
1358
1359         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
1360                                                  &root_key);
1361         BUG_ON(IS_ERR(reloc_root));
1362         reloc_root->last_trans = trans->transid;
1363         return reloc_root;
1364 }
1365
1366 /*
1367  * create a reloc tree for a given fs tree. a reloc tree is just a
1368  * snapshot of the fs tree with a special root objectid.
1369  */
1370 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1371                           struct btrfs_root *root)
1372 {
1373         struct btrfs_root *reloc_root;
1374         struct reloc_control *rc = root->fs_info->reloc_ctl;
1375         int clear_rsv = 0;
1376         int ret;
1377
1378         if (root->reloc_root) {
1379                 reloc_root = root->reloc_root;
1380                 reloc_root->last_trans = trans->transid;
1381                 return 0;
1382         }
1383
1384         if (!rc || !rc->create_reloc_tree ||
1385             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1386                 return 0;
1387
1388         if (!trans->block_rsv) {
1389                 trans->block_rsv = rc->block_rsv;
1390                 clear_rsv = 1;
1391         }
1392         reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1393         if (clear_rsv)
1394                 trans->block_rsv = NULL;
1395
1396         ret = __add_reloc_root(reloc_root);
1397         BUG_ON(ret < 0);
1398         root->reloc_root = reloc_root;
1399         return 0;
1400 }
1401
1402 /*
1403  * update root item of reloc tree
1404  */
1405 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1406                             struct btrfs_root *root)
1407 {
1408         struct btrfs_root *reloc_root;
1409         struct btrfs_root_item *root_item;
1410         int del = 0;
1411         int ret;
1412
1413         if (!root->reloc_root)
1414                 goto out;
1415
1416         reloc_root = root->reloc_root;
1417         root_item = &reloc_root->root_item;
1418
1419         if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1420             btrfs_root_refs(root_item) == 0) {
1421                 root->reloc_root = NULL;
1422                 del = 1;
1423         }
1424
1425         __update_reloc_root(reloc_root, del);
1426
1427         if (reloc_root->commit_root != reloc_root->node) {
1428                 btrfs_set_root_node(root_item, reloc_root->node);
1429                 free_extent_buffer(reloc_root->commit_root);
1430                 reloc_root->commit_root = btrfs_root_node(reloc_root);
1431         }
1432
1433         ret = btrfs_update_root(trans, root->fs_info->tree_root,
1434                                 &reloc_root->root_key, root_item);
1435         BUG_ON(ret);
1436
1437 out:
1438         return 0;
1439 }
1440
1441 /*
1442  * helper to find first cached inode with inode number >= objectid
1443  * in a subvolume
1444  */
1445 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1446 {
1447         struct rb_node *node;
1448         struct rb_node *prev;
1449         struct btrfs_inode *entry;
1450         struct inode *inode;
1451
1452         spin_lock(&root->inode_lock);
1453 again:
1454         node = root->inode_tree.rb_node;
1455         prev = NULL;
1456         while (node) {
1457                 prev = node;
1458                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1459
1460                 if (objectid < btrfs_ino(&entry->vfs_inode))
1461                         node = node->rb_left;
1462                 else if (objectid > btrfs_ino(&entry->vfs_inode))
1463                         node = node->rb_right;
1464                 else
1465                         break;
1466         }
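             /*
              * no exact match: step forward from the last node visited
              * to find the first inode with inode number >= objectid.
              */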
1467         if (!node) {
1468                 while (prev) {
1469                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
1470                         if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1471                                 node = prev;
1472                                 break;
1473                         }
1474                         prev = rb_next(prev);
1475                 }
1476         }
1477         while (node) {
1478                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1479                 inode = igrab(&entry->vfs_inode);
1480                 if (inode) {
1481                         spin_unlock(&root->inode_lock);
1482                         return inode;
1483                 }
1484
1485                 objectid = btrfs_ino(&entry->vfs_inode) + 1;
1486                 if (cond_resched_lock(&root->inode_lock))
1487                         goto again;
1488
1489                 node = rb_next(node);
1490         }
1491         spin_unlock(&root->inode_lock);
1492         return NULL;
1493 }
1494
1495 static int in_block_group(u64 bytenr,
1496                           struct btrfs_block_group_cache *block_group)
1497 {
1498         if (bytenr >= block_group->key.objectid &&
1499             bytenr < block_group->key.objectid + block_group->key.offset)
1500                 return 1;
1501         return 0;
1502 }
1503
1504 /*
1505  * get new location of data
1506  */
1507 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1508                             u64 bytenr, u64 num_bytes)
1509 {
1510         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1511         struct btrfs_path *path;
1512         struct btrfs_file_extent_item *fi;
1513         struct extent_buffer *leaf;
1514         int ret;
1515
1516         path = btrfs_alloc_path();
1517         if (!path)
1518                 return -ENOMEM;
1519
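             /*
              * file offsets in the relocation inode are disk bytenrs
              * minus index_cnt (the start of the block group being
              * relocated), so translate bytenr before the lookup.
              */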
1520         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1521         ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1522                                        bytenr, 0);
1523         if (ret < 0)
1524                 goto out;
1525         if (ret > 0) {
1526                 ret = -ENOENT;
1527                 goto out;
1528         }
1529
1530         leaf = path->nodes[0];
1531         fi = btrfs_item_ptr(leaf, path->slots[0],
1532                             struct btrfs_file_extent_item);
1533
1534         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1535                btrfs_file_extent_compression(leaf, fi) ||
1536                btrfs_file_extent_encryption(leaf, fi) ||
1537                btrfs_file_extent_other_encoding(leaf, fi));
1538
1539         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1540                 ret = 1;
1541                 goto out;
1542         }
1543
1544         *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1545         ret = 0;
1546 out:
1547         btrfs_free_path(path);
1548         return ret;
1549 }
1550
1551 /*
1552  * update file extent items in the tree leaf to point to
1553  * the new locations.
1554  */
1555 static noinline_for_stack
1556 int replace_file_extents(struct btrfs_trans_handle *trans,
1557                          struct reloc_control *rc,
1558                          struct btrfs_root *root,
1559                          struct extent_buffer *leaf)
1560 {
1561         struct btrfs_key key;
1562         struct btrfs_file_extent_item *fi;
1563         struct inode *inode = NULL;
1564         u64 parent;
1565         u64 bytenr;
1566         u64 new_bytenr = 0;
1567         u64 num_bytes;
1568         u64 end;
1569         u32 nritems;
1570         u32 i;
1571         int ret;
1572         int first = 1;
1573         int dirty = 0;
1574
1575         if (rc->stage != UPDATE_DATA_PTRS)
1576                 return 0;
1577
1578         /* reloc trees always use full backref */
1579         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1580                 parent = leaf->start;
1581         else
1582                 parent = 0;
1583
1584         nritems = btrfs_header_nritems(leaf);
1585         for (i = 0; i < nritems; i++) {
1586                 cond_resched();
1587                 btrfs_item_key_to_cpu(leaf, &key, i);
1588                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1589                         continue;
1590                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1591                 if (btrfs_file_extent_type(leaf, fi) ==
1592                     BTRFS_FILE_EXTENT_INLINE)
1593                         continue;
1594                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1595                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1596                 if (bytenr == 0)
1597                         continue;
1598                 if (!in_block_group(bytenr, rc->block_group))
1599                         continue;
1600
1601                 /*
1602                  * if we are modifying a block in the fs tree, wait for readpage
1603                  * to complete and drop the extent cache
1604                  */
1605                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1606                         if (first) {
1607                                 inode = find_next_inode(root, key.objectid);
1608                                 first = 0;
1609                         } else if (inode && btrfs_ino(inode) < key.objectid) {
1610                                 btrfs_add_delayed_iput(inode);
1611                                 inode = find_next_inode(root, key.objectid);
1612                         }
1613                         if (inode && btrfs_ino(inode) == key.objectid) {
1614                                 end = key.offset +
1615                                       btrfs_file_extent_num_bytes(leaf, fi);
1616                                 WARN_ON(!IS_ALIGNED(key.offset,
1617                                                     root->sectorsize));
1618                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1619                                 end--;
1620                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1621                                                       key.offset, end);
1622                                 if (!ret)
1623                                         continue;
1624
1625                                 btrfs_drop_extent_cache(inode, key.offset, end,
1626                                                         1);
1627                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1628                                               key.offset, end);
1629                         }
1630                 }
1631
1632                 ret = get_new_location(rc->data_inode, &new_bytenr,
1633                                        bytenr, num_bytes);
1634                 if (ret > 0) {
1635                         WARN_ON(1);
1636                         continue;
1637                 }
1638                 BUG_ON(ret < 0);
1639
1640                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1641                 dirty = 1;
1642
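                     /*
                      * rewind key.offset to the logical start of the extent
                      * so it matches the offset stored in the data backref,
                      * then add a ref for the new data extent and drop the
                      * ref on the old one.
                      */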
1643                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1644                 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1645                                            num_bytes, parent,
1646                                            btrfs_header_owner(leaf),
1647                                            key.objectid, key.offset, 1);
1648                 BUG_ON(ret);
1649
1650                 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1651                                         parent, btrfs_header_owner(leaf),
1652                                         key.objectid, key.offset, 1);
1653                 BUG_ON(ret);
1654         }
1655         if (dirty)
1656                 btrfs_mark_buffer_dirty(leaf);
1657         if (inode)
1658                 btrfs_add_delayed_iput(inode);
1659         return 0;
1660 }
1661
1662 static noinline_for_stack
1663 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1664                      struct btrfs_path *path, int level)
1665 {
1666         struct btrfs_disk_key key1;
1667         struct btrfs_disk_key key2;
1668         btrfs_node_key(eb, &key1, slot);
1669         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1670         return memcmp(&key1, &key2, sizeof(key1));
1671 }
1672
1673 /*
1674  * try to replace tree blocks in fs tree with the new blocks
1675  * in reloc tree. only tree blocks that haven't been modified since
1676  * the reloc tree was created can be replaced.
1677  *
1678  * if a block was replaced, level of the block + 1 is returned.
1679  * if no block got replaced, 0 is returned. if there are other
1680  * errors, a negative error number is returned.
1681  */
1682 static noinline_for_stack
1683 int replace_path(struct btrfs_trans_handle *trans,
1684                  struct btrfs_root *dest, struct btrfs_root *src,
1685                  struct btrfs_path *path, struct btrfs_key *next_key,
1686                  int lowest_level, int max_level)
1687 {
1688         struct extent_buffer *eb;
1689         struct extent_buffer *parent;
1690         struct btrfs_key key;
1691         u64 old_bytenr;
1692         u64 new_bytenr;
1693         u64 old_ptr_gen;
1694         u64 new_ptr_gen;
1695         u64 last_snapshot;
1696         u32 blocksize;
1697         int cow = 0;
1698         int level;
1699         int ret;
1700         int slot;
1701
1702         BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1703         BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1704
1705         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1706 again:
1707         slot = path->slots[lowest_level];
1708         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1709
1710         eb = btrfs_lock_root_node(dest);
1711         btrfs_set_lock_blocking(eb);
1712         level = btrfs_header_level(eb);
1713
1714         if (level < lowest_level) {
1715                 btrfs_tree_unlock(eb);
1716                 free_extent_buffer(eb);
1717                 return 0;
1718         }
1719
1720         if (cow) {
1721                 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1722                 BUG_ON(ret);
1723         }
1724         btrfs_set_lock_blocking(eb);
1725
1726         if (next_key) {
1727                 next_key->objectid = (u64)-1;
1728                 next_key->type = (u8)-1;
1729                 next_key->offset = (u64)-1;
1730         }
1731
1732         parent = eb;
1733         while (1) {
1734                 level = btrfs_header_level(parent);
1735                 BUG_ON(level < lowest_level);
1736
1737                 ret = btrfs_bin_search(parent, &key, level, &slot);
1738                 if (ret && slot > 0)
1739                         slot--;
1740
1741                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1742                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1743
1744                 old_bytenr = btrfs_node_blockptr(parent, slot);
1745                 blocksize = btrfs_level_size(dest, level - 1);
1746                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1747
1748                 if (level <= max_level) {
1749                         eb = path->nodes[level];
1750                         new_bytenr = btrfs_node_blockptr(eb,
1751                                                         path->slots[level]);
1752                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1753                                                         path->slots[level]);
1754                 } else {
1755                         new_bytenr = 0;
1756                         new_ptr_gen = 0;
1757                 }
1758
1759                 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1760                         WARN_ON(1);
1761                         ret = level;
1762                         break;
1763                 }
1764
1765                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1766                     memcmp_node_keys(parent, slot, path, level)) {
1767                         if (level <= lowest_level) {
1768                                 ret = 0;
1769                                 break;
1770                         }
1771
1772                         eb = read_tree_block(dest, old_bytenr, blocksize,
1773                                              old_ptr_gen);
1774                         if (!eb || !extent_buffer_uptodate(eb)) {
1775                                 ret = (!eb) ? -ENOMEM : -EIO;
1776                                 free_extent_buffer(eb);
1777                                 return ret;
1778                         }
1779                         btrfs_tree_lock(eb);
1780                         if (cow) {
1781                                 ret = btrfs_cow_block(trans, dest, eb, parent,
1782                                                       slot, &eb);
1783                                 BUG_ON(ret);
1784                         }
1785                         btrfs_set_lock_blocking(eb);
1786
1787                         btrfs_tree_unlock(parent);
1788                         free_extent_buffer(parent);
1789
1790                         parent = eb;
1791                         continue;
1792                 }
1793
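                     /*
                      * the block really needs to be replaced, but the path
                      * in the fs tree has not been COWed yet.  restart the
                      * walk with COW enabled so the swap happens in blocks
                      * owned by this transaction.
                      */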
1794                 if (!cow) {
1795                         btrfs_tree_unlock(parent);
1796                         free_extent_buffer(parent);
1797                         cow = 1;
1798                         goto again;
1799                 }
1800
1801                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1802                                       path->slots[level]);
1803                 btrfs_release_path(path);
1804
1805                 path->lowest_level = level;
1806                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1807                 path->lowest_level = 0;
1808                 BUG_ON(ret);
1809
1810                 /*
1811                  * swap blocks in fs tree and reloc tree.
1812                  */
1813                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1814                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1815                 btrfs_mark_buffer_dirty(parent);
1816
1817                 btrfs_set_node_blockptr(path->nodes[level],
1818                                         path->slots[level], old_bytenr);
1819                 btrfs_set_node_ptr_generation(path->nodes[level],
1820                                               path->slots[level], old_ptr_gen);
1821                 btrfs_mark_buffer_dirty(path->nodes[level]);
1822
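                     /*
                      * after the swap the fs tree references new_bytenr and
                      * the reloc tree references old_bytenr: add those two
                      * backrefs first, then drop the two stale ones.
                      */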
1823                 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1824                                         path->nodes[level]->start,
1825                                         src->root_key.objectid, level - 1, 0,
1826                                         1);
1827                 BUG_ON(ret);
1828                 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1829                                         0, dest->root_key.objectid, level - 1,
1830                                         0, 1);
1831                 BUG_ON(ret);
1832
1833                 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1834                                         path->nodes[level]->start,
1835                                         src->root_key.objectid, level - 1, 0,
1836                                         1);
1837                 BUG_ON(ret);
1838
1839                 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1840                                         0, dest->root_key.objectid, level - 1,
1841                                         0, 1);
1842                 BUG_ON(ret);
1843
1844                 btrfs_unlock_up_safe(path, 0);
1845
1846                 ret = level;
1847                 break;
1848         }
1849         btrfs_tree_unlock(parent);
1850         free_extent_buffer(parent);
1851         return ret;
1852 }
1853
1854 /*
1855  * helper to find next relocated block in reloc tree
1856  */
1857 static noinline_for_stack
1858 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1859                        int *level)
1860 {
1861         struct extent_buffer *eb;
1862         int i;
1863         u64 last_snapshot;
1864         u32 nritems;
1865
1866         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1867
1868         for (i = 0; i < *level; i++) {
1869                 free_extent_buffer(path->nodes[i]);
1870                 path->nodes[i] = NULL;
1871         }
1872
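             /*
              * advance to the next slot whose pointer generation is newer
              * than the last snapshot; older pointers are still shared
              * with the fs tree and were not relocated.
              */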
1873         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1874                 eb = path->nodes[i];
1875                 nritems = btrfs_header_nritems(eb);
1876                 while (path->slots[i] + 1 < nritems) {
1877                         path->slots[i]++;
1878                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1879                             last_snapshot)
1880                                 continue;
1881
1882                         *level = i;
1883                         return 0;
1884                 }
1885                 free_extent_buffer(path->nodes[i]);
1886                 path->nodes[i] = NULL;
1887         }
1888         return 1;
1889 }
1890
1891 /*
1892  * walk down reloc tree to find relocated block of lowest level
1893  */
1894 static noinline_for_stack
1895 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1896                          int *level)
1897 {
1898         struct extent_buffer *eb = NULL;
1899         int i;
1900         u64 bytenr;
1901         u64 ptr_gen = 0;
1902         u64 last_snapshot;
1903         u32 blocksize;
1904         u32 nritems;
1905
1906         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1907
1908         for (i = *level; i > 0; i--) {
1909                 eb = path->nodes[i];
1910                 nritems = btrfs_header_nritems(eb);
1911                 while (path->slots[i] < nritems) {
1912                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1913                         if (ptr_gen > last_snapshot)
1914                                 break;
1915                         path->slots[i]++;
1916                 }
1917                 if (path->slots[i] >= nritems) {
1918                         if (i == *level)
1919                                 break;
1920                         *level = i + 1;
1921                         return 0;
1922                 }
1923                 if (i == 1) {
1924                         *level = i;
1925                         return 0;
1926                 }
1927
1928                 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1929                 blocksize = btrfs_level_size(root, i - 1);
1930                 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1931                 if (!eb || !extent_buffer_uptodate(eb)) {
1932                         free_extent_buffer(eb);
1933                         return -EIO;
1934                 }
1935                 BUG_ON(btrfs_header_level(eb) != i - 1);
1936                 path->nodes[i - 1] = eb;
1937                 path->slots[i - 1] = 0;
1938         }
1939         return 1;
1940 }
1941
1942 /*
1943  * invalidate extent cache for file extents whose keys are in the
1944  * range [min_key, max_key)
1945  */
1946 static int invalidate_extent_cache(struct btrfs_root *root,
1947                                    struct btrfs_key *min_key,
1948                                    struct btrfs_key *max_key)
1949 {
1950         struct inode *inode = NULL;
1951         u64 objectid;
1952         u64 start, end;
1953         u64 ino;
1954
1955         objectid = min_key->objectid;
1956         while (1) {
1957                 cond_resched();
1958                 iput(inode);
1959
1960                 if (objectid > max_key->objectid)
1961                         break;
1962
1963                 inode = find_next_inode(root, objectid);
1964                 if (!inode)
1965                         break;
1966                 ino = btrfs_ino(inode);
1967
1968                 if (ino > max_key->objectid) {
1969                         iput(inode);
1970                         break;
1971                 }
1972
1973                 objectid = ino + 1;
1974                 if (!S_ISREG(inode->i_mode))
1975                         continue;
1976
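                     /*
                      * only the first and last inodes in [min_key, max_key)
                      * can be partially covered; clamp start/end for those,
                      * every inode in between is invalidated completely.
                      */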
1977                 if (unlikely(min_key->objectid == ino)) {
1978                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1979                                 continue;
1980                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1981                                 start = 0;
1982                         else {
1983                                 start = min_key->offset;
1984                                 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1985                         }
1986                 } else {
1987                         start = 0;
1988                 }
1989
1990                 if (unlikely(max_key->objectid == ino)) {
1991                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1992                                 continue;
1993                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1994                                 end = (u64)-1;
1995                         } else {
1996                                 if (max_key->offset == 0)
1997                                         continue;
1998                                 end = max_key->offset;
1999                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
2000                                 end--;
2001                         }
2002                 } else {
2003                         end = (u64)-1;
2004                 }
2005
2006                 /* the lock_extent waits for readpage to complete */
2007                 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2008                 btrfs_drop_extent_cache(inode, start, end, 1);
2009                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2010         }
2011         return 0;
2012 }
2013
2014 static int find_next_key(struct btrfs_path *path, int level,
2015                          struct btrfs_key *key)
2016
2017 {
2018         while (level < BTRFS_MAX_LEVEL) {
2019                 if (!path->nodes[level])
2020                         break;
2021                 if (path->slots[level] + 1 <
2022                     btrfs_header_nritems(path->nodes[level])) {
2023                         btrfs_node_key_to_cpu(path->nodes[level], key,
2024                                               path->slots[level] + 1);
2025                         return 0;
2026                 }
2027                 level++;
2028         }
2029         return 1;
2030 }
2031
2032 /*
2033  * merge the relocated tree blocks in reloc tree with corresponding
2034  * fs tree.
2035  */
2036 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2037                                                struct btrfs_root *root)
2038 {
2039         LIST_HEAD(inode_list);
2040         struct btrfs_key key;
2041         struct btrfs_key next_key;
2042         struct btrfs_trans_handle *trans;
2043         struct btrfs_root *reloc_root;
2044         struct btrfs_root_item *root_item;
2045         struct btrfs_path *path;
2046         struct extent_buffer *leaf;
2047         int level;
2048         int max_level;
2049         int replaced = 0;
2050         int ret;
2051         int err = 0;
2052         u32 min_reserved;
2053
2054         path = btrfs_alloc_path();
2055         if (!path)
2056                 return -ENOMEM;
2057         path->reada = 1;
2058
2059         reloc_root = root->reloc_root;
2060         root_item = &reloc_root->root_item;
2061
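             /*
              * if drop_progress is zero the merge starts from the reloc
              * tree root, otherwise resume from the key saved in
              * drop_progress at the recorded drop_level.
              */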
2062         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2063                 level = btrfs_root_level(root_item);
2064                 extent_buffer_get(reloc_root->node);
2065                 path->nodes[level] = reloc_root->node;
2066                 path->slots[level] = 0;
2067         } else {
2068                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2069
2070                 level = root_item->drop_level;
2071                 BUG_ON(level == 0);
2072                 path->lowest_level = level;
2073                 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2074                 path->lowest_level = 0;
2075                 if (ret < 0) {
2076                         btrfs_free_path(path);
2077                         return ret;
2078                 }
2079
2080                 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2081                                       path->slots[level]);
2082                 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2083
2084                 btrfs_unlock_up_safe(path, 0);
2085         }
2086
2087         min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2088         memset(&next_key, 0, sizeof(next_key));
2089
2090         while (1) {
2091                 trans = btrfs_start_transaction(root, 0);
2092                 BUG_ON(IS_ERR(trans));
2093                 trans->block_rsv = rc->block_rsv;
2094
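                     /*
                      * make sure the relocation reserve can cover one merge
                      * step; the only error tolerated here is -EAGAIN, which
                      * is handled by committing the transaction and retrying.
                      */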
2095                 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2096                                              BTRFS_RESERVE_FLUSH_ALL);
2097                 if (ret) {
2098                         BUG_ON(ret != -EAGAIN);
2099                         ret = btrfs_commit_transaction(trans, root);
2100                         BUG_ON(ret);
2101                         continue;
2102                 }
2103
2104                 replaced = 0;
2105                 max_level = level;
2106
2107                 ret = walk_down_reloc_tree(reloc_root, path, &level);
2108                 if (ret < 0) {
2109                         err = ret;
2110                         goto out;
2111                 }
2112                 if (ret > 0)
2113                         break;
2114
2115                 if (!find_next_key(path, level, &key) &&
2116                     btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2117                         ret = 0;
2118                 } else {
2119                         ret = replace_path(trans, root, reloc_root, path,
2120                                            &next_key, level, max_level);
2121                 }
2122                 if (ret < 0) {
2123                         err = ret;
2124                         goto out;
2125                 }
2126
2127                 if (ret > 0) {
2128                         level = ret;
2129                         btrfs_node_key_to_cpu(path->nodes[level], &key,
2130                                               path->slots[level]);
2131                         replaced = 1;
2132                 }
2133
2134                 ret = walk_up_reloc_tree(reloc_root, path, &level);
2135                 if (ret > 0)
2136                         break;
2137
2138                 BUG_ON(level == 0);
2139                 /*
2140                  * save the merging progress in the drop_progress.
2141                  * this is OK since root refs == 1 in this case.
2142                  */
2143                 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2144                                path->slots[level]);
2145                 root_item->drop_level = level;
2146
2147                 btrfs_end_transaction_throttle(trans, root);
2148
2149                 btrfs_btree_balance_dirty(root);
2150
2151                 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2152                         invalidate_extent_cache(root, &key, &next_key);
2153         }
2154
2155         /*
2156          * handle the case where only one block in the fs tree needs
2157          * to be relocated and that block is the tree root.
2158          */
2159         leaf = btrfs_lock_root_node(root);
2160         ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2161         btrfs_tree_unlock(leaf);
2162         free_extent_buffer(leaf);
2163         if (ret < 0)
2164                 err = ret;
2165 out:
2166         btrfs_free_path(path);
2167
2168         if (err == 0) {
2169                 memset(&root_item->drop_progress, 0,
2170                        sizeof(root_item->drop_progress));
2171                 root_item->drop_level = 0;
2172                 btrfs_set_root_refs(root_item, 0);
2173                 btrfs_update_reloc_root(trans, root);
2174         }
2175
2176         btrfs_end_transaction_throttle(trans, root);
2177
2178         btrfs_btree_balance_dirty(root);
2179
2180         if (replaced && rc->stage == UPDATE_DATA_PTRS)
2181                 invalidate_extent_cache(root, &key, &next_key);
2182
2183         return err;
2184 }
2185
2186 static noinline_for_stack
2187 int prepare_to_merge(struct reloc_control *rc, int err)
2188 {
2189         struct btrfs_root *root = rc->extent_root;
2190         struct btrfs_root *reloc_root;
2191         struct btrfs_trans_handle *trans;
2192         LIST_HEAD(reloc_roots);
2193         u64 num_bytes = 0;
2194         int ret;
2195
2196         mutex_lock(&root->fs_info->reloc_mutex);
2197         rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2198         rc->merging_rsv_size += rc->nodes_relocated * 2;
2199         mutex_unlock(&root->fs_info->reloc_mutex);
2200
2201 again:
2202         if (!err) {
2203                 num_bytes = rc->merging_rsv_size;
2204                 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2205                                           BTRFS_RESERVE_FLUSH_ALL);
2206                 if (ret)
2207                         err = ret;
2208         }
2209
2210         trans = btrfs_join_transaction(rc->extent_root);
2211         if (IS_ERR(trans)) {
2212                 if (!err)
2213                         btrfs_block_rsv_release(rc->extent_root,
2214                                                 rc->block_rsv, num_bytes);
2215                 return PTR_ERR(trans);
2216         }
2217
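             /*
              * merging_rsv_size may have grown while we reserved space
              * and joined the transaction; if so, release this
              * reservation and retry with the larger size.
              */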
2218         if (!err) {
2219                 if (num_bytes != rc->merging_rsv_size) {
2220                         btrfs_end_transaction(trans, rc->extent_root);
2221                         btrfs_block_rsv_release(rc->extent_root,
2222                                                 rc->block_rsv, num_bytes);
2223                         goto again;
2224                 }
2225         }
2226
2227         rc->merge_reloc_tree = 1;
2228
2229         while (!list_empty(&rc->reloc_roots)) {
2230                 reloc_root = list_entry(rc->reloc_roots.next,
2231                                         struct btrfs_root, root_list);
2232                 list_del_init(&reloc_root->root_list);
2233
2234                 root = read_fs_root(reloc_root->fs_info,
2235                                     reloc_root->root_key.offset);
2236                 BUG_ON(IS_ERR(root));
2237                 BUG_ON(root->reloc_root != reloc_root);
2238
2239                 /*
2240                  * set reference count to 1, so btrfs_recover_relocation
2241                  * knows it should resume merging
2242                  */
2243                 if (!err)
2244                         btrfs_set_root_refs(&reloc_root->root_item, 1);
2245                 btrfs_update_reloc_root(trans, root);
2246
2247                 list_add(&reloc_root->root_list, &reloc_roots);
2248         }
2249
2250         list_splice(&reloc_roots, &rc->reloc_roots);
2251
2252         if (!err)
2253                 btrfs_commit_transaction(trans, rc->extent_root);
2254         else
2255                 btrfs_end_transaction(trans, rc->extent_root);
2256         return err;
2257 }
2258
2259 static noinline_for_stack
2260 void free_reloc_roots(struct list_head *list)
2261 {
2262         struct btrfs_root *reloc_root;
2263
2264         while (!list_empty(list)) {
2265                 reloc_root = list_entry(list->next, struct btrfs_root,
2266                                         root_list);
2267                 __update_reloc_root(reloc_root, 1);
2268                 free_extent_buffer(reloc_root->node);
2269                 free_extent_buffer(reloc_root->commit_root);
2270                 kfree(reloc_root);
2271         }
2272 }
2273
2274 static noinline_for_stack
2275 int merge_reloc_roots(struct reloc_control *rc)
2276 {
2277         struct btrfs_root *root;
2278         struct btrfs_root *reloc_root;
2279         LIST_HEAD(reloc_roots);
2280         int found = 0;
2281         int ret = 0;
2282 again:
2283         root = rc->extent_root;
2284
2285         /*
2286          * this serializes us with btrfs_record_root_in_transaction;
2287          * we have to make sure nobody is in the middle of
2288          * adding their roots to the list while we are
2289          * doing this splice
2290          */
2291         mutex_lock(&root->fs_info->reloc_mutex);
2292         list_splice_init(&rc->reloc_roots, &reloc_roots);
2293         mutex_unlock(&root->fs_info->reloc_mutex);
2294
2295         while (!list_empty(&reloc_roots)) {
2296                 found = 1;
2297                 reloc_root = list_entry(reloc_roots.next,
2298                                         struct btrfs_root, root_list);
2299
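                     /*
                      * refs > 0 means this reloc tree still has to be merged
                      * into its fs tree; refs == 0 means merging already
                      * finished and the tree only needs to be dropped.
                      */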
2300                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2301                         root = read_fs_root(reloc_root->fs_info,
2302                                             reloc_root->root_key.offset);
2303                         BUG_ON(IS_ERR(root));
2304                         BUG_ON(root->reloc_root != reloc_root);
2305
2306                         ret = merge_reloc_root(rc, root);
2307                         if (ret)
2308                                 goto out;
2309                 } else {
2310                         list_del_init(&reloc_root->root_list);
2311                 }
2312                 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2313                 if (ret < 0) {
2314                         if (list_empty(&reloc_root->root_list))
2315                                 list_add_tail(&reloc_root->root_list,
2316                                               &reloc_roots);
2317                         goto out;
2318                 }
2319         }
2320
2321         if (found) {
2322                 found = 0;
2323                 goto again;
2324         }
2325 out:
2326         if (ret) {
2327                 btrfs_std_error(root->fs_info, ret);
2328                 if (!list_empty(&reloc_roots))
2329                         free_reloc_roots(&reloc_roots);
2330         }
2331
2332         BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2333         return ret;
2334 }
2335
2336 static void free_block_list(struct rb_root *blocks)
2337 {
2338         struct tree_block *block;
2339         struct rb_node *rb_node;
2340         while ((rb_node = rb_first(blocks))) {
2341                 block = rb_entry(rb_node, struct tree_block, rb_node);
2342                 rb_erase(rb_node, blocks);
2343                 kfree(block);
2344         }
2345 }
2346
2347 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2348                                       struct btrfs_root *reloc_root)
2349 {
2350         struct btrfs_root *root;
2351
2352         if (reloc_root->last_trans == trans->transid)
2353                 return 0;
2354
2355         root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2356         BUG_ON(IS_ERR(root));
2357         BUG_ON(root->reloc_root != reloc_root);
2358
2359         return btrfs_record_root_in_trans(trans, root);
2360 }
2361
2362 static noinline_for_stack
2363 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2364                                      struct reloc_control *rc,
2365                                      struct backref_node *node,
2366                                      struct backref_edge *edges[], int *nr)
2367 {
2368         struct backref_node *next;
2369         struct btrfs_root *root;
2370         int index = 0;
2371
2372         next = node;
2373         while (1) {
2374                 cond_resched();
2375                 next = walk_up_backref(next, edges, &index);
2376                 root = next->root;
2377                 BUG_ON(!root);
2378                 BUG_ON(!root->ref_cows);
2379
2380                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2381                         record_reloc_root_in_trans(trans, root);
2382                         break;
2383                 }
2384
2385                 btrfs_record_root_in_trans(trans, root);
2386                 root = root->reloc_root;
2387
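                     /*
                      * record the reloc root as the new location of this
                      * backref node the first time we see it, and mark the
                      * block processed.
                      */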
2388                 if (next->new_bytenr != root->node->start) {
2389                         BUG_ON(next->new_bytenr);
2390                         BUG_ON(!list_empty(&next->list));
2391                         next->new_bytenr = root->node->start;
2392                         next->root = root;
2393                         list_add_tail(&next->list,
2394                                       &rc->backref_cache.changed);
2395                         __mark_block_processed(rc, next);
2396                         break;
2397                 }
2398
2399                 WARN_ON(1);
2400                 root = NULL;
2401                 next = walk_down_backref(edges, &index);
2402                 if (!next || next->level <= node->level)
2403                         break;
2404         }
2405         if (!root)
2406                 return NULL;
2407
2408         *nr = index;
2409         next = node;
2410         /* setup backref node path for btrfs_reloc_cow_block */
2411         while (1) {
2412                 rc->backref_cache.path[next->level] = next;
2413                 if (--index < 0)
2414                         break;
2415                 next = edges[index]->node[UPPER];
2416         }
2417         return root;
2418 }
2419
2420 /*
2421  * select a tree root for relocation. return NULL if the block
2422  * is reference counted. we should use do_relocation() in this
2423  * case. return a tree root pointer if the block isn't reference
2424  * counted. return -ENOENT if the block is the root of a reloc tree.
2425  */
2426 static noinline_for_stack
2427 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2428                                    struct backref_node *node)
2429 {
2430         struct backref_node *next;
2431         struct btrfs_root *root;
2432         struct btrfs_root *fs_root = NULL;
2433         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2434         int index = 0;
2435
2436         next = node;
2437         while (1) {
2438                 cond_resched();
2439                 next = walk_up_backref(next, edges, &index);
2440                 root = next->root;
2441                 BUG_ON(!root);
2442
2443                 /* no other choice for a non-reference counted tree */
2444                 if (!root->ref_cows)
2445                         return root;
2446
2447                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2448                         fs_root = root;
2449
2450                 if (next != node)
2451                         return NULL;
2452
2453                 next = walk_down_backref(edges, &index);
2454                 if (!next || next->level <= node->level)
2455                         break;
2456         }
2457
2458         if (!fs_root)
2459                 return ERR_PTR(-ENOENT);
2460         return fs_root;
2461 }
2462
2463 static noinline_for_stack
2464 u64 calcu_metadata_size(struct reloc_control *rc,
2465                         struct backref_node *node, int reserve)
2466 {
2467         struct backref_node *next = node;
2468         struct backref_edge *edge;
2469         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2470         u64 num_bytes = 0;
2471         int index = 0;
2472
2473         BUG_ON(reserve && node->processed);
2474
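             /*
              * walk the backref tree upwards from this node and add up
              * the size of every block that may need to be COWed; the
              * callers reserve or release twice this amount.
              */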
2475         while (next) {
2476                 cond_resched();
2477                 while (1) {
2478                         if (next->processed && (reserve || next != node))
2479                                 break;
2480
2481                         num_bytes += btrfs_level_size(rc->extent_root,
2482                                                       next->level);
2483
2484                         if (list_empty(&next->upper))
2485                                 break;
2486
2487                         edge = list_entry(next->upper.next,
2488                                           struct backref_edge, list[LOWER]);
2489                         edges[index++] = edge;
2490                         next = edge->node[UPPER];
2491                 }
2492                 next = walk_down_backref(edges, &index);
2493         }
2494         return num_bytes;
2495 }
2496
2497 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2498                                   struct reloc_control *rc,
2499                                   struct backref_node *node)
2500 {
2501         struct btrfs_root *root = rc->extent_root;
2502         u64 num_bytes;
2503         int ret;
2504
2505         num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2506
2507         trans->block_rsv = rc->block_rsv;
2508         ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2509                                   BTRFS_RESERVE_FLUSH_ALL);
2510         if (ret) {
2511                 if (ret == -EAGAIN)
2512                         rc->commit_transaction = 1;
2513                 return ret;
2514         }
2515
2516         return 0;
2517 }
2518
2519 static void release_metadata_space(struct reloc_control *rc,
2520                                    struct backref_node *node)
2521 {
2522         u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2523         btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
2524 }
2525
2526 /*
2527  * relocate a tree block, and then update pointers in upper level
2528  * blocks that reference the block to point to the new location.
2529  *
2530  * if called by link_to_upper, the block has already been relocated.
2531  * in that case this function just updates pointers.
2532  */
2533 static int do_relocation(struct btrfs_trans_handle *trans,
2534                          struct reloc_control *rc,
2535                          struct backref_node *node,
2536                          struct btrfs_key *key,
2537                          struct btrfs_path *path, int lowest)
2538 {
2539         struct backref_node *upper;
2540         struct backref_edge *edge;
2541         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2542         struct btrfs_root *root;
2543         struct extent_buffer *eb;
2544         u32 blocksize;
2545         u64 bytenr;
2546         u64 generation;
2547         int nr;
2548         int slot;
2549         int ret;
2550         int err = 0;
2551
2552         BUG_ON(lowest && node->eb);
2553
2554         path->lowest_level = node->level + 1;
2555         rc->backref_cache.path[node->level] = node;
2556         list_for_each_entry(edge, &node->upper, list[LOWER]) {
2557                 cond_resched();
2558
2559                 upper = edge->node[UPPER];
2560                 root = select_reloc_root(trans, rc, upper, edges, &nr);
2561                 BUG_ON(!root);
2562
2563                 if (upper->eb && !upper->locked) {
2564                         if (!lowest) {
2565                                 ret = btrfs_bin_search(upper->eb, key,
2566                                                        upper->level, &slot);
2567                                 BUG_ON(ret);
2568                                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2569                                 if (node->eb->start == bytenr)
2570                                         goto next;
2571                         }
2572                         drop_node_buffer(upper);
2573                 }
2574
2575                 if (!upper->eb) {
2576                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2577                         if (ret < 0) {
2578                                 err = ret;
2579                                 break;
2580                         }
2581                         BUG_ON(ret > 0);
2582
2583                         if (!upper->eb) {
2584                                 upper->eb = path->nodes[upper->level];
2585                                 path->nodes[upper->level] = NULL;
2586                         } else {
2587                                 BUG_ON(upper->eb != path->nodes[upper->level]);
2588                         }
2589
2590                         upper->locked = 1;
2591                         path->locks[upper->level] = 0;
2592
2593                         slot = path->slots[upper->level];
2594                         btrfs_release_path(path);
2595                 } else {
2596                         ret = btrfs_bin_search(upper->eb, key, upper->level,
2597                                                &slot);
2598                         BUG_ON(ret);
2599                 }
2600
2601                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2602                 if (lowest) {
2603                         BUG_ON(bytenr != node->bytenr);
2604                 } else {
2605                         if (node->eb->start == bytenr)
2606                                 goto next;
2607                 }
2608
2609                 blocksize = btrfs_level_size(root, node->level);
2610                 generation = btrfs_node_ptr_generation(upper->eb, slot);
2611                 eb = read_tree_block(root, bytenr, blocksize, generation);
2612                 if (!eb || !extent_buffer_uptodate(eb)) {
2613                         free_extent_buffer(eb);
2614                         err = -EIO;
2615                         goto next;
2616                 }
2617                 btrfs_tree_lock(eb);
2618                 btrfs_set_lock_blocking(eb);
2619
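                     /*
                      * if the block has not been relocated yet, COWing it
                      * here moves it to its new location; the backref cache
                      * path set up above lets btrfs_reloc_cow_block fill in
                      * node->eb.  otherwise just point the parent at the
                      * already relocated copy, add the new backref and drop
                      * the old subtree.
                      */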
2620                 if (!node->eb) {
2621                         ret = btrfs_cow_block(trans, root, eb, upper->eb,
2622                                               slot, &eb);
2623                         btrfs_tree_unlock(eb);
2624                         free_extent_buffer(eb);
2625                         if (ret < 0) {
2626                                 err = ret;
2627                                 goto next;
2628                         }
2629                         BUG_ON(node->eb != eb);
2630                 } else {
2631                         btrfs_set_node_blockptr(upper->eb, slot,
2632                                                 node->eb->start);
2633                         btrfs_set_node_ptr_generation(upper->eb, slot,
2634                                                       trans->transid);
2635                         btrfs_mark_buffer_dirty(upper->eb);
2636
2637                         ret = btrfs_inc_extent_ref(trans, root,
2638                                                 node->eb->start, blocksize,
2639                                                 upper->eb->start,
2640                                                 btrfs_header_owner(upper->eb),
2641                                                 node->level, 0, 1);
2642                         BUG_ON(ret);
2643
2644                         ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2645                         BUG_ON(ret);
2646                 }
2647 next:
2648                 if (!upper->pending)
2649                         drop_node_buffer(upper);
2650                 else
2651                         unlock_node_buffer(upper);
2652                 if (err)
2653                         break;
2654         }
2655
2656         if (!err && node->pending) {
2657                 drop_node_buffer(node);
2658                 list_move_tail(&node->list, &rc->backref_cache.changed);
2659                 node->pending = 0;
2660         }
2661
2662         path->lowest_level = 0;
2663         BUG_ON(err == -ENOSPC);
2664         return err;
2665 }
2666
2667 static int link_to_upper(struct btrfs_trans_handle *trans,
2668                          struct reloc_control *rc,
2669                          struct backref_node *node,
2670                          struct btrfs_path *path)
2671 {
2672         struct btrfs_key key;
2673
2674         btrfs_node_key_to_cpu(node->eb, &key, 0);
2675         return do_relocation(trans, rc, node, &key, path, 0);
2676 }
2677
2678 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2679                                 struct reloc_control *rc,
2680                                 struct btrfs_path *path, int err)
2681 {
2682         LIST_HEAD(list);
2683         struct backref_cache *cache = &rc->backref_cache;
2684         struct backref_node *node;
2685         int level;
2686         int ret;
2687
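             /*
              * link every pending node to its upper level blocks; the
              * nodes are collected on a local list and spliced back onto
              * the pending lists when each level is done.
              */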
2688         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2689                 while (!list_empty(&cache->pending[level])) {
2690                         node = list_entry(cache->pending[level].next,
2691                                           struct backref_node, list);
2692                         list_move_tail(&node->list, &list);
2693                         BUG_ON(!node->pending);
2694
2695                         if (!err) {
2696                                 ret = link_to_upper(trans, rc, node, path);
2697                                 if (ret < 0)
2698                                         err = ret;
2699                         }
2700                 }
2701                 list_splice_init(&list, &cache->pending[level]);
2702         }
2703         return err;
2704 }
2705
2706 static void mark_block_processed(struct reloc_control *rc,
2707                                  u64 bytenr, u32 blocksize)
2708 {
2709         set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2710                         EXTENT_DIRTY, GFP_NOFS);
2711 }
2712
2713 static void __mark_block_processed(struct reloc_control *rc,
2714                                    struct backref_node *node)
2715 {
2716         u32 blocksize;
2717         if (node->level == 0 ||
2718             in_block_group(node->bytenr, rc->block_group)) {
2719                 blocksize = btrfs_level_size(rc->extent_root, node->level);
2720                 mark_block_processed(rc, node->bytenr, blocksize);
2721         }
2722         node->processed = 1;
2723 }
2724
2725 /*
2726  * mark a block and all blocks that directly/indirectly reference it
2727  * as processed.
2728  */
2729 static void update_processed_blocks(struct reloc_control *rc,
2730                                     struct backref_node *node)
2731 {
2732         struct backref_node *next = node;
2733         struct backref_edge *edge;
2734         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2735         int index = 0;
2736
2737         while (next) {
2738                 cond_resched();
2739                 while (1) {
2740                         if (next->processed)
2741                                 break;
2742
2743                         __mark_block_processed(rc, next);
2744
2745                         if (list_empty(&next->upper))
2746                                 break;
2747
2748                         edge = list_entry(next->upper.next,
2749                                           struct backref_edge, list[LOWER]);
2750                         edges[index++] = edge;
2751                         next = edge->node[UPPER];
2752                 }
2753                 next = walk_down_backref(edges, &index);
2754         }
2755 }
2756
2757 static int tree_block_processed(u64 bytenr, u32 blocksize,
2758                                 struct reloc_control *rc)
2759 {
2760         if (test_range_bit(&rc->processed_blocks, bytenr,
2761                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2762                 return 1;
2763         return 0;
2764 }
2765
2766 static int get_tree_block_key(struct reloc_control *rc,
2767                               struct tree_block *block)
2768 {
2769         struct extent_buffer *eb;
2770
2771         BUG_ON(block->key_ready);
2772         eb = read_tree_block(rc->extent_root, block->bytenr,
2773                              block->key.objectid, block->key.offset);
2774         if (!eb || !extent_buffer_uptodate(eb)) {
2775                 free_extent_buffer(eb);
2776                 return -EIO;
2777         }
2778         WARN_ON(btrfs_header_level(eb) != block->level);
2779         if (block->level == 0)
2780                 btrfs_item_key_to_cpu(eb, &block->key, 0);
2781         else
2782                 btrfs_node_key_to_cpu(eb, &block->key, 0);
2783         free_extent_buffer(eb);
2784         block->key_ready = 1;
2785         return 0;
2786 }
2787
2788 static int reada_tree_block(struct reloc_control *rc,
2789                             struct tree_block *block)
2790 {
2791         BUG_ON(block->key_ready);
2792         if (block->key.type == BTRFS_METADATA_ITEM_KEY)
2793                 readahead_tree_block(rc->extent_root, block->bytenr,
2794                                      block->key.objectid,
2795                                      rc->extent_root->leafsize);
2796         else
2797                 readahead_tree_block(rc->extent_root, block->bytenr,
2798                                      block->key.objectid, block->key.offset);
2799         return 0;
2800 }
2801
2802 /*
2803  * helper function to relocate a tree block
2804  */
2805 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2806                                 struct reloc_control *rc,
2807                                 struct backref_node *node,
2808                                 struct btrfs_key *key,
2809                                 struct btrfs_path *path)
2810 {
2811         struct btrfs_root *root;
2812         int release = 0;
2813         int ret = 0;
2814
2815         if (!node)
2816                 return 0;
2817
2818         BUG_ON(node->processed);
2819         root = select_one_root(trans, node);
2820         if (root == ERR_PTR(-ENOENT)) {
2821                 update_processed_blocks(rc, node);
2822                 goto out;
2823         }
2824
2825         if (!root || root->ref_cows) {
2826                 ret = reserve_metadata_space(trans, rc, node);
2827                 if (ret)
2828                         goto out;
2829                 release = 1;
2830         }
2831
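             /*
              * a reference counted root is only returned here when the
              * block is that root's own root node, so recording the root
              * in the transaction creates its reloc tree, whose root node
              * is the relocated copy.  a block in a non reference counted
              * tree is moved by simply COWing the path down to it.  a
              * NULL root means the block is shared and must go through
              * do_relocation().
              */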
2832         if (root) {
2833                 if (root->ref_cows) {
2834                         BUG_ON(node->new_bytenr);
2835                         BUG_ON(!list_empty(&node->list));
2836                         btrfs_record_root_in_trans(trans, root);
2837                         root = root->reloc_root;
2838                         node->new_bytenr = root->node->start;
2839                         node->root = root;
2840                         list_add_tail(&node->list, &rc->backref_cache.changed);
2841                 } else {
2842                         path->lowest_level = node->level;
2843                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2844                         btrfs_release_path(path);
2845                         if (ret > 0)
2846                                 ret = 0;
2847                 }
2848                 if (!ret)
2849                         update_processed_blocks(rc, node);
2850         } else {
2851                 ret = do_relocation(trans, rc, node, key, path, 1);
2852         }
2853 out:
2854         if (ret || node->level == 0 || node->cowonly) {
2855                 if (release)
2856                         release_metadata_space(rc, node);
2857                 remove_backref_node(&rc->backref_cache, node);
2858         }
2859         return ret;
2860 }
2861
2862 /*
2863  * relocate a list of blocks
2864  */
2865 static noinline_for_stack
2866 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2867                          struct reloc_control *rc, struct rb_root *blocks)
2868 {
2869         struct backref_node *node;
2870         struct btrfs_path *path;
2871         struct tree_block *block;
2872         struct rb_node *rb_node;
2873         int ret;
2874         int err = 0;
2875
2876         path = btrfs_alloc_path();
2877         if (!path) {
2878                 err = -ENOMEM;
2879                 goto out_path;
2880         }
2881
2882         rb_node = rb_first(blocks);
2883         while (rb_node) {
2884                 block = rb_entry(rb_node, struct tree_block, rb_node);
2885                 if (!block->key_ready)
2886                         reada_tree_block(rc, block);
2887                 rb_node = rb_next(rb_node);
2888         }
2889
2890         rb_node = rb_first(blocks);
2891         while (rb_node) {
2892                 block = rb_entry(rb_node, struct tree_block, rb_node);
2893                 if (!block->key_ready)
2894                         get_tree_block_key(rc, block);
2895                 rb_node = rb_next(rb_node);
2896         }
2897
2898         rb_node = rb_first(blocks);
2899         while (rb_node) {
2900                 block = rb_entry(rb_node, struct tree_block, rb_node);
2901
2902                 node = build_backref_tree(rc, &block->key,
2903                                           block->level, block->bytenr);
2904                 if (IS_ERR(node)) {
2905                         err = PTR_ERR(node);
2906                         goto out;
2907                 }
2908
2909                 ret = relocate_tree_block(trans, rc, node, &block->key,
2910                                           path);
2911                 if (ret < 0) {
2912                         if (ret != -EAGAIN || rb_node == rb_first(blocks))
2913                                 err = ret;
2914                         goto out;
2915                 }
2916                 rb_node = rb_next(rb_node);
2917         }
2918 out:
2919         err = finish_pending_nodes(trans, rc, path, err);
2920
2921         btrfs_free_path(path);
2922 out_path:
2923         free_block_list(blocks);
2924         return err;
2925 }
2926
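     /*
      * reserve data space and preallocate file extents in the relocation
      * inode covering every extent in the cluster.  BTRFS_I(inode)->index_cnt
      * is the start of the block group, so subtracting it maps logical
      * block addresses to file offsets.
      */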
2927 static noinline_for_stack
2928 int prealloc_file_extent_cluster(struct inode *inode,
2929                                  struct file_extent_cluster *cluster)
2930 {
2931         u64 alloc_hint = 0;
2932         u64 start;
2933         u64 end;
2934         u64 offset = BTRFS_I(inode)->index_cnt;
2935         u64 num_bytes;
2936         int nr = 0;
2937         int ret = 0;
2938
2939         BUG_ON(cluster->start != cluster->boundary[0]);
2940         mutex_lock(&inode->i_mutex);
2941
2942         ret = btrfs_check_data_free_space(inode, cluster->end +
2943                                           1 - cluster->start);
2944         if (ret)
2945                 goto out;
2946
2947         while (nr < cluster->nr) {
2948                 start = cluster->boundary[nr] - offset;
2949                 if (nr + 1 < cluster->nr)
2950                         end = cluster->boundary[nr + 1] - 1 - offset;
2951                 else
2952                         end = cluster->end - offset;
2953
2954                 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2955                 num_bytes = end + 1 - start;
2956                 ret = btrfs_prealloc_file_range(inode, 0, start,
2957                                                 num_bytes, num_bytes,
2958                                                 end + 1, &alloc_hint);
2959                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2960                 if (ret)
2961                         break;
2962                 nr++;
2963         }
2964         btrfs_free_reserved_data_space(inode, cluster->end +
2965                                        1 - cluster->start);
2966 out:
2967         mutex_unlock(&inode->i_mutex);
2968         return ret;
2969 }
2970
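     /*
      * insert a pinned extent map for [start, end] pointing at block_start,
      * dropping any cached extent maps that overlap the range.
      */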
2971 static noinline_for_stack
2972 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2973                          u64 block_start)
2974 {
2975         struct btrfs_root *root = BTRFS_I(inode)->root;
2976         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2977         struct extent_map *em;
2978         int ret = 0;
2979
2980         em = alloc_extent_map();
2981         if (!em)
2982                 return -ENOMEM;
2983
2984         em->start = start;
2985         em->len = end + 1 - start;
2986         em->block_len = em->len;
2987         em->block_start = block_start;
2988         em->bdev = root->fs_info->fs_devices->latest_bdev;
2989         set_bit(EXTENT_FLAG_PINNED, &em->flags);
2990
2991         lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2992         while (1) {
2993                 write_lock(&em_tree->lock);
2994                 ret = add_extent_mapping(em_tree, em, 0);
2995                 write_unlock(&em_tree->lock);
2996                 if (ret != -EEXIST) {
2997                         free_extent_map(em);
2998                         break;
2999                 }
3000                 btrfs_drop_extent_cache(inode, start, end, 0);
3001         }
3002         unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3003         return ret;
3004 }
3005
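     /*
      * copy the data covered by a file extent cluster into the relocation
      * inode: preallocate the destination, map the range back to the old
      * block start, then read every page in the range and dirty it so
      * writeback writes the data out to the preallocated extents.  pages
      * at cluster boundaries are tagged with EXTENT_BOUNDARY.
      */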
3006 static int relocate_file_extent_cluster(struct inode *inode,
3007                                         struct file_extent_cluster *cluster)
3008 {
3009         u64 page_start;
3010         u64 page_end;
3011         u64 offset = BTRFS_I(inode)->index_cnt;
3012         unsigned long index;
3013         unsigned long last_index;
3014         struct page *page;
3015         struct file_ra_state *ra;
3016         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3017         int nr = 0;
3018         int ret = 0;
3019
3020         if (!cluster->nr)
3021                 return 0;
3022
3023         ra = kzalloc(sizeof(*ra), GFP_NOFS);
3024         if (!ra)
3025                 return -ENOMEM;
3026
3027         ret = prealloc_file_extent_cluster(inode, cluster);
3028         if (ret)
3029                 goto out;
3030
3031         file_ra_state_init(ra, inode->i_mapping);
3032
3033         ret = setup_extent_mapping(inode, cluster->start - offset,
3034                                    cluster->end - offset, cluster->start);
3035         if (ret)
3036                 goto out;
3037
3038         index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
3039         last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
3040         while (index <= last_index) {
3041                 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
3042                 if (ret)
3043                         goto out;
3044
3045                 page = find_lock_page(inode->i_mapping, index);
3046                 if (!page) {
3047                         page_cache_sync_readahead(inode->i_mapping,
3048                                                   ra, NULL, index,
3049                                                   last_index + 1 - index);
3050                         page = find_or_create_page(inode->i_mapping, index,
3051                                                    mask);
3052                         if (!page) {
3053                                 btrfs_delalloc_release_metadata(inode,
3054                                                         PAGE_CACHE_SIZE);
3055                                 ret = -ENOMEM;
3056                                 goto out;
3057                         }
3058                 }
3059
3060                 if (PageReadahead(page)) {
3061                         page_cache_async_readahead(inode->i_mapping,
3062                                                    ra, NULL, page, index,
3063                                                    last_index + 1 - index);
3064                 }
3065
3066                 if (!PageUptodate(page)) {
3067                         btrfs_readpage(NULL, page);
3068                         lock_page(page);
3069                         if (!PageUptodate(page)) {
3070                                 unlock_page(page);
3071                                 page_cache_release(page);
3072                                 btrfs_delalloc_release_metadata(inode,
3073                                                         PAGE_CACHE_SIZE);
3074                                 ret = -EIO;
3075                                 goto out;
3076                         }
3077                 }
3078
3079                 page_start = page_offset(page);
3080                 page_end = page_start + PAGE_CACHE_SIZE - 1;
3081
3082                 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3083
3084                 set_page_extent_mapped(page);
3085
3086                 if (nr < cluster->nr &&
3087                     page_start + offset == cluster->boundary[nr]) {
3088                         set_extent_bits(&BTRFS_I(inode)->io_tree,
3089                                         page_start, page_end,
3090                                         EXTENT_BOUNDARY, GFP_NOFS);
3091                         nr++;
3092                 }
3093
3094                 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
3095                 set_page_dirty(page);
3096
3097                 unlock_extent(&BTRFS_I(inode)->io_tree,
3098                               page_start, page_end);
3099                 unlock_page(page);
3100                 page_cache_release(page);
3101
3102                 index++;
3103                 balance_dirty_pages_ratelimited(inode->i_mapping);
3104                 btrfs_throttle(BTRFS_I(inode)->root);
3105         }
3106         WARN_ON(nr != cluster->nr);
3107 out:
3108         kfree(ra);
3109         return ret;
3110 }
3111
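     /*
      * add a data extent to the current cluster.  if the extent is not
      * contiguous with the cluster, or the cluster already holds
      * MAX_EXTENTS extents, relocate the accumulated cluster first.
      */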
3112 static noinline_for_stack
3113 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3114                          struct file_extent_cluster *cluster)
3115 {
3116         int ret;
3117
3118         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3119                 ret = relocate_file_extent_cluster(inode, cluster);
3120                 if (ret)
3121                         return ret;
3122                 cluster->nr = 0;
3123         }
3124
3125         if (!cluster->nr)
3126                 cluster->start = extent_key->objectid;
3127         else
3128                 BUG_ON(cluster->nr >= MAX_EXTENTS);
3129         cluster->end = extent_key->objectid + extent_key->offset - 1;
3130         cluster->boundary[cluster->nr] = extent_key->objectid;
3131         cluster->nr++;
3132
3133         if (cluster->nr >= MAX_EXTENTS) {
3134                 ret = relocate_file_extent_cluster(inode, cluster);
3135                 if (ret)
3136                         return ret;
3137                 cluster->nr = 0;
3138         }
3139         return 0;
3140 }
3141
3142 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
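     /*
      * scan forward from the current slot for a v0 extent ref belonging to
      * this extent and return the ref's objectid.  *path_change is set if
      * the search had to advance to the next leaf.
      */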
3143 static int get_ref_objectid_v0(struct reloc_control *rc,
3144                                struct btrfs_path *path,
3145                                struct btrfs_key *extent_key,
3146                                u64 *ref_objectid, int *path_change)
3147 {
3148         struct btrfs_key key;
3149         struct extent_buffer *leaf;
3150         struct btrfs_extent_ref_v0 *ref0;
3151         int ret;
3152         int slot;
3153
3154         leaf = path->nodes[0];
3155         slot = path->slots[0];
3156         while (1) {
3157                 if (slot >= btrfs_header_nritems(leaf)) {
3158                         ret = btrfs_next_leaf(rc->extent_root, path);
3159                         if (ret < 0)
3160                                 return ret;
3161                         BUG_ON(ret > 0);
3162                         leaf = path->nodes[0];
3163                         slot = path->slots[0];
3164                         if (path_change)
3165                                 *path_change = 1;
3166                 }
3167                 btrfs_item_key_to_cpu(leaf, &key, slot);
3168                 if (key.objectid != extent_key->objectid)
3169                         return -ENOENT;
3170
3171                 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3172                         slot++;
3173                         continue;
3174                 }
3175                 ref0 = btrfs_item_ptr(leaf, slot,
3176                                 struct btrfs_extent_ref_v0);
3177                 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3178                 break;
3179         }
3180         return 0;
3181 }
3182 #endif
3183
3184 /*
3185  * helper to add a tree block to the list.
3186  * the major work is getting the generation and level of the block
3187  */
3188 static int add_tree_block(struct reloc_control *rc,
3189                           struct btrfs_key *extent_key,
3190                           struct btrfs_path *path,
3191                           struct rb_root *blocks)
3192 {
3193         struct extent_buffer *eb;
3194         struct btrfs_extent_item *ei;
3195         struct btrfs_tree_block_info *bi;
3196         struct tree_block *block;
3197         struct rb_node *rb_node;
3198         u32 item_size;
3199         int level = -1;
3200         int generation;
3201
3202         eb =  path->nodes[0];
3203         item_size = btrfs_item_size_nr(eb, path->slots[0]);
3204
3205         if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3206             item_size >= sizeof(*ei) + sizeof(*bi)) {
3207                 ei = btrfs_item_ptr(eb, path->slots[0],
3208                                 struct btrfs_extent_item);
3209                 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3210                         bi = (struct btrfs_tree_block_info *)(ei + 1);
3211                         level = btrfs_tree_block_level(eb, bi);
3212                 } else {
3213                         level = (int)extent_key->offset;
3214                 }
3215                 generation = btrfs_extent_generation(eb, ei);
3216         } else {
3217 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3218                 u64 ref_owner;
3219                 int ret;
3220
3221                 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3222                 ret = get_ref_objectid_v0(rc, path, extent_key,
3223                                           &ref_owner, NULL);
3224                 if (ret < 0)
3225                         return ret;
3226                 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3227                 level = (int)ref_owner;
3228                 /* FIXME: get real generation */
3229                 generation = 0;
3230 #else
3231                 BUG();
3232 #endif
3233         }
3234
3235         btrfs_release_path(path);
3236
3237         BUG_ON(level == -1);
3238
3239         block = kmalloc(sizeof(*block), GFP_NOFS);
3240         if (!block)
3241                 return -ENOMEM;
3242
3243         block->bytenr = extent_key->objectid;
3244         block->key.objectid = rc->extent_root->leafsize;
3245         block->key.offset = generation;
3246         block->level = level;
3247         block->key_ready = 0;
3248
3249         rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3250         if (rb_node)
3251                 backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3252
3253         return 0;
3254 }
3255
3256 /*
3257  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3258  */
3259 static int __add_tree_block(struct reloc_control *rc,
3260                             u64 bytenr, u32 blocksize,
3261                             struct rb_root *blocks)
3262 {
3263         struct btrfs_path *path;
3264         struct btrfs_key key;
3265         int ret;
3266
3267         if (tree_block_processed(bytenr, blocksize, rc))
3268                 return 0;
3269
3270         if (tree_search(blocks, bytenr))
3271                 return 0;
3272
3273         path = btrfs_alloc_path();
3274         if (!path)
3275                 return -ENOMEM;
3276
3277         key.objectid = bytenr;
3278         key.type = BTRFS_EXTENT_ITEM_KEY;
3279         key.offset = blocksize;
3280
3281         path->search_commit_root = 1;
3282         path->skip_locking = 1;
3283         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3284         if (ret < 0)
3285                 goto out;
3286
3287         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3288         if (ret > 0) {
3289                 if (key.objectid == bytenr &&
3290                     key.type == BTRFS_METADATA_ITEM_KEY)
3291                         ret = 0;
3292         }
3293         BUG_ON(ret);
3294
3295         ret = add_tree_block(rc, &key, path, blocks);
3296 out:
3297         btrfs_free_path(path);
3298         return ret;
3299 }
3300
3301 /*
3302  * helper to check if the block uses full backrefs for the pointers in it
3303  */
3304 static int block_use_full_backref(struct reloc_control *rc,
3305                                   struct extent_buffer *eb)
3306 {
3307         u64 flags;
3308         int ret;
3309
3310         if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3311             btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3312                 return 1;
3313
3314         ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3315                                        eb->start, btrfs_header_level(eb), 1,
3316                                        NULL, &flags);
3317         BUG_ON(ret);
3318
3319         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3320                 ret = 1;
3321         else
3322                 ret = 0;
3323         return ret;
3324 }
3325
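     /*
      * drop the contents of a free space cache inode: look the inode up by
      * objectid if it was not passed in, then truncate the cached data in
      * a joined transaction.
      */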
3326 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3327                                     struct inode *inode, u64 ino)
3328 {
3329         struct btrfs_key key;
3330         struct btrfs_path *path;
3331         struct btrfs_root *root = fs_info->tree_root;
3332         struct btrfs_trans_handle *trans;
3333         int ret = 0;
3334
3335         if (inode)
3336                 goto truncate;
3337
3338         key.objectid = ino;
3339         key.type = BTRFS_INODE_ITEM_KEY;
3340         key.offset = 0;
3341
3342         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3343         if (IS_ERR(inode) || is_bad_inode(inode)) {
3344                 if (!IS_ERR(inode))
3345                         iput(inode);
3346                 return -ENOENT;
3347         }
3348
3349 truncate:
3350         path = btrfs_alloc_path();
3351         if (!path) {
3352                 ret = -ENOMEM;
3353                 goto out;
3354         }
3355
3356         trans = btrfs_join_transaction(root);
3357         if (IS_ERR(trans)) {
3358                 btrfs_free_path(path);
3359                 ret = PTR_ERR(trans);
3360                 goto out;
3361         }
3362
3363         ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3364
3365         btrfs_free_path(path);
3366         btrfs_end_transaction(trans, root);
3367         btrfs_btree_balance_dirty(root);
3368 out:
3369         iput(inode);
3370         return ret;
3371 }
3372
3373 /*
3374  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3375  * this function scans the fs tree to find blocks that reference the data extent
3376  */
3377 static int find_data_references(struct reloc_control *rc,
3378                                 struct btrfs_key *extent_key,
3379                                 struct extent_buffer *leaf,
3380                                 struct btrfs_extent_data_ref *ref,
3381                                 struct rb_root *blocks)
3382 {
3383         struct btrfs_path *path;
3384         struct tree_block *block;
3385         struct btrfs_root *root;
3386         struct btrfs_file_extent_item *fi;
3387         struct rb_node *rb_node;
3388         struct btrfs_key key;
3389         u64 ref_root;
3390         u64 ref_objectid;
3391         u64 ref_offset;
3392         u32 ref_count;
3393         u32 nritems;
3394         int err = 0;
3395         int added = 0;
3396         int counted;
3397         int ret;
3398
3399         ref_root = btrfs_extent_data_ref_root(leaf, ref);
3400         ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3401         ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3402         ref_count = btrfs_extent_data_ref_count(leaf, ref);
3403
3404         /*
3405          * This is an extent belonging to the free space cache; let's just delete
3406          * it and redo the search.
3407          */
3408         if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3409                 ret = delete_block_group_cache(rc->extent_root->fs_info,
3410                                                NULL, ref_objectid);
3411                 if (ret != -ENOENT)
3412                         return ret;
3413                 ret = 0;
3414         }
3415
3416         path = btrfs_alloc_path();
3417         if (!path)
3418                 return -ENOMEM;
3419         path->reada = 1;
3420
3421         root = read_fs_root(rc->extent_root->fs_info, ref_root);
3422         if (IS_ERR(root)) {
3423                 err = PTR_ERR(root);
3424                 goto out;
3425         }
3426
3427         key.objectid = ref_objectid;
3428         key.type = BTRFS_EXTENT_DATA_KEY;
3429         if (ref_offset > ((u64)-1 << 32))
3430                 key.offset = 0;
3431         else
3432                 key.offset = ref_offset;
3433
3434         path->search_commit_root = 1;
3435         path->skip_locking = 1;
3436         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3437         if (ret < 0) {
3438                 err = ret;
3439                 goto out;
3440         }
3441
3442         leaf = path->nodes[0];
3443         nritems = btrfs_header_nritems(leaf);
3444         /*
3445          * references from tree blocks that use full backrefs
3446          * are not counted
3447          */
3448         if (block_use_full_backref(rc, leaf))
3449                 counted = 0;
3450         else
3451                 counted = 1;
3452         rb_node = tree_search(blocks, leaf->start);
3453         if (rb_node) {
3454                 if (counted)
3455                         added = 1;
3456                 else
3457                         path->slots[0] = nritems;
3458         }
3459
3460         while (ref_count > 0) {
3461                 while (path->slots[0] >= nritems) {
3462                         ret = btrfs_next_leaf(root, path);
3463                         if (ret < 0) {
3464                                 err = ret;
3465                                 goto out;
3466                         }
3467                         if (ret > 0) {
3468                                 WARN_ON(1);
3469                                 goto out;
3470                         }
3471
3472                         leaf = path->nodes[0];
3473                         nritems = btrfs_header_nritems(leaf);
3474                         added = 0;
3475
3476                         if (block_use_full_backref(rc, leaf))
3477                                 counted = 0;
3478                         else
3479                                 counted = 1;
3480                         rb_node = tree_search(blocks, leaf->start);
3481                         if (rb_node) {
3482                                 if (counted)
3483                                         added = 1;
3484                                 else
3485                                         path->slots[0] = nritems;
3486                         }
3487                 }
3488
3489                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3490                 if (key.objectid != ref_objectid ||
3491                     key.type != BTRFS_EXTENT_DATA_KEY) {
3492                         WARN_ON(1);
3493                         break;
3494                 }
3495
3496                 fi = btrfs_item_ptr(leaf, path->slots[0],
3497                                     struct btrfs_file_extent_item);
3498
3499                 if (btrfs_file_extent_type(leaf, fi) ==
3500                     BTRFS_FILE_EXTENT_INLINE)
3501                         goto next;
3502
3503                 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3504                     extent_key->objectid)
3505                         goto next;
3506
3507                 key.offset -= btrfs_file_extent_offset(leaf, fi);
3508                 if (key.offset != ref_offset)
3509                         goto next;
3510
3511                 if (counted)
3512                         ref_count--;
3513                 if (added)
3514                         goto next;
3515
3516                 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3517                         block = kmalloc(sizeof(*block), GFP_NOFS);
3518                         if (!block) {
3519                                 err = -ENOMEM;
3520                                 break;
3521                         }
3522                         block->bytenr = leaf->start;
3523                         btrfs_item_key_to_cpu(leaf, &block->key, 0);
3524                         block->level = 0;
3525                         block->key_ready = 1;
3526                         rb_node = tree_insert(blocks, block->bytenr,
3527                                               &block->rb_node);
3528                         if (rb_node)
3529                                 backref_tree_panic(rb_node, -EEXIST,
3530                                                    block->bytenr);
3531                 }
3532                 if (counted)
3533                         added = 1;
3534                 else
3535                         path->slots[0] = nritems;
3536 next:
3537                 path->slots[0]++;
3538
3539         }
3540 out:
3541         btrfs_free_path(path);
3542         return err;
3543 }
3544
3545 /*
3546  * helper to find all tree blocks that reference a given data extent
3547  */
3548 static noinline_for_stack
3549 int add_data_references(struct reloc_control *rc,
3550                         struct btrfs_key *extent_key,
3551                         struct btrfs_path *path,
3552                         struct rb_root *blocks)
3553 {
3554         struct btrfs_key key;
3555         struct extent_buffer *eb;
3556         struct btrfs_extent_data_ref *dref;
3557         struct btrfs_extent_inline_ref *iref;
3558         unsigned long ptr;
3559         unsigned long end;
3560         u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3561         int ret;
3562         int err = 0;
3563
3564         eb = path->nodes[0];
3565         ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3566         end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3567 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3568         if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3569                 ptr = end;
3570         else
3571 #endif
3572                 ptr += sizeof(struct btrfs_extent_item);
3573
3574         while (ptr < end) {
3575                 iref = (struct btrfs_extent_inline_ref *)ptr;
3576                 key.type = btrfs_extent_inline_ref_type(eb, iref);
3577                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3578                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3579                         ret = __add_tree_block(rc, key.offset, blocksize,
3580                                                blocks);
3581                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3582                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3583                         ret = find_data_references(rc, extent_key,
3584                                                    eb, dref, blocks);
3585                 } else {
3586                         BUG();
3587                 }
3588                 ptr += btrfs_extent_inline_ref_size(key.type);
3589         }
3590         WARN_ON(ptr > end);
3591
3592         while (1) {
3593                 cond_resched();
3594                 eb = path->nodes[0];
3595                 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3596                         ret = btrfs_next_leaf(rc->extent_root, path);
3597                         if (ret < 0) {
3598                                 err = ret;
3599                                 break;
3600                         }
3601                         if (ret > 0)
3602                                 break;
3603                         eb = path->nodes[0];
3604                 }
3605
3606                 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3607                 if (key.objectid != extent_key->objectid)
3608                         break;
3609
3610 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3611                 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3612                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
3613 #else
3614                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3615                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3616 #endif
3617                         ret = __add_tree_block(rc, key.offset, blocksize,
3618                                                blocks);
3619                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3620                         dref = btrfs_item_ptr(eb, path->slots[0],
3621                                               struct btrfs_extent_data_ref);
3622                         ret = find_data_references(rc, extent_key,
3623                                                    eb, dref, blocks);
3624                 } else {
3625                         ret = 0;
3626                 }
3627                 if (ret) {
3628                         err = ret;
3629                         break;
3630                 }
3631                 path->slots[0]++;
3632         }
3633         btrfs_release_path(path);
3634         if (err)
3635                 free_block_list(blocks);
3636         return err;
3637 }
3638
3639 /*
3640  * helper to find the next unprocessed extent
3641  */
3642 static noinline_for_stack
3643 int find_next_extent(struct btrfs_trans_handle *trans,
3644                      struct reloc_control *rc, struct btrfs_path *path,
3645                      struct btrfs_key *extent_key)
3646 {
3647         struct btrfs_key key;
3648         struct extent_buffer *leaf;
3649         u64 start, end, last;
3650         int ret;
3651
3652         last = rc->block_group->key.objectid + rc->block_group->key.offset;
3653         while (1) {
3654                 cond_resched();
3655                 if (rc->search_start >= last) {
3656                         ret = 1;
3657                         break;
3658                 }
3659
3660                 key.objectid = rc->search_start;
3661                 key.type = BTRFS_EXTENT_ITEM_KEY;
3662                 key.offset = 0;
3663
3664                 path->search_commit_root = 1;
3665                 path->skip_locking = 1;
3666                 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3667                                         0, 0);
3668                 if (ret < 0)
3669                         break;
3670 next:
3671                 leaf = path->nodes[0];
3672                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3673                         ret = btrfs_next_leaf(rc->extent_root, path);
3674                         if (ret != 0)
3675                                 break;
3676                         leaf = path->nodes[0];
3677                 }
3678
3679                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3680                 if (key.objectid >= last) {
3681                         ret = 1;
3682                         break;
3683                 }
3684
3685                 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3686                     key.type != BTRFS_METADATA_ITEM_KEY) {
3687                         path->slots[0]++;
3688                         goto next;
3689                 }
3690
3691                 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3692                     key.objectid + key.offset <= rc->search_start) {
3693                         path->slots[0]++;
3694                         goto next;
3695                 }
3696
3697                 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3698                     key.objectid + rc->extent_root->leafsize <=
3699                     rc->search_start) {
3700                         path->slots[0]++;
3701                         goto next;
3702                 }
3703
3704                 ret = find_first_extent_bit(&rc->processed_blocks,
3705                                             key.objectid, &start, &end,
3706                                             EXTENT_DIRTY, NULL);
3707
3708                 if (ret == 0 && start <= key.objectid) {
3709                         btrfs_release_path(path);
3710                         rc->search_start = end + 1;
3711                 } else {
3712                         if (key.type == BTRFS_EXTENT_ITEM_KEY)
3713                                 rc->search_start = key.objectid + key.offset;
3714                         else
3715                                 rc->search_start = key.objectid +
3716                                         rc->extent_root->leafsize;
3717                         memcpy(extent_key, &key, sizeof(key));
3718                         return 0;
3719                 }
3720         }
3721         btrfs_release_path(path);
3722         return ret;
3723 }
3724
3725 static void set_reloc_control(struct reloc_control *rc)
3726 {
3727         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3728
3729         mutex_lock(&fs_info->reloc_mutex);
3730         fs_info->reloc_ctl = rc;
3731         mutex_unlock(&fs_info->reloc_mutex);
3732 }
3733
3734 static void unset_reloc_control(struct reloc_control *rc)
3735 {
3736         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3737
3738         mutex_lock(&fs_info->reloc_mutex);
3739         fs_info->reloc_ctl = NULL;
3740         mutex_unlock(&fs_info->reloc_mutex);
3741 }
3742
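     /* return 1 if the extent flags are an invalid combination */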
3743 static int check_extent_flags(u64 flags)
3744 {
3745         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3746             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3747                 return 1;
3748         if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3749             !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3750                 return 1;
3751         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3752             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3753                 return 1;
3754         return 0;
3755 }
3756
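     /*
      * set up the per-pass relocation state: allocate and fill the
      * temporary block reservation, reset the cluster and counters,
      * install the reloc control and commit a transaction before the
      * relocation loop starts.
      */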
3757 static noinline_for_stack
3758 int prepare_to_relocate(struct reloc_control *rc)
3759 {
3760         struct btrfs_trans_handle *trans;
3761         int ret;
3762
3763         rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3764                                               BTRFS_BLOCK_RSV_TEMP);
3765         if (!rc->block_rsv)
3766                 return -ENOMEM;
3767
3768         /*
3769          * reserve some space for creating reloc trees.
3770          * btrfs_init_reloc_root will use it when there
3771          * is no reservation in the transaction handle.
3772          */
3773         ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3774                                   rc->extent_root->nodesize * 256,
3775                                   BTRFS_RESERVE_FLUSH_ALL);
3776         if (ret)
3777                 return ret;
3778
3779         memset(&rc->cluster, 0, sizeof(rc->cluster));
3780         rc->search_start = rc->block_group->key.objectid;
3781         rc->extents_found = 0;
3782         rc->nodes_relocated = 0;
3783         rc->merging_rsv_size = 0;
3784
3785         rc->create_reloc_tree = 1;
3786         set_reloc_control(rc);
3787
3788         trans = btrfs_join_transaction(rc->extent_root);
3789         if (IS_ERR(trans)) {
3790                 unset_reloc_control(rc);
3791                 /*
3792                  * the extent tree is not a ref_cow tree and has no reloc_root to
3793                  * clean up, and callers are responsible for freeing the block
3794                  * rsv allocated above.
3795                  */
3796                 return PTR_ERR(trans);
3797         }
3798         btrfs_commit_transaction(trans, rc->extent_root);
3799         return 0;
3800 }
3801
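     /*
      * the main relocation loop: walk the extents of the block group,
      * collect tree blocks and data references into @blocks and relocate
      * them, gather data extents into clusters for copying, and finally
      * merge the reloc trees back into the fs trees.
      */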
3802 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3803 {
3804         struct rb_root blocks = RB_ROOT;
3805         struct btrfs_key key;
3806         struct btrfs_trans_handle *trans = NULL;
3807         struct btrfs_path *path;
3808         struct btrfs_extent_item *ei;
3809         u64 flags;
3810         u32 item_size;
3811         int ret;
3812         int err = 0;
3813         int progress = 0;
3814
3815         path = btrfs_alloc_path();
3816         if (!path)
3817                 return -ENOMEM;
3818         path->reada = 1;
3819
3820         ret = prepare_to_relocate(rc);
3821         if (ret) {
3822                 err = ret;
3823                 goto out_free;
3824         }
3825
3826         while (1) {
3827                 progress++;
3828                 trans = btrfs_start_transaction(rc->extent_root, 0);
3829                 if (IS_ERR(trans)) {
3830                         err = PTR_ERR(trans);
3831                         trans = NULL;
3832                         break;
3833                 }
3834 restart:
3835                 if (update_backref_cache(trans, &rc->backref_cache)) {
3836                         btrfs_end_transaction(trans, rc->extent_root);
3837                         continue;
3838                 }
3839
3840                 ret = find_next_extent(trans, rc, path, &key);
3841                 if (ret < 0)
3842                         err = ret;
3843                 if (ret != 0)
3844                         break;
3845
3846                 rc->extents_found++;
3847
3848                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3849                                     struct btrfs_extent_item);
3850                 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3851                 if (item_size >= sizeof(*ei)) {
3852                         flags = btrfs_extent_flags(path->nodes[0], ei);
3853                         ret = check_extent_flags(flags);
3854                         BUG_ON(ret);
3855
3856                 } else {
3857 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3858                         u64 ref_owner;
3859                         int path_change = 0;
3860
3861                         BUG_ON(item_size !=
3862                                sizeof(struct btrfs_extent_item_v0));
3863                         ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3864                                                   &path_change);
3865                         if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3866                                 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3867                         else
3868                                 flags = BTRFS_EXTENT_FLAG_DATA;
3869
3870                         if (path_change) {
3871                                 btrfs_release_path(path);
3872
3873                                 path->search_commit_root = 1;
3874                                 path->skip_locking = 1;
3875                                 ret = btrfs_search_slot(NULL, rc->extent_root,
3876                                                         &key, path, 0, 0);
3877                                 if (ret < 0) {
3878                                         err = ret;
3879                                         break;
3880                                 }
3881                                 BUG_ON(ret > 0);
3882                         }
3883 #else
3884                         BUG();
3885 #endif
3886                 }
3887
3888                 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3889                         ret = add_tree_block(rc, &key, path, &blocks);
3890                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3891                            (flags & BTRFS_EXTENT_FLAG_DATA)) {
3892                         ret = add_data_references(rc, &key, path, &blocks);
3893                 } else {
3894                         btrfs_release_path(path);
3895                         ret = 0;
3896                 }
3897                 if (ret < 0) {
3898                         err = ret;
3899                         break;
3900                 }
3901
3902                 if (!RB_EMPTY_ROOT(&blocks)) {
3903                         ret = relocate_tree_blocks(trans, rc, &blocks);
3904                         if (ret < 0) {
3905                                 if (ret != -EAGAIN) {
3906                                         err = ret;
3907                                         break;
3908                                 }
3909                                 rc->extents_found--;
3910                                 rc->search_start = key.objectid;
3911                         }
3912                 }
3913
3914                 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3915                 if (ret < 0) {
3916                         if (ret != -ENOSPC) {
3917                                 err = ret;
3918                                 WARN_ON(1);
3919                                 break;
3920                         }
3921                         rc->commit_transaction = 1;
3922                 }
3923
3924                 if (rc->commit_transaction) {
3925                         rc->commit_transaction = 0;
3926                         ret = btrfs_commit_transaction(trans, rc->extent_root);
3927                         BUG_ON(ret);
3928                 } else {
3929                         btrfs_end_transaction_throttle(trans, rc->extent_root);
3930                         btrfs_btree_balance_dirty(rc->extent_root);
3931                 }
3932                 trans = NULL;
3933
3934                 if (rc->stage == MOVE_DATA_EXTENTS &&
3935                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
3936                         rc->found_file_extent = 1;
3937                         ret = relocate_data_extent(rc->data_inode,
3938                                                    &key, &rc->cluster);
3939                         if (ret < 0) {
3940                                 err = ret;
3941                                 break;
3942                         }
3943                 }
3944         }
3945         if (trans && progress && err == -ENOSPC) {
3946                 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3947                                               rc->block_group->flags);
3948                 if (ret == 0) {
3949                         err = 0;
3950                         progress = 0;
3951                         goto restart;
3952                 }
3953         }
3954
3955         btrfs_release_path(path);
3956         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3957                           GFP_NOFS);
3958
3959         if (trans) {
3960                 btrfs_end_transaction_throttle(trans, rc->extent_root);
3961                 btrfs_btree_balance_dirty(rc->extent_root);
3962         }
3963
3964         if (!err) {
3965                 ret = relocate_file_extent_cluster(rc->data_inode,
3966                                                    &rc->cluster);
3967                 if (ret < 0)
3968                         err = ret;
3969         }
3970
3971         rc->create_reloc_tree = 0;
3972         set_reloc_control(rc);
3973
3974         backref_cache_cleanup(&rc->backref_cache);
3975         btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3976
3977         err = prepare_to_merge(rc, err);
3978
3979         merge_reloc_roots(rc);
3980
3981         rc->merge_reloc_tree = 0;
3982         unset_reloc_control(rc);
3983         btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3984
3985         /* get rid of pinned extents */
3986         trans = btrfs_join_transaction(rc->extent_root);
3987         if (IS_ERR(trans))
3988                 err = PTR_ERR(trans);
3989         else
3990                 btrfs_commit_transaction(trans, rc->extent_root);
3991 out_free:
3992         btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3993         btrfs_free_path(path);
3994         return err;
3995 }
3996
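     /*
      * insert a bare inode item for the data relocation inode: a regular
      * file with mode 0600, zero size, zero link count and the NOCOMPRESS
      * and PREALLOC flags set.
      */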
3997 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3998                                  struct btrfs_root *root, u64 objectid)
3999 {
4000         struct btrfs_path *path;
4001         struct btrfs_inode_item *item;
4002         struct extent_buffer *leaf;
4003         int ret;
4004
4005         path = btrfs_alloc_path();
4006         if (!path)
4007                 return -ENOMEM;
4008
4009         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4010         if (ret)
4011                 goto out;
4012
4013         leaf = path->nodes[0];
4014         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4015         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4016         btrfs_set_inode_generation(leaf, item, 1);
4017         btrfs_set_inode_size(leaf, item, 0);
4018         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4019         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4020                                           BTRFS_INODE_PREALLOC);
4021         btrfs_mark_buffer_dirty(leaf);
4022         btrfs_release_path(path);
4023 out:
4024         btrfs_free_path(path);
4025         return ret;
4026 }
4027
4028 /*
4029  * helper to create an inode for data relocation.
4030  * the inode lives in the data relocation tree and its link count is 0
4031  */
4032 static noinline_for_stack
4033 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4034                                  struct btrfs_block_group_cache *group)
4035 {
4036         struct inode *inode = NULL;
4037         struct btrfs_trans_handle *trans;
4038         struct btrfs_root *root;
4039         struct btrfs_key key;
4040         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
4041         int err = 0;
4042
4043         root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4044         if (IS_ERR(root))
4045                 return ERR_CAST(root);
4046
4047         trans = btrfs_start_transaction(root, 6);
4048         if (IS_ERR(trans))
4049                 return ERR_CAST(trans);
4050
4051         err = btrfs_find_free_objectid(root, &objectid);
4052         if (err)
4053                 goto out;
4054
4055         err = __insert_orphan_inode(trans, root, objectid);
4056         BUG_ON(err);
4057
4058         key.objectid = objectid;
4059         key.type = BTRFS_INODE_ITEM_KEY;
4060         key.offset = 0;
4061         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4062         BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4063         BTRFS_I(inode)->index_cnt = group->key.objectid;
4064
4065         err = btrfs_orphan_add(trans, inode);
4066 out:
4067         btrfs_end_transaction(trans, root);
4068         btrfs_btree_balance_dirty(root);
4069         if (err) {
4070                 if (inode)
4071                         iput(inode);
4072                 inode = ERR_PTR(err);
4073         }
4074         return inode;
4075 }
4076
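     /* allocate a reloc_control and initialize its caches and trees */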
4077 static struct reloc_control *alloc_reloc_control(void)
4078 {
4079         struct reloc_control *rc;
4080
4081         rc = kzalloc(sizeof(*rc), GFP_NOFS);
4082         if (!rc)
4083                 return NULL;
4084
4085         INIT_LIST_HEAD(&rc->reloc_roots);
4086         backref_cache_init(&rc->backref_cache);
4087         mapping_tree_init(&rc->reloc_root_tree);
4088         extent_io_tree_init(&rc->processed_blocks, NULL);
4089         return rc;
4090 }
4091
4092 /*
4093  * function to relocate all extents in a block group.
4094  */
4095 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4096 {
4097         struct btrfs_fs_info *fs_info = extent_root->fs_info;
4098         struct reloc_control *rc;
4099         struct inode *inode;
4100         struct btrfs_path *path;
4101         int ret;
4102         int rw = 0;
4103         int err = 0;
4104
4105         rc = alloc_reloc_control();
4106         if (!rc)
4107                 return -ENOMEM;
4108
4109         rc->extent_root = extent_root;
4110
4111         rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4112         BUG_ON(!rc->block_group);
4113
4114         if (!rc->block_group->ro) {
4115                 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
4116                 if (ret) {
4117                         err = ret;
4118                         goto out;
4119                 }
4120                 rw = 1;
4121         }
4122
4123         path = btrfs_alloc_path();
4124         if (!path) {
4125                 err = -ENOMEM;
4126                 goto out;
4127         }
4128
4129         inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4130                                         path);
4131         btrfs_free_path(path);
4132
4133         if (!IS_ERR(inode))
4134                 ret = delete_block_group_cache(fs_info, inode, 0);
4135         else
4136                 ret = PTR_ERR(inode);
4137
4138         if (ret && ret != -ENOENT) {
4139                 err = ret;
4140                 goto out;
4141         }
4142
4143         rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4144         if (IS_ERR(rc->data_inode)) {
4145                 err = PTR_ERR(rc->data_inode);
4146                 rc->data_inode = NULL;
4147                 goto out;
4148         }
4149
4150         printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
4151                (unsigned long long)rc->block_group->key.objectid,
4152                (unsigned long long)rc->block_group->flags);
4153
4154         ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4155         if (ret < 0) {
4156                 err = ret;
4157                 goto out;
4158         }
4159         btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4160
4161         while (1) {
4162                 mutex_lock(&fs_info->cleaner_mutex);
4163                 ret = relocate_block_group(rc);
4164                 mutex_unlock(&fs_info->cleaner_mutex);
4165                 if (ret < 0) {
4166                         err = ret;
4167                         goto out;
4168                 }
4169
4170                 if (rc->extents_found == 0)
4171                         break;
4172
4173                 printk(KERN_INFO "btrfs: found %llu extents\n",
4174                         (unsigned long long)rc->extents_found);
4175
4176                 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4177                         btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
4178                         invalidate_mapping_pages(rc->data_inode->i_mapping,
4179                                                  0, -1);
4180                         rc->stage = UPDATE_DATA_PTRS;
4181                 }
4182         }
4183
4184         filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
4185                                      rc->block_group->key.objectid,
4186                                      rc->block_group->key.objectid +
4187                                      rc->block_group->key.offset - 1);
4188
4189         WARN_ON(rc->block_group->pinned > 0);
4190         WARN_ON(rc->block_group->reserved > 0);
4191         WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4192 out:
4193         if (err && rw)
4194                 btrfs_set_block_group_rw(extent_root, rc->block_group);
4195         iput(rc->data_inode);
4196         btrfs_put_block_group(rc->block_group);
4197         kfree(rc);
4198         return err;
4199 }
4200
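     /*
      * mark an orphan reloc root (one whose fs root can no longer be read)
      * as garbage: clear its drop progress and set its root refs to 0 in
      * the root tree.
      */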
4201 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4202 {
4203         struct btrfs_trans_handle *trans;
4204         int ret, err;
4205
4206         trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4207         if (IS_ERR(trans))
4208                 return PTR_ERR(trans);
4209
4210         memset(&root->root_item.drop_progress, 0,
4211                 sizeof(root->root_item.drop_progress));
4212         root->root_item.drop_level = 0;
4213         btrfs_set_root_refs(&root->root_item, 0);
4214         ret = btrfs_update_root(trans, root->fs_info->tree_root,
4215                                 &root->root_key, &root->root_item);
4216
4217         err = btrfs_end_transaction(trans, root->fs_info->tree_root);
4218         if (err)
4219                 return err;
4220         return ret;
4221 }
4222
4223 /*
4224  * recover relocation interrupted by a system crash.
4225  *
4226  * this function resumes merging reloc trees with their corresponding fs
4227  * trees.  this is important for preserving the sharing of tree blocks
4228  */
4229 int btrfs_recover_relocation(struct btrfs_root *root)
4230 {
4231         LIST_HEAD(reloc_roots);
4232         struct btrfs_key key;
4233         struct btrfs_root *fs_root;
4234         struct btrfs_root *reloc_root;
4235         struct btrfs_path *path;
4236         struct extent_buffer *leaf;
4237         struct reloc_control *rc = NULL;
4238         struct btrfs_trans_handle *trans;
4239         int ret;
4240         int err = 0;
4241
4242         path = btrfs_alloc_path();
4243         if (!path)
4244                 return -ENOMEM;
4245         path->reada = -1;
4246
4247         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4248         key.type = BTRFS_ROOT_ITEM_KEY;
4249         key.offset = (u64)-1;
4250
4251         while (1) {
4252                 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4253                                         path, 0, 0);
4254                 if (ret < 0) {
4255                         err = ret;
4256                         goto out;
4257                 }
4258                 if (ret > 0) {
4259                         if (path->slots[0] == 0)
4260                                 break;
4261                         path->slots[0]--;
4262                 }
4263                 leaf = path->nodes[0];
4264                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4265                 btrfs_release_path(path);
4266
4267                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4268                     key.type != BTRFS_ROOT_ITEM_KEY)
4269                         break;
4270
4271                 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
4272                 if (IS_ERR(reloc_root)) {
4273                         err = PTR_ERR(reloc_root);
4274                         goto out;
4275                 }
4276
4277                 list_add(&reloc_root->root_list, &reloc_roots);
4278
4279                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4280                         fs_root = read_fs_root(root->fs_info,
4281                                                reloc_root->root_key.offset);
4282                         if (IS_ERR(fs_root)) {
4283                                 ret = PTR_ERR(fs_root);
4284                                 if (ret != -ENOENT) {
4285                                         err = ret;
4286                                         goto out;
4287                                 }
4288                                 ret = mark_garbage_root(reloc_root);
4289                                 if (ret < 0) {
4290                                         err = ret;
4291                                         goto out;
4292                                 }
4293                         }
4294                 }
4295
4296                 if (key.offset == 0)
4297                         break;
4298
4299                 key.offset--;
4300         }
4301         btrfs_release_path(path);
4302
4303         if (list_empty(&reloc_roots))
4304                 goto out;
4305
4306         rc = alloc_reloc_control();
4307         if (!rc) {
4308                 err = -ENOMEM;
4309                 goto out;
4310         }
4311
4312         rc->extent_root = root->fs_info->extent_root;
4313
4314         set_reloc_control(rc);
4315
4316         trans = btrfs_join_transaction(rc->extent_root);
4317         if (IS_ERR(trans)) {
4318                 unset_reloc_control(rc);
4319                 err = PTR_ERR(trans);
4320                 goto out_free;
4321         }
4322
4323         rc->merge_reloc_tree = 1;
4324
4325         while (!list_empty(&reloc_roots)) {
4326                 reloc_root = list_entry(reloc_roots.next,
4327                                         struct btrfs_root, root_list);
4328                 list_del(&reloc_root->root_list);
4329
4330                 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4331                         list_add_tail(&reloc_root->root_list,
4332                                       &rc->reloc_roots);
4333                         continue;
4334                 }
4335
4336                 fs_root = read_fs_root(root->fs_info,
4337                                        reloc_root->root_key.offset);
4338                 if (IS_ERR(fs_root)) {
4339                         err = PTR_ERR(fs_root);
4340                         goto out_free;
4341                 }
4342
4343                 err = __add_reloc_root(reloc_root);
4344                 BUG_ON(err < 0); /* -ENOMEM or logic error */
4345                 fs_root->reloc_root = reloc_root;
4346         }
4347
4348         err = btrfs_commit_transaction(trans, rc->extent_root);
4349         if (err)
4350                 goto out_free;
4351
4352         merge_reloc_roots(rc);
4353
4354         unset_reloc_control(rc);
4355
4356         trans = btrfs_join_transaction(rc->extent_root);
4357         if (IS_ERR(trans))
4358                 err = PTR_ERR(trans);
4359         else
4360                 err = btrfs_commit_transaction(trans, rc->extent_root);
4361 out_free:
4362         kfree(rc);
4363 out:
4364         if (!list_empty(&reloc_roots))
4365                 free_reloc_roots(&reloc_roots);
4366
4367         btrfs_free_path(path);
4368
4369         if (err == 0) {
4370                 /* clean up orphan inodes in the data relocation tree */
4371                 fs_root = read_fs_root(root->fs_info,
4372                                        BTRFS_DATA_RELOC_TREE_OBJECTID);
4373                 if (IS_ERR(fs_root))
4374                         err = PTR_ERR(fs_root);
4375                 else
4376                         err = btrfs_orphan_cleanup(fs_root);
4377         }
4378         return err;
4379 }
4380
4381 /*
4382  * helper to add ordered checksums for data relocation.
4383  *
4384  * cloning the existing checksums properly handles nodatasum extents
4385  * and saves the CPU time of recalculating them.
4386  */
4387 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4388 {
4389         struct btrfs_ordered_sum *sums;
4390         struct btrfs_sector_sum *sector_sum;
4391         struct btrfs_ordered_extent *ordered;
4392         struct btrfs_root *root = BTRFS_I(inode)->root;
4393         size_t offset;
4394         int ret;
4395         u64 disk_bytenr;
4396         LIST_HEAD(list);
4397
4398         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4399         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4400
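             /*
              * in the data relocation inode, the file offset of an
              * extent plus index_cnt gives the extent's old disk
              * bytenr.  the checksums found at the old location are
              * cloned and rebased onto the new location described by
              * the ordered extent.
              */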
4401         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4402         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4403                                        disk_bytenr + len - 1, &list, 0);
4404         if (ret)
4405                 goto out;
4406
4407         while (!list_empty(&list)) {
4408                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4409                 list_del_init(&sums->list);
4410
4411                 sector_sum = sums->sums;
4412                 sums->bytenr = ordered->start;
4413
4414                 offset = 0;
4415                 while (offset < sums->len) {
4416                         sector_sum->bytenr += ordered->start - disk_bytenr;
4417                         sector_sum++;
4418                         offset += root->sectorsize;
4419                 }
4420
4421                 btrfs_add_ordered_sum(inode, ordered, sums);
4422         }
4423 out:
4424         btrfs_put_ordered_extent(ordered);
4425         return ret;
4426 }
4427
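     /*
      * called when a tree block is cowed while relocation is active.
      * for blocks of the reloc tree, repoint the cached backref node
      * at the new copy; for a leaf first cowed during the data pointer
      * update stage, rewrite its file extent pointers to the new data
      * locations.
      */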
4428 void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4429                            struct btrfs_root *root, struct extent_buffer *buf,
4430                            struct extent_buffer *cow)
4431 {
4432         struct reloc_control *rc;
4433         struct backref_node *node;
4434         int first_cow = 0;
4435         int level;
4436         int ret;
4437
4438         rc = root->fs_info->reloc_ctl;
4439         if (!rc)
4440                 return;
4441
4442         BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4443                root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4444
4445         level = btrfs_header_level(buf);
4446         if (btrfs_header_generation(buf) <=
4447             btrfs_root_last_snapshot(&root->root_item))
4448                 first_cow = 1;
4449
4450         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4451             rc->create_reloc_tree) {
4452                 WARN_ON(!first_cow && level == 0);
4453
4454                 node = rc->backref_cache.path[level];
4455                 BUG_ON(node->bytenr != buf->start &&
4456                        node->new_bytenr != buf->start);
4457
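                     /* hand the cached node the new copy of the block */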
4458                 drop_node_buffer(node);
4459                 extent_buffer_get(cow);
4460                 node->eb = cow;
4461                 node->new_bytenr = cow->start;
4462
4463                 if (!node->pending) {
4464                         list_move_tail(&node->list,
4465                                        &rc->backref_cache.pending[level]);
4466                         node->pending = 1;
4467                 }
4468
4469                 if (first_cow)
4470                         __mark_block_processed(rc, node);
4471
4472                 if (first_cow && level > 0)
4473                         rc->nodes_relocated += buf->len;
4474         }
4475
4476         if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4477                 ret = replace_file_extents(trans, rc, root, cow);
4478                 BUG_ON(ret);
4479         }
4480 }
4481
4482 /*
4483  * called before creating a snapshot. it calculates the metadata
4484  * reservation required for relocating tree blocks in the snapshot
4485  */
4486 void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4487                               struct btrfs_pending_snapshot *pending,
4488                               u64 *bytes_to_reserve)
4489 {
4490         struct btrfs_root *root;
4491         struct reloc_control *rc;
4492
4493         root = pending->root;
4494         if (!root->reloc_root)
4495                 return;
4496
4497         rc = root->fs_info->reloc_ctl;
4498         if (!rc->merge_reloc_tree)
4499                 return;
4500
4501         root = root->reloc_root;
4502         BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4503         /*
4504          * relocation is in the stage of merging trees. in the worst
4505          * case, the space used by merging a reloc tree is twice the
4506          * size of the relocated tree nodes: half for cowing the
4507          * reloc tree, half for cowing the fs tree. the space used
4508          * by cowing the reloc tree will be freed once the tree is
4509          * dropped, but if we create a snapshot, cowing the fs tree
4510          * may use more space than it frees. so we need to reserve
4511          * extra space.
4512          */
4513         *bytes_to_reserve += rc->nodes_relocated;
4514 }
4515
4516 /*
4517  * called after the snapshot is created. migrate the block reservation
4518  * and create a reloc root for the newly created snapshot
4519  */
4520 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4521                                struct btrfs_pending_snapshot *pending)
4522 {
4523         struct btrfs_root *root = pending->root;
4524         struct btrfs_root *reloc_root;
4525         struct btrfs_root *new_root;
4526         struct reloc_control *rc;
4527         int ret;
4528
4529         if (!root->reloc_root)
4530                 return 0;
4531
4532         rc = root->fs_info->reloc_ctl;
4533         rc->merging_rsv_size += rc->nodes_relocated;
4534
4535         if (rc->merge_reloc_tree) {
4536                 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4537                                               rc->block_rsv,
4538                                               rc->nodes_relocated);
4539                 if (ret)
4540                         return ret;
4541         }
4542
4543         new_root = pending->snap;
4544         reloc_root = create_reloc_root(trans, root->reloc_root,
4545                                        new_root->root_key.objectid);
4546         if (IS_ERR(reloc_root))
4547                 return PTR_ERR(reloc_root);
4548
4549         ret = __add_reloc_root(reloc_root);
4550         BUG_ON(ret < 0);
4551         new_root->reloc_root = reloc_root;
4552
4553         if (rc->create_reloc_tree)
4554                 ret = clone_backref_node(trans, rc, root, reloc_root);
4555         return ret;
4556 }
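     /*
      * both snapshot hooks are driven from snapshot creation in
      * transaction.c: btrfs_reloc_pre_snapshot() sizes the extra
      * metadata reservation before the new root is created, and
      * btrfs_reloc_post_snapshot() migrates that reservation and gives
      * the new snapshot its own reloc root.
      */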