]> rtime.felk.cvut.cz Git - linux-imx.git/blob - fs/btrfs/extent-tree.c
Btrfs: return EIO if we have extent tree corruption
[linux-imx.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "raid56.h"
35 #include "locking.h"
36 #include "free-space-cache.h"
37 #include "math.h"
38
39 #undef SCRAMBLE_DELAYED_REFS
40
41 /*
42  * control flags for do_chunk_alloc's force field
43  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
44  * if we really need one.
45  *
46  * CHUNK_ALLOC_LIMITED means to only try and allocate one
47  * if we have very few chunks already allocated.  This is
48  * used as part of the clustering code to help make sure
49  * we have a good pool of storage to cluster in, without
50  * filling the FS with empty chunks
51  *
52  * CHUNK_ALLOC_FORCE means it must try to allocate one
53  *
54  */
55 enum {
56         CHUNK_ALLOC_NO_FORCE = 0,       /* allocate a chunk only if really needed */
57         CHUNK_ALLOC_LIMITED = 1,        /* allocate only when very few chunks exist */
58         CHUNK_ALLOC_FORCE = 2,          /* must try to allocate a chunk */
59 };
60
61 /*
62  * Control how reservations are dealt with.
63  *
64  * RESERVE_FREE - freeing a reservation.
65  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
66  *   ENOSPC accounting
67  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
68  *   bytes_may_use as the ENOSPC accounting is done elsewhere
69  */
70 enum {
71         RESERVE_FREE = 0,               /* freeing a reservation */
72         RESERVE_ALLOC = 1,              /* allocating; bytes_may_use must be updated */
73         RESERVE_ALLOC_NO_ACCOUNT = 2,   /* allocating; bytes_may_use handled elsewhere */
74 };
75
/*
 * Prototypes for file-local (static) helpers, declared up front so they can
 * be referenced before their definitions.
 */
76 static int update_block_group(struct btrfs_root *root,
77                               u64 bytenr, u64 num_bytes, int alloc);
78 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79                                 struct btrfs_root *root,
80                                 u64 bytenr, u64 num_bytes, u64 parent,
81                                 u64 root_objectid, u64 owner_objectid,
82                                 u64 owner_offset, int refs_to_drop,
83                                 struct btrfs_delayed_extent_op *extra_op);
84 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
85                                     struct extent_buffer *leaf,
86                                     struct btrfs_extent_item *ei);
87 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
88                                       struct btrfs_root *root,
89                                       u64 parent, u64 root_objectid,
90                                       u64 flags, u64 owner, u64 offset,
91                                       struct btrfs_key *ins, int ref_mod);
92 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
93                                      struct btrfs_root *root,
94                                      u64 parent, u64 root_objectid,
95                                      u64 flags, struct btrfs_disk_key *key,
96                                      int level, struct btrfs_key *ins);
97 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
98                           struct btrfs_root *extent_root, u64 flags,
99                           int force);
100 static int find_next_key(struct btrfs_path *path, int level,
101                          struct btrfs_key *key);
102 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103                             int dump_block_groups);
104 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105                                        u64 num_bytes, int reserve);
106 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107                                u64 num_bytes);
108
109 static noinline int
110 block_group_cache_done(struct btrfs_block_group_cache *cache)
111 {
112         smp_mb();
113         return cache->cached == BTRFS_CACHE_FINISHED;
114 }
115
116 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
117 {
118         return (cache->flags & bits) == bits;
119 }
120
121 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
122 {
123         atomic_inc(&cache->count);
124 }
125
126 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
127 {
128         if (atomic_dec_and_test(&cache->count)) {
129                 WARN_ON(cache->pinned > 0);
130                 WARN_ON(cache->reserved > 0);
131                 kfree(cache->free_space_ctl);
132                 kfree(cache);
133         }
134 }
135
136 /*
137  * this adds the block group to the fs_info rb tree for the block group
138  * cache
139  */
140 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
141                                 struct btrfs_block_group_cache *block_group)
142 {
143         struct rb_node **p;
144         struct rb_node *parent = NULL;
145         struct btrfs_block_group_cache *cache;
146
147         spin_lock(&info->block_group_cache_lock);
148         p = &info->block_group_cache_tree.rb_node;
149
150         while (*p) {
151                 parent = *p;
152                 cache = rb_entry(parent, struct btrfs_block_group_cache,
153                                  cache_node);
154                 if (block_group->key.objectid < cache->key.objectid) {
155                         p = &(*p)->rb_left;
156                 } else if (block_group->key.objectid > cache->key.objectid) {
157                         p = &(*p)->rb_right;
158                 } else {
159                         spin_unlock(&info->block_group_cache_lock);
160                         return -EEXIST;
161                 }
162         }
163
164         rb_link_node(&block_group->cache_node, parent, p);
165         rb_insert_color(&block_group->cache_node,
166                         &info->block_group_cache_tree);
167
168         if (info->first_logical_byte > block_group->key.objectid)
169                 info->first_logical_byte = block_group->key.objectid;
170
171         spin_unlock(&info->block_group_cache_lock);
172
173         return 0;
174 }
175
176 /*
177  * This will return the block group at or after bytenr if contains is 0, else
178  * it will return the block group that contains the bytenr
179  */
180 static struct btrfs_block_group_cache *
181 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
182                               int contains)
183 {
184         struct btrfs_block_group_cache *cache, *ret = NULL;
185         struct rb_node *n;
186         u64 end, start;
187
188         spin_lock(&info->block_group_cache_lock);
189         n = info->block_group_cache_tree.rb_node;
190
191         while (n) {
192                 cache = rb_entry(n, struct btrfs_block_group_cache,
193                                  cache_node);
194                 end = cache->key.objectid + cache->key.offset - 1;
195                 start = cache->key.objectid;
196
197                 if (bytenr < start) {
198                         if (!contains && (!ret || start < ret->key.objectid))
199                                 ret = cache;
200                         n = n->rb_left;
201                 } else if (bytenr > start) {
202                         if (contains && bytenr <= end) {
203                                 ret = cache;
204                                 break;
205                         }
206                         n = n->rb_right;
207                 } else {
208                         ret = cache;
209                         break;
210                 }
211         }
212         if (ret) {
213                 btrfs_get_block_group(ret);
214                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215                         info->first_logical_byte = ret->key.objectid;
216         }
217         spin_unlock(&info->block_group_cache_lock);
218
219         return ret;
220 }
221
222 static int add_excluded_extent(struct btrfs_root *root,
223                                u64 start, u64 num_bytes)
224 {
225         u64 end = start + num_bytes - 1;
226         set_extent_bits(&root->fs_info->freed_extents[0],
227                         start, end, EXTENT_UPTODATE, GFP_NOFS);
228         set_extent_bits(&root->fs_info->freed_extents[1],
229                         start, end, EXTENT_UPTODATE, GFP_NOFS);
230         return 0;
231 }
232
233 static void free_excluded_extents(struct btrfs_root *root,
234                                   struct btrfs_block_group_cache *cache)
235 {
236         u64 start, end;
237
238         start = cache->key.objectid;
239         end = start + cache->key.offset - 1;
240
241         clear_extent_bits(&root->fs_info->freed_extents[0],
242                           start, end, EXTENT_UPTODATE, GFP_NOFS);
243         clear_extent_bits(&root->fs_info->freed_extents[1],
244                           start, end, EXTENT_UPTODATE, GFP_NOFS);
245 }
246
247 static int exclude_super_stripes(struct btrfs_root *root,
248                                  struct btrfs_block_group_cache *cache)
249 {
250         u64 bytenr;
251         u64 *logical;
252         int stripe_len;
253         int i, nr, ret;
254
255         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
256                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
257                 cache->bytes_super += stripe_len;
258                 ret = add_excluded_extent(root, cache->key.objectid,
259                                           stripe_len);
260                 BUG_ON(ret); /* -ENOMEM */
261         }
262
263         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
264                 bytenr = btrfs_sb_offset(i);
265                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
266                                        cache->key.objectid, bytenr,
267                                        0, &logical, &nr, &stripe_len);
268                 BUG_ON(ret); /* -ENOMEM */
269
270                 while (nr--) {
271                         cache->bytes_super += stripe_len;
272                         ret = add_excluded_extent(root, logical[nr],
273                                                   stripe_len);
274                         BUG_ON(ret); /* -ENOMEM */
275                 }
276
277                 kfree(logical);
278         }
279         return 0;
280 }
281
282 static struct btrfs_caching_control *
283 get_caching_control(struct btrfs_block_group_cache *cache)
284 {
285         struct btrfs_caching_control *ctl;
286
287         spin_lock(&cache->lock);
288         if (cache->cached != BTRFS_CACHE_STARTED) {
289                 spin_unlock(&cache->lock);
290                 return NULL;
291         }
292
293         /* We're loading it the fast way, so we don't have a caching_ctl. */
294         if (!cache->caching_ctl) {
295                 spin_unlock(&cache->lock);
296                 return NULL;
297         }
298
299         ctl = cache->caching_ctl;
300         atomic_inc(&ctl->count);
301         spin_unlock(&cache->lock);
302         return ctl;
303 }
304
305 static void put_caching_control(struct btrfs_caching_control *ctl)
306 {
307         if (atomic_dec_and_test(&ctl->count))
308                 kfree(ctl);
309 }
310
311 /*
312  * this is only called by cache_block_group, since we could have freed extents
313  * we need to check the pinned_extents for any extents that can't be used yet
314  * since their free space will be released as soon as the transaction commits.
315  */
316 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
317                               struct btrfs_fs_info *info, u64 start, u64 end)
318 {
319         u64 extent_start, extent_end, size, total_added = 0;
320         int ret;
321
322         while (start < end) {
323                 ret = find_first_extent_bit(info->pinned_extents, start,
324                                             &extent_start, &extent_end,
325                                             EXTENT_DIRTY | EXTENT_UPTODATE,
326                                             NULL);
327                 if (ret)
328                         break;
329
330                 if (extent_start <= start) {
331                         start = extent_end + 1;
332                 } else if (extent_start > start && extent_start < end) {
333                         size = extent_start - start;
334                         total_added += size;
335                         ret = btrfs_add_free_space(block_group, start,
336                                                    size);
337                         BUG_ON(ret); /* -ENOMEM or logic error */
338                         start = extent_end + 1;
339                 } else {
340                         break;
341                 }
342         }
343
344         if (start < end) {
345                 size = end - start;
346                 total_added += size;
347                 ret = btrfs_add_free_space(block_group, start, size);
348                 BUG_ON(ret); /* -ENOMEM or logic error */
349         }
350
351         return total_added;
352 }
353
/*
 * Work function that builds the free-space information of one block group.
 *
 * @work is embedded in a struct btrfs_caching_control, which names the
 * block group to process.  The extent root's commit root is walked without
 * tree locking, starting at max(block group start, BTRFS_SUPER_INFO_OFFSET).
 * For every BTRFS_EXTENT_ITEM_KEY item inside the block group, the gap
 * between the end of the previous extent (`last`) and the start of this one
 * is passed to add_new_free_space(); the tail up to the end of the block
 * group is added after the loop.
 *
 * Waiters on caching_ctl->wait are woken each time more than 2MiB of free
 * space has been found, and once more on exit.  On normal completion the
 * block group is switched to BTRFS_CACHE_FINISHED and detached from its
 * caching control.  Before returning, one reference is dropped on both the
 * caching control and the block group.
 */
354 static noinline void caching_thread(struct btrfs_work *work)
355 {
356         struct btrfs_block_group_cache *block_group;
357         struct btrfs_fs_info *fs_info;
358         struct btrfs_caching_control *caching_ctl;
359         struct btrfs_root *extent_root;
360         struct btrfs_path *path;
361         struct extent_buffer *leaf;
362         struct btrfs_key key;
363         u64 total_found = 0;
364         u64 last = 0;
365         u32 nritems;
366         int ret = 0;
367
368         caching_ctl = container_of(work, struct btrfs_caching_control, work);
369         block_group = caching_ctl->block_group;
370         fs_info = block_group->fs_info;
371         extent_root = fs_info->extent_root;
372
373         path = btrfs_alloc_path();
            /*
             * NOTE(review): on allocation failure we jump straight to "out",
             * so block_group->cached is left unchanged and
             * free_excluded_extents() is not called on this path.
             */
374         if (!path)
375                 goto out;
376
377         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
378
379         /*
380          * We don't want to deadlock with somebody trying to allocate a new
381          * extent for the extent root while also trying to search the extent
382          * root to add free space.  So we skip locking and search the commit
383          * root, since it's read-only
384          */
385         path->skip_locking = 1;
386         path->search_commit_root = 1;
387         path->reada = 1;
388
389         key.objectid = last;
390         key.offset = 0;
391         key.type = BTRFS_EXTENT_ITEM_KEY;
392 again:
            /* (Re)start the search at `key` with both locks held. */
393         mutex_lock(&caching_ctl->mutex);
394         /* need to make sure the commit_root doesn't disappear */
395         down_read(&fs_info->extent_commit_sem);
396
397         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
398         if (ret < 0)
399                 goto err;
400
401         leaf = path->nodes[0];
402         nritems = btrfs_header_nritems(leaf);
403
404         while (1) {
                    /*
                     * Filesystem is closing: setting last to (u64)-1 makes
                     * the add_new_free_space() call after the loop a no-op.
                     */
405                 if (btrfs_fs_closing(fs_info) > 1) {
406                         last = (u64)-1;
407                         break;
408                 }
409
410                 if (path->slots[0] < nritems) {
411                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
412                 } else {
                            /* Ran off the end of this leaf. */
413                         ret = find_next_key(path, 0, &key);
414                         if (ret)
415                                 break;
416
                            /*
                             * If we should yield the CPU, or moving to the
                             * next leaf did not return 0, record progress,
                             * drop the path and both locks, and restart the
                             * search from `key`.
                             */
417                         if (need_resched() ||
418                             btrfs_next_leaf(extent_root, path)) {
419                                 caching_ctl->progress = last;
420                                 btrfs_release_path(path);
421                                 up_read(&fs_info->extent_commit_sem);
422                                 mutex_unlock(&caching_ctl->mutex);
423                                 cond_resched();
424                                 goto again;
425                         }
426                         leaf = path->nodes[0];
427                         nritems = btrfs_header_nritems(leaf);
428                         continue;
429                 }
430
                    /* Skip items that sit before the start of the block group. */
431                 if (key.objectid < block_group->key.objectid) {
432                         path->slots[0]++;
433                         continue;
434                 }
435
                    /* Stop at the first item at or past the end of the block group. */
436                 if (key.objectid >= block_group->key.objectid +
437                     block_group->key.offset)
438                         break;
439
440                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
                            /* [last, key.objectid) is the hole before this extent. */
441                         total_found += add_new_free_space(block_group,
442                                                           fs_info, last,
443                                                           key.objectid);
444                         last = key.objectid + key.offset;
445
                            /* Wake waiters after each 2MiB of newly found space. */
446                         if (total_found > (1024 * 1024 * 2)) {
447                                 total_found = 0;
448                                 wake_up(&caching_ctl->wait);
449                         }
450                 }
451                 path->slots[0]++;
452         }
453         ret = 0;
454
            /* Account the tail between the last extent and the end of the group. */
455         total_found += add_new_free_space(block_group, fs_info, last,
456                                           block_group->key.objectid +
457                                           block_group->key.offset);
458         caching_ctl->progress = (u64)-1;
459
460         spin_lock(&block_group->lock);
461         block_group->caching_ctl = NULL;
462         block_group->cached = BTRFS_CACHE_FINISHED;
463         spin_unlock(&block_group->lock);
464
465 err:
            /* Reached on success and on search error; both locks are still held. */
466         btrfs_free_path(path);
467         up_read(&fs_info->extent_commit_sem);
468
469         free_excluded_extents(extent_root, block_group);
470
471         mutex_unlock(&caching_ctl->mutex);
472 out:
473         wake_up(&caching_ctl->wait);
474
475         put_caching_control(caching_ctl);
476         btrfs_put_block_group(block_group);
477 }
478
/*
 * Start (or join) free-space caching for block group @cache.
 *
 * A caching control is allocated up front.  If another thread is in the
 * middle of a fast load (BTRFS_CACHE_FAST) we sleep until it leaves that
 * state.  If the group is then in any state other than BTRFS_CACHE_NO, the
 * new control is freed and 0 is returned.
 *
 * Otherwise the group moves to BTRFS_CACHE_FAST and, when the
 * BTRFS_MOUNT_SPACE_CACHE option is set, load_free_space_cache() is tried:
 *  - if it returns 1, the group becomes BTRFS_CACHE_FINISHED and we return 0;
 *  - otherwise (or without the mount option) the group goes back to
 *    BTRFS_CACHE_NO when @load_cache_only is set, or to BTRFS_CACHE_STARTED.
 *
 * With @load_cache_only set nothing more is done.  Otherwise the control is
 * put on fs_info->caching_block_groups and caching_thread() is queued on
 * the caching workers, with an extra reference taken on both the control
 * and the block group.
 *
 * Returns -ENOMEM if the control cannot be allocated, otherwise 0 or the
 * value left in `ret` by load_free_space_cache().
 */
479 static int cache_block_group(struct btrfs_block_group_cache *cache,
480                              int load_cache_only)
481 {
482         DEFINE_WAIT(wait);
483         struct btrfs_fs_info *fs_info = cache->fs_info;
484         struct btrfs_caching_control *caching_ctl;
485         int ret = 0;
486
487         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
488         if (!caching_ctl)
489                 return -ENOMEM;
490
491         INIT_LIST_HEAD(&caching_ctl->list);
492         mutex_init(&caching_ctl->mutex);
493         init_waitqueue_head(&caching_ctl->wait);
494         caching_ctl->block_group = cache;
495         caching_ctl->progress = cache->key.objectid;
496         atomic_set(&caching_ctl->count, 1);
497         caching_ctl->work.func = caching_thread;
498
499         spin_lock(&cache->lock);
500         /*
501          * This should be a rare occasion, but this could happen I think in the
502          * case where one thread starts to load the space cache info, and then
503          * some other thread starts a transaction commit which tries to do an
504          * allocation while the other thread is still loading the space cache
505          * info.  The previous loop should have kept us from choosing this block
506          * group, but if we've moved to the state where we will wait on caching
507          * block groups we need to first check if we're doing a fast load here,
508          * so we can wait for it to finish, otherwise we could end up allocating
509          * from a block group whose cache gets evicted for one reason or
510          * another.
511          */
512         while (cache->cached == BTRFS_CACHE_FAST) {
513                 struct btrfs_caching_control *ctl;
514
                    /* Pin the other loader's control so it survives while we sleep. */
515                 ctl = cache->caching_ctl;
516                 atomic_inc(&ctl->count);
517                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
518                 spin_unlock(&cache->lock);
519
520                 schedule();
521
522                 finish_wait(&ctl->wait, &wait);
523                 put_caching_control(ctl);
524                 spin_lock(&cache->lock);
525         }
526
            /* Caching already started or finished elsewhere: our control is unused. */
527         if (cache->cached != BTRFS_CACHE_NO) {
528                 spin_unlock(&cache->lock);
529                 kfree(caching_ctl);
530                 return 0;
531         }
532         WARN_ON(cache->caching_ctl);
533         cache->caching_ctl = caching_ctl;
534         cache->cached = BTRFS_CACHE_FAST;
535         spin_unlock(&cache->lock);
536
537         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
538                 ret = load_free_space_cache(fs_info, cache);
539
540                 spin_lock(&cache->lock);
541                 if (ret == 1) {
                            /* Fast load worked: caching is complete. */
542                         cache->caching_ctl = NULL;
543                         cache->cached = BTRFS_CACHE_FINISHED;
544                         cache->last_byte_to_unpin = (u64)-1;
545                 } else {
546                         if (load_cache_only) {
547                                 cache->caching_ctl = NULL;
548                                 cache->cached = BTRFS_CACHE_NO;
549                         } else {
550                                 cache->cached = BTRFS_CACHE_STARTED;
551                         }
552                 }
553                 spin_unlock(&cache->lock);
                    /* Release anyone sleeping in the BTRFS_CACHE_FAST loop above. */
554                 wake_up(&caching_ctl->wait);
555                 if (ret == 1) {
556                         put_caching_control(caching_ctl);
557                         free_excluded_extents(fs_info->extent_root, cache);
558                         return 0;
559                 }
560         } else {
561                 /*
562                  * We are not going to do the fast caching, set cached to the
563                  * appropriate value and wakeup any waiters.
564                  */
565                 spin_lock(&cache->lock);
566                 if (load_cache_only) {
567                         cache->caching_ctl = NULL;
568                         cache->cached = BTRFS_CACHE_NO;
569                 } else {
570                         cache->cached = BTRFS_CACHE_STARTED;
571                 }
572                 spin_unlock(&cache->lock);
573                 wake_up(&caching_ctl->wait);
574         }
575
            /* Caller only wanted the fast path: drop the initial reference and stop. */
576         if (load_cache_only) {
577                 put_caching_control(caching_ctl);
578                 return 0;
579         }
580
            /* The list entry holds its own reference on the control. */
581         down_write(&fs_info->extent_commit_sem);
582         atomic_inc(&caching_ctl->count);
583         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
584         up_write(&fs_info->extent_commit_sem);
585
            /* Block group reference that caching_thread() drops when it finishes. */
586         btrfs_get_block_group(cache);
587
588         btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
589
590         return ret;
591 }
592
593 /*
594  * return the block group that starts at or after bytenr
595  */
596 static struct btrfs_block_group_cache *
597 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
598 {
599         struct btrfs_block_group_cache *cache;
600
601         cache = block_group_cache_tree_search(info, bytenr, 0);
602
603         return cache;
604 }
605
606 /*
607  * return the block group that contains the given bytenr
608  */
609 struct btrfs_block_group_cache *btrfs_lookup_block_group(
610                                                  struct btrfs_fs_info *info,
611                                                  u64 bytenr)
612 {
613         struct btrfs_block_group_cache *cache;
614
615         cache = block_group_cache_tree_search(info, bytenr, 1);
616
617         return cache;
618 }
619
620 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
621                                                   u64 flags)
622 {
623         struct list_head *head = &info->space_info;
624         struct btrfs_space_info *found;
625
626         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
627
628         rcu_read_lock();
629         list_for_each_entry_rcu(found, head, list) {
630                 if (found->flags & flags) {
631                         rcu_read_unlock();
632                         return found;
633                 }
634         }
635         rcu_read_unlock();
636         return NULL;
637 }
638
639 /*
640  * after adding space to the filesystem, we need to clear the full flags
641  * on all the space infos.
642  */
643 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
644 {
645         struct list_head *head = &info->space_info;
646         struct btrfs_space_info *found;
647
648         rcu_read_lock();
649         list_for_each_entry_rcu(found, head, list)
650                 found->full = 0;
651         rcu_read_unlock();
652 }
653
654 u64 btrfs_find_block_group(struct btrfs_root *root,
655                            u64 search_start, u64 search_hint, int owner)
656 {
657         struct btrfs_block_group_cache *cache;
658         u64 used;
659         u64 last = max(search_hint, search_start);
660         u64 group_start = 0;
661         int full_search = 0;
662         int factor = 9;
663         int wrapped = 0;
664 again:
665         while (1) {
666                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
667                 if (!cache)
668                         break;
669
670                 spin_lock(&cache->lock);
671                 last = cache->key.objectid + cache->key.offset;
672                 used = btrfs_block_group_used(&cache->item);
673
674                 if ((full_search || !cache->ro) &&
675                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
676                         if (used + cache->pinned + cache->reserved <
677                             div_factor(cache->key.offset, factor)) {
678                                 group_start = cache->key.objectid;
679                                 spin_unlock(&cache->lock);
680                                 btrfs_put_block_group(cache);
681                                 goto found;
682                         }
683                 }
684                 spin_unlock(&cache->lock);
685                 btrfs_put_block_group(cache);
686                 cond_resched();
687         }
688         if (!wrapped) {
689                 last = search_start;
690                 wrapped = 1;
691                 goto again;
692         }
693         if (!full_search && factor < 10) {
694                 last = search_start;
695                 full_search = 1;
696                 factor = 10;
697                 goto again;
698         }
699 found:
700         return group_start;
701 }
702
703 /* simple helper to search for an existing extent at a given offset */
704 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
705 {
706         int ret;
707         struct btrfs_key key;
708         struct btrfs_path *path;
709
710         path = btrfs_alloc_path();
711         if (!path)
712                 return -ENOMEM;
713
714         key.objectid = start;
715         key.offset = len;
716         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
717         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
718                                 0, 0);
719         btrfs_free_path(path);
720         return ret;
721 }
722
723 /*
724  * helper function to lookup reference count and flags of extent.
725  *
726  * the head node for delayed ref is used to store the sum of all the
727  * reference count modifications queued up in the rbtree. the head
728  * node may also store the extent flags to set. This way you can check
729  * to see what the reference count and extent flags would be if all of
730  * the delayed refs are not processed.
731  */
/*
 * @trans:     transaction handle, or NULL to read the commit root without
 *             tree locking and without consulting delayed refs
 * @root:      any root of the filesystem; only root->fs_info is used
 * @bytenr:    start of the extent
 * @num_bytes: length of the extent (used as the key offset)
 * @refs:      if non-NULL, receives the reference count
 * @flags:     if non-NULL, receives the extent flags
 *
 * Returns 0 on success (including when no extent item exists, in which
 * case the on-disk part of the count is 0), -ENOMEM if no path can be
 * allocated, or the negative error from btrfs_search_slot().
 */
732 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
733                              struct btrfs_root *root, u64 bytenr,
734                              u64 num_bytes, u64 *refs, u64 *flags)
735 {
736         struct btrfs_delayed_ref_head *head;
737         struct btrfs_delayed_ref_root *delayed_refs;
738         struct btrfs_path *path;
739         struct btrfs_extent_item *ei;
740         struct extent_buffer *leaf;
741         struct btrfs_key key;
742         u32 item_size;
743         u64 num_refs;
744         u64 extent_flags;
745         int ret;
746
747         path = btrfs_alloc_path();
748         if (!path)
749                 return -ENOMEM;
750
751         key.objectid = bytenr;
752         key.type = BTRFS_EXTENT_ITEM_KEY;
753         key.offset = num_bytes;
            /* Without a transaction, read the commit root and skip tree locking. */
754         if (!trans) {
755                 path->skip_locking = 1;
756                 path->search_commit_root = 1;
757         }
758 again:
759         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
760                                 &key, path, 0, 0);
761         if (ret < 0)
762                 goto out_free;
763
764         if (ret == 0) {
                    /* Exact match: read refs and flags from the extent item. */
765                 leaf = path->nodes[0];
766                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
767                 if (item_size >= sizeof(*ei)) {
768                         ei = btrfs_item_ptr(leaf, path->slots[0],
769                                             struct btrfs_extent_item);
770                         num_refs = btrfs_extent_refs(leaf, ei);
771                         extent_flags = btrfs_extent_flags(leaf, ei);
772                 } else {
                            /* Item is smaller than the current format: v0 layout or BUG. */
773 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
774                         struct btrfs_extent_item_v0 *ei0;
775                         BUG_ON(item_size != sizeof(*ei0));
776                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
777                                              struct btrfs_extent_item_v0);
778                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
779                         /* FIXME: this isn't correct for data */
780                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
781 #else
782                         BUG();
783 #endif
784                 }
785                 BUG_ON(num_refs == 0);
786         } else {
                    /* No extent item in the tree; not treated as an error here. */
787                 num_refs = 0;
788                 extent_flags = 0;
789                 ret = 0;
790         }
791
792         if (!trans)
793                 goto out;
794
            /* Fold in modifications still queued as delayed refs. */
795         delayed_refs = &trans->transaction->delayed_refs;
796         spin_lock(&delayed_refs->lock);
797         head = btrfs_find_delayed_ref_head(trans, bytenr);
798         if (head) {
799                 if (!mutex_trylock(&head->mutex)) {
                            /*
                             * Hold a ref on the head so it stays valid once
                             * the spinlock is dropped, and release the path
                             * before sleeping on the mutex.
                             */
800                         atomic_inc(&head->node.refs);
801                         spin_unlock(&delayed_refs->lock);
802
803                         btrfs_release_path(path);
804
805                         /*
806                          * Mutex was contended, block until it's released and try
807                          * again
808                          */
809                         mutex_lock(&head->mutex);
810                         mutex_unlock(&head->mutex);
811                         btrfs_put_delayed_ref(&head->node);
812                         goto again;
813                 }
814                 if (head->extent_op && head->extent_op->update_flags)
815                         extent_flags |= head->extent_op->flags_to_set;
816                 else
817                         BUG_ON(num_refs == 0);
818
                    /* ref_mod is the net change queued on this head. */
819                 num_refs += head->node.ref_mod;
820                 mutex_unlock(&head->mutex);
821         }
822         spin_unlock(&delayed_refs->lock);
823 out:
824         WARN_ON(num_refs == 0);
825         if (refs)
826                 *refs = num_refs;
827         if (flags)
828                 *flags = extent_flags;
829 out_free:
830         btrfs_free_path(path);
831         return ret;
832 }
833
834 /*
835  * Back reference rules.  Back refs have three main goals:
836  *
837  * 1) differentiate between all holders of references to an extent so that
838  *    when a reference is dropped we can make sure it was a valid reference
839  *    before freeing the extent.
840  *
841  * 2) Provide enough information to quickly find the holders of an extent
842  *    if we notice a given block is corrupted or bad.
843  *
844  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
845  *    maintenance.  This is actually the same as #2, but with a slightly
846  *    different use case.
847  *
848  * There are two kinds of back refs. The implicit back refs is optimized
849  * for pointers in non-shared tree blocks. For a given pointer in a block,
850  * back refs of this kind provide information about the block's owner tree
851  * and the pointer's key. This information allows us to find the block by
852  * b-tree searching. The full back refs is for pointers in tree blocks not
853  * referenced by their owner trees. The location of tree block is recorded
854  * in the back refs. Actually the full back refs is generic, and can be
855  * used in all cases the implicit back refs is used. The major shortcoming
856  * of the full back refs is its overhead. Every time a tree block gets
857  * COWed, we have to update back refs entry for all pointers in it.
858  *
859  * For a newly allocated tree block, we use implicit back refs for
860  * pointers in it. This means most tree related operations only involve
861  * implicit back refs. For a tree block created in old transaction, the
862  * only way to drop a reference to it is COW it. So we can detect the
863  * event that tree block loses its owner tree's reference and do the
864  * back refs conversion.
865  *
866  * When a tree block is COW'd through a tree, there are four cases:
867  *
868  * The reference count of the block is one and the tree is the block's
869  * owner tree. Nothing to do in this case.
870  *
871  * The reference count of the block is one and the tree is not the
872  * block's owner tree. In this case, full back refs is used for pointers
873  * in the block. Remove these full back refs, add implicit back refs for
874  * every pointer in the new block.
875  *
876  * The reference count of the block is greater than one and the tree is
877  * the block's owner tree. In this case, implicit back refs is used for
878  * pointers in the block. Add full back refs for every pointer in the
879  * block, increase lower level extents' reference counts. The original
880  * implicit back refs are entailed to the new block.
881  *
882  * The reference count of the block is greater than one and the tree is
883  * not the block's owner tree. Add implicit back refs for every pointer in
884  * the new block, increase lower level extents' reference count.
885  *
886  * Back Reference Key composing:
887  *
888  * The key objectid corresponds to the first byte in the extent,
889  * The key type is used to differentiate between types of back refs.
890  * There are different meanings of the key offset for different types
891  * of back refs.
892  *
893  * File extents can be referenced by:
894  *
895  * - multiple snapshots, subvolumes, or different generations in one subvol
896  * - different files inside a single subvolume
897  * - different offsets inside a file (bookend extents in file.c)
898  *
899  * The extent ref structure for the implicit back refs has fields for:
900  *
901  * - Objectid of the subvolume root
902  * - objectid of the file holding the reference
903  * - original offset in the file
904  * - how many bookend extents
905  *
906  * The key offset for the implicit back refs is hash of the first
907  * three fields.
908  *
909  * The extent ref structure for the full back refs has field for:
910  *
911  * - number of pointers in the tree leaf
912  *
913  * The key offset for the full back refs is the first byte of
914  * the tree leaf
915  *
916  * When a file extent is allocated, the implicit back refs is used.
917  * The fields are filled in:
918  *
919  *     (root_key.objectid, inode objectid, offset in file, 1)
920  *
921  * When a file extent is removed by file truncation, we find the
922  * corresponding implicit back refs and check the following fields:
923  *
924  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
925  *
926  * Btree extents can be referenced by:
927  *
928  * - Different subvolumes
929  *
930  * Both the implicit back refs and the full back refs for tree blocks
931  * only consist of key. The key offset for the implicit back refs is
932  * objectid of block's owner tree. The key offset for the full back refs
933  * is the first byte of parent block.
934  *
935  * When implicit back refs is used, information about the lowest key and
936  * level of the tree block is required. This information is stored in
937  * the tree block info structure.
938  */
939
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Rewrite the v0 extent item that @path points at as a current-format
 * struct btrfs_extent_item, growing the item in place.
 *
 * @owner selects the new flags: a value below BTRFS_FIRST_FREE_OBJECTID
 * yields a tree block item (followed by a zeroed struct
 * btrfs_tree_block_info whose level is set from @owner); anything else
 * yields a data item.  An @owner of (u64)-1 means "not known": it is then
 * read from the first BTRFS_EXTENT_REF_V0_KEY item found while walking
 * forward from the current slot.
 *
 * @extra_size is added to the free space requested from
 * btrfs_search_slot() on top of what the conversion itself needs.
 *
 * Returns 0 on success (path left on the converted item) or the negative
 * value reported by btrfs_next_leaf() / btrfs_search_slot().  Unexpected
 * item sizes or keys hit BUG_ON().
 */
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct extent_buffer *eb = path->nodes[0];
	struct btrfs_extent_item_v0 *old_item;
	struct btrfs_extent_item *new_item;
	struct btrfs_tree_block_info *info;
	struct btrfs_key extent_key;
	u64 ref_count;
	u32 grow_by;
	int is_tree_block;
	int ret;

	BUG_ON(btrfs_item_size_nr(eb, path->slots[0]) != sizeof(*old_item));

	/* Remember the key and the refcount before the path is dropped. */
	btrfs_item_key_to_cpu(eb, &extent_key, path->slots[0]);
	old_item = btrfs_item_ptr(eb, path->slots[0],
				  struct btrfs_extent_item_v0);
	ref_count = btrfs_extent_refs_v0(eb, old_item);

	if (owner == (u64)-1) {
		/* Walk forward until a v0 ref item tells us the owner. */
		for (;;) {
			struct btrfs_extent_ref_v0 *old_ref;
			struct btrfs_key cur_key;

			if (path->slots[0] >= btrfs_header_nritems(eb)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				eb = path->nodes[0];
			}

			btrfs_item_key_to_cpu(eb, &cur_key, path->slots[0]);
			BUG_ON(extent_key.objectid != cur_key.objectid);

			if (cur_key.type == BTRFS_EXTENT_REF_V0_KEY) {
				old_ref = btrfs_item_ptr(eb, path->slots[0],
						struct btrfs_extent_ref_v0);
				owner = btrfs_ref_objectid_v0(eb, old_ref);
				break;
			}
			path->slots[0]++;
		}
	}
	btrfs_release_path(path);

	is_tree_block = owner < BTRFS_FIRST_FREE_OBJECTID;

	/* Number of bytes the item has to grow by. */
	grow_by = sizeof(*new_item);
	if (is_tree_block)
		grow_by += sizeof(*info);
	grow_by -= sizeof(*old_item);

	ret = btrfs_search_slot(trans, root, &extent_key, path,
				grow_by + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(trans, root, path, grow_by);

	eb = path->nodes[0];
	new_item = btrfs_item_ptr(eb, path->slots[0],
				  struct btrfs_extent_item);
	btrfs_set_extent_refs(eb, new_item, ref_count);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(eb, new_item, 0);

	if (!is_tree_block) {
		btrfs_set_extent_flags(eb, new_item, BTRFS_EXTENT_FLAG_DATA);
	} else {
		btrfs_set_extent_flags(eb, new_item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		info = (struct btrfs_tree_block_info *)(new_item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(eb, 0, (unsigned long)info,
				     sizeof(*info));
		btrfs_set_tree_block_level(eb, info, (int)owner);
	}

	btrfs_mark_buffer_dirty(eb);
	return 0;
}
#endif
1021
1022 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1023 {
1024         u32 high_crc = ~(u32)0;
1025         u32 low_crc = ~(u32)0;
1026         __le64 lenum;
1027
1028         lenum = cpu_to_le64(root_objectid);
1029         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1030         lenum = cpu_to_le64(owner);
1031         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1032         lenum = cpu_to_le64(offset);
1033         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1034
1035         return ((u64)high_crc << 31) ^ (u64)low_crc;
1036 }
1037
1038 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1039                                      struct btrfs_extent_data_ref *ref)
1040 {
1041         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1042                                     btrfs_extent_data_ref_objectid(leaf, ref),
1043                                     btrfs_extent_data_ref_offset(leaf, ref));
1044 }
1045
1046 static int match_extent_data_ref(struct extent_buffer *leaf,
1047                                  struct btrfs_extent_data_ref *ref,
1048                                  u64 root_objectid, u64 owner, u64 offset)
1049 {
1050         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1051             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1052             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1053                 return 0;
1054         return 1;
1055 }
1056
1057 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1058                                            struct btrfs_root *root,
1059                                            struct btrfs_path *path,
1060                                            u64 bytenr, u64 parent,
1061                                            u64 root_objectid,
1062                                            u64 owner, u64 offset)
1063 {
1064         struct btrfs_key key;
1065         struct btrfs_extent_data_ref *ref;
1066         struct extent_buffer *leaf;
1067         u32 nritems;
1068         int ret;
1069         int recow;
1070         int err = -ENOENT;
1071
1072         key.objectid = bytenr;
1073         if (parent) {
1074                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1075                 key.offset = parent;
1076         } else {
1077                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1078                 key.offset = hash_extent_data_ref(root_objectid,
1079                                                   owner, offset);
1080         }
1081 again:
1082         recow = 0;
1083         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1084         if (ret < 0) {
1085                 err = ret;
1086                 goto fail;
1087         }
1088
1089         if (parent) {
1090                 if (!ret)
1091                         return 0;
1092 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1093                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1094                 btrfs_release_path(path);
1095                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1096                 if (ret < 0) {
1097                         err = ret;
1098                         goto fail;
1099                 }
1100                 if (!ret)
1101                         return 0;
1102 #endif
1103                 goto fail;
1104         }
1105
1106         leaf = path->nodes[0];
1107         nritems = btrfs_header_nritems(leaf);
1108         while (1) {
1109                 if (path->slots[0] >= nritems) {
1110                         ret = btrfs_next_leaf(root, path);
1111                         if (ret < 0)
1112                                 err = ret;
1113                         if (ret)
1114                                 goto fail;
1115
1116                         leaf = path->nodes[0];
1117                         nritems = btrfs_header_nritems(leaf);
1118                         recow = 1;
1119                 }
1120
1121                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1122                 if (key.objectid != bytenr ||
1123                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1124                         goto fail;
1125
1126                 ref = btrfs_item_ptr(leaf, path->slots[0],
1127                                      struct btrfs_extent_data_ref);
1128
1129                 if (match_extent_data_ref(leaf, ref, root_objectid,
1130                                           owner, offset)) {
1131                         if (recow) {
1132                                 btrfs_release_path(path);
1133                                 goto again;
1134                         }
1135                         err = 0;
1136                         break;
1137                 }
1138                 path->slots[0]++;
1139         }
1140 fail:
1141         return err;
1142 }
1143
1144 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1145                                            struct btrfs_root *root,
1146                                            struct btrfs_path *path,
1147                                            u64 bytenr, u64 parent,
1148                                            u64 root_objectid, u64 owner,
1149                                            u64 offset, int refs_to_add)
1150 {
1151         struct btrfs_key key;
1152         struct extent_buffer *leaf;
1153         u32 size;
1154         u32 num_refs;
1155         int ret;
1156
1157         key.objectid = bytenr;
1158         if (parent) {
1159                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1160                 key.offset = parent;
1161                 size = sizeof(struct btrfs_shared_data_ref);
1162         } else {
1163                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1164                 key.offset = hash_extent_data_ref(root_objectid,
1165                                                   owner, offset);
1166                 size = sizeof(struct btrfs_extent_data_ref);
1167         }
1168
1169         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1170         if (ret && ret != -EEXIST)
1171                 goto fail;
1172
1173         leaf = path->nodes[0];
1174         if (parent) {
1175                 struct btrfs_shared_data_ref *ref;
1176                 ref = btrfs_item_ptr(leaf, path->slots[0],
1177                                      struct btrfs_shared_data_ref);
1178                 if (ret == 0) {
1179                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1180                 } else {
1181                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1182                         num_refs += refs_to_add;
1183                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1184                 }
1185         } else {
1186                 struct btrfs_extent_data_ref *ref;
1187                 while (ret == -EEXIST) {
1188                         ref = btrfs_item_ptr(leaf, path->slots[0],
1189                                              struct btrfs_extent_data_ref);
1190                         if (match_extent_data_ref(leaf, ref, root_objectid,
1191                                                   owner, offset))
1192                                 break;
1193                         btrfs_release_path(path);
1194                         key.offset++;
1195                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1196                                                       size);
1197                         if (ret && ret != -EEXIST)
1198                                 goto fail;
1199
1200                         leaf = path->nodes[0];
1201                 }
1202                 ref = btrfs_item_ptr(leaf, path->slots[0],
1203                                      struct btrfs_extent_data_ref);
1204                 if (ret == 0) {
1205                         btrfs_set_extent_data_ref_root(leaf, ref,
1206                                                        root_objectid);
1207                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1208                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1209                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1210                 } else {
1211                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1212                         num_refs += refs_to_add;
1213                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1214                 }
1215         }
1216         btrfs_mark_buffer_dirty(leaf);
1217         ret = 0;
1218 fail:
1219         btrfs_release_path(path);
1220         return ret;
1221 }
1222
1223 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1224                                            struct btrfs_root *root,
1225                                            struct btrfs_path *path,
1226                                            int refs_to_drop)
1227 {
1228         struct btrfs_key key;
1229         struct btrfs_extent_data_ref *ref1 = NULL;
1230         struct btrfs_shared_data_ref *ref2 = NULL;
1231         struct extent_buffer *leaf;
1232         u32 num_refs = 0;
1233         int ret = 0;
1234
1235         leaf = path->nodes[0];
1236         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1237
1238         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1239                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1240                                       struct btrfs_extent_data_ref);
1241                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1242         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1243                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1244                                       struct btrfs_shared_data_ref);
1245                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1246 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1247         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1248                 struct btrfs_extent_ref_v0 *ref0;
1249                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1250                                       struct btrfs_extent_ref_v0);
1251                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1252 #endif
1253         } else {
1254                 BUG();
1255         }
1256
1257         BUG_ON(num_refs < refs_to_drop);
1258         num_refs -= refs_to_drop;
1259
1260         if (num_refs == 0) {
1261                 ret = btrfs_del_item(trans, root, path);
1262         } else {
1263                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1264                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1265                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1266                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1267 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1268                 else {
1269                         struct btrfs_extent_ref_v0 *ref0;
1270                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1271                                         struct btrfs_extent_ref_v0);
1272                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1273                 }
1274 #endif
1275                 btrfs_mark_buffer_dirty(leaf);
1276         }
1277         return ret;
1278 }
1279
1280 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1281                                           struct btrfs_path *path,
1282                                           struct btrfs_extent_inline_ref *iref)
1283 {
1284         struct btrfs_key key;
1285         struct extent_buffer *leaf;
1286         struct btrfs_extent_data_ref *ref1;
1287         struct btrfs_shared_data_ref *ref2;
1288         u32 num_refs = 0;
1289
1290         leaf = path->nodes[0];
1291         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1292         if (iref) {
1293                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1294                     BTRFS_EXTENT_DATA_REF_KEY) {
1295                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1296                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1297                 } else {
1298                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1299                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1300                 }
1301         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1302                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1303                                       struct btrfs_extent_data_ref);
1304                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1305         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1306                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1307                                       struct btrfs_shared_data_ref);
1308                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1309 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1310         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1311                 struct btrfs_extent_ref_v0 *ref0;
1312                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1313                                       struct btrfs_extent_ref_v0);
1314                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1315 #endif
1316         } else {
1317                 WARN_ON(1);
1318         }
1319         return num_refs;
1320 }
1321
1322 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1323                                           struct btrfs_root *root,
1324                                           struct btrfs_path *path,
1325                                           u64 bytenr, u64 parent,
1326                                           u64 root_objectid)
1327 {
1328         struct btrfs_key key;
1329         int ret;
1330
1331         key.objectid = bytenr;
1332         if (parent) {
1333                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1334                 key.offset = parent;
1335         } else {
1336                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1337                 key.offset = root_objectid;
1338         }
1339
1340         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1341         if (ret > 0)
1342                 ret = -ENOENT;
1343 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1344         if (ret == -ENOENT && parent) {
1345                 btrfs_release_path(path);
1346                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1347                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1348                 if (ret > 0)
1349                         ret = -ENOENT;
1350         }
1351 #endif
1352         return ret;
1353 }
1354
1355 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1356                                           struct btrfs_root *root,
1357                                           struct btrfs_path *path,
1358                                           u64 bytenr, u64 parent,
1359                                           u64 root_objectid)
1360 {
1361         struct btrfs_key key;
1362         int ret;
1363
1364         key.objectid = bytenr;
1365         if (parent) {
1366                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1367                 key.offset = parent;
1368         } else {
1369                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1370                 key.offset = root_objectid;
1371         }
1372
1373         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1374         btrfs_release_path(path);
1375         return ret;
1376 }
1377
1378 static inline int extent_ref_type(u64 parent, u64 owner)
1379 {
1380         int type;
1381         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1382                 if (parent > 0)
1383                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1384                 else
1385                         type = BTRFS_TREE_BLOCK_REF_KEY;
1386         } else {
1387                 if (parent > 0)
1388                         type = BTRFS_SHARED_DATA_REF_KEY;
1389                 else
1390                         type = BTRFS_EXTENT_DATA_REF_KEY;
1391         }
1392         return type;
1393 }
1394
1395 static int find_next_key(struct btrfs_path *path, int level,
1396                          struct btrfs_key *key)
1397
1398 {
1399         for (; level < BTRFS_MAX_LEVEL; level++) {
1400                 if (!path->nodes[level])
1401                         break;
1402                 if (path->slots[level] + 1 >=
1403                     btrfs_header_nritems(path->nodes[level]))
1404                         continue;
1405                 if (level == 0)
1406                         btrfs_item_key_to_cpu(path->nodes[level], key,
1407                                               path->slots[level] + 1);
1408                 else
1409                         btrfs_node_key_to_cpu(path->nodes[level], key,
1410                                               path->slots[level] + 1);
1411                 return 0;
1412         }
1413         return 1;
1414 }
1415
1416 /*
1417  * look for inline back ref. if back ref is found, *ref_ret is set
1418  * to the address of inline back ref, and 0 is returned.
1419  *
1420  * if back ref isn't found, *ref_ret is set to the address where it
1421  * should be inserted, and -ENOENT is returned.
1422  *
1423  * if insert is true and there are too many inline back refs, the path
1424  * points to the extent item, and -EAGAIN is returned.
1425  *
1426  * NOTE: inline back refs are ordered in the same way that back ref
1427  *       items in the tree are ordered.
1428  */
1429 static noinline_for_stack
1430 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1431                                  struct btrfs_root *root,
1432                                  struct btrfs_path *path,
1433                                  struct btrfs_extent_inline_ref **ref_ret,
1434                                  u64 bytenr, u64 num_bytes,
1435                                  u64 parent, u64 root_objectid,
1436                                  u64 owner, u64 offset, int insert)
1437 {
1438         struct btrfs_key key;
1439         struct extent_buffer *leaf;
1440         struct btrfs_extent_item *ei;
1441         struct btrfs_extent_inline_ref *iref;
1442         u64 flags;
1443         u64 item_size;
1444         unsigned long ptr;
1445         unsigned long end;
1446         int extra_size;
1447         int type;
1448         int want;
1449         int ret;
1450         int err = 0;
1451
1452         key.objectid = bytenr;
1453         key.type = BTRFS_EXTENT_ITEM_KEY;
1454         key.offset = num_bytes;
1455
1456         want = extent_ref_type(parent, owner);
1457         if (insert) {
1458                 extra_size = btrfs_extent_inline_ref_size(want);
1459                 path->keep_locks = 1;
1460         } else
1461                 extra_size = -1;
1462         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1463         if (ret < 0) {
1464                 err = ret;
1465                 goto out;
1466         }
1467         if (ret && !insert) {
1468                 err = -ENOENT;
1469                 goto out;
1470         } else if (ret) {
1471                 err = -EIO;
1472                 WARN_ON(1);
1473                 goto out;
1474         }
1475
1476         leaf = path->nodes[0];
1477         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1478 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1479         if (item_size < sizeof(*ei)) {
1480                 if (!insert) {
1481                         err = -ENOENT;
1482                         goto out;
1483                 }
1484                 ret = convert_extent_item_v0(trans, root, path, owner,
1485                                              extra_size);
1486                 if (ret < 0) {
1487                         err = ret;
1488                         goto out;
1489                 }
1490                 leaf = path->nodes[0];
1491                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1492         }
1493 #endif
1494         BUG_ON(item_size < sizeof(*ei));
1495
1496         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1497         flags = btrfs_extent_flags(leaf, ei);
1498
1499         ptr = (unsigned long)(ei + 1);
1500         end = (unsigned long)ei + item_size;
1501
1502         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1503                 ptr += sizeof(struct btrfs_tree_block_info);
1504                 BUG_ON(ptr > end);
1505         } else {
1506                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1507         }
1508
1509         err = -ENOENT;
1510         while (1) {
1511                 if (ptr >= end) {
1512                         WARN_ON(ptr > end);
1513                         break;
1514                 }
1515                 iref = (struct btrfs_extent_inline_ref *)ptr;
1516                 type = btrfs_extent_inline_ref_type(leaf, iref);
1517                 if (want < type)
1518                         break;
1519                 if (want > type) {
1520                         ptr += btrfs_extent_inline_ref_size(type);
1521                         continue;
1522                 }
1523
1524                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1525                         struct btrfs_extent_data_ref *dref;
1526                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1527                         if (match_extent_data_ref(leaf, dref, root_objectid,
1528                                                   owner, offset)) {
1529                                 err = 0;
1530                                 break;
1531                         }
1532                         if (hash_extent_data_ref_item(leaf, dref) <
1533                             hash_extent_data_ref(root_objectid, owner, offset))
1534                                 break;
1535                 } else {
1536                         u64 ref_offset;
1537                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1538                         if (parent > 0) {
1539                                 if (parent == ref_offset) {
1540                                         err = 0;
1541                                         break;
1542                                 }
1543                                 if (ref_offset < parent)
1544                                         break;
1545                         } else {
1546                                 if (root_objectid == ref_offset) {
1547                                         err = 0;
1548                                         break;
1549                                 }
1550                                 if (ref_offset < root_objectid)
1551                                         break;
1552                         }
1553                 }
1554                 ptr += btrfs_extent_inline_ref_size(type);
1555         }
1556         if (err == -ENOENT && insert) {
1557                 if (item_size + extra_size >=
1558                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1559                         err = -EAGAIN;
1560                         goto out;
1561                 }
1562                 /*
1563                  * To add new inline back ref, we have to make sure
1564                  * there is no corresponding back ref item.
1565                  * For simplicity, we just do not add new inline back
1566                  * ref if there is any kind of item for this block
1567                  */
1568                 if (find_next_key(path, 0, &key) == 0 &&
1569                     key.objectid == bytenr &&
1570                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1571                         err = -EAGAIN;
1572                         goto out;
1573                 }
1574         }
1575         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1576 out:
1577         if (insert) {
1578                 path->keep_locks = 0;
1579                 btrfs_unlock_up_safe(path, 1);
1580         }
1581         return err;
1582 }
1583
1584 /*
1585  * helper to add new inline back ref
1586  */
1587 static noinline_for_stack
1588 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1589                                  struct btrfs_root *root,
1590                                  struct btrfs_path *path,
1591                                  struct btrfs_extent_inline_ref *iref,
1592                                  u64 parent, u64 root_objectid,
1593                                  u64 owner, u64 offset, int refs_to_add,
1594                                  struct btrfs_delayed_extent_op *extent_op)
1595 {
1596         struct extent_buffer *leaf;
1597         struct btrfs_extent_item *ei;
1598         unsigned long ptr;
1599         unsigned long end;
1600         unsigned long item_offset;
1601         u64 refs;
1602         int size;
1603         int type;
1604
1605         leaf = path->nodes[0];
1606         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1607         item_offset = (unsigned long)iref - (unsigned long)ei;
1608
1609         type = extent_ref_type(parent, owner);
1610         size = btrfs_extent_inline_ref_size(type);
1611
1612         btrfs_extend_item(trans, root, path, size);
1613
1614         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1615         refs = btrfs_extent_refs(leaf, ei);
1616         refs += refs_to_add;
1617         btrfs_set_extent_refs(leaf, ei, refs);
1618         if (extent_op)
1619                 __run_delayed_extent_op(extent_op, leaf, ei);
1620
1621         ptr = (unsigned long)ei + item_offset;
1622         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1623         if (ptr < end - size)
1624                 memmove_extent_buffer(leaf, ptr + size, ptr,
1625                                       end - size - ptr);
1626
1627         iref = (struct btrfs_extent_inline_ref *)ptr;
1628         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1629         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1630                 struct btrfs_extent_data_ref *dref;
1631                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1632                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1633                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1634                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1635                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1636         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1637                 struct btrfs_shared_data_ref *sref;
1638                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1639                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1640                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1641         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1642                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1643         } else {
1644                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1645         }
1646         btrfs_mark_buffer_dirty(leaf);
1647 }
1648
1649 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1650                                  struct btrfs_root *root,
1651                                  struct btrfs_path *path,
1652                                  struct btrfs_extent_inline_ref **ref_ret,
1653                                  u64 bytenr, u64 num_bytes, u64 parent,
1654                                  u64 root_objectid, u64 owner, u64 offset)
1655 {
1656         int ret;
1657
1658         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1659                                            bytenr, num_bytes, parent,
1660                                            root_objectid, owner, offset, 0);
1661         if (ret != -ENOENT)
1662                 return ret;
1663
1664         btrfs_release_path(path);
1665         *ref_ret = NULL;
1666
1667         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1668                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1669                                             root_objectid);
1670         } else {
1671                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1672                                              root_objectid, owner, offset);
1673         }
1674         return ret;
1675 }
1676
1677 /*
1678  * helper to update/remove inline back ref
1679  */
1680 static noinline_for_stack
1681 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1682                                   struct btrfs_root *root,
1683                                   struct btrfs_path *path,
1684                                   struct btrfs_extent_inline_ref *iref,
1685                                   int refs_to_mod,
1686                                   struct btrfs_delayed_extent_op *extent_op)
1687 {
1688         struct extent_buffer *leaf;
1689         struct btrfs_extent_item *ei;
1690         struct btrfs_extent_data_ref *dref = NULL;
1691         struct btrfs_shared_data_ref *sref = NULL;
1692         unsigned long ptr;
1693         unsigned long end;
1694         u32 item_size;
1695         int size;
1696         int type;
1697         u64 refs;
1698
1699         leaf = path->nodes[0];
1700         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1701         refs = btrfs_extent_refs(leaf, ei);
1702         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1703         refs += refs_to_mod;
1704         btrfs_set_extent_refs(leaf, ei, refs);
1705         if (extent_op)
1706                 __run_delayed_extent_op(extent_op, leaf, ei);
1707
1708         type = btrfs_extent_inline_ref_type(leaf, iref);
1709
1710         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1711                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1712                 refs = btrfs_extent_data_ref_count(leaf, dref);
1713         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1714                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1715                 refs = btrfs_shared_data_ref_count(leaf, sref);
1716         } else {
1717                 refs = 1;
1718                 BUG_ON(refs_to_mod != -1);
1719         }
1720
1721         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1722         refs += refs_to_mod;
1723
1724         if (refs > 0) {
1725                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1726                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1727                 else
1728                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1729         } else {
1730                 size =  btrfs_extent_inline_ref_size(type);
1731                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1732                 ptr = (unsigned long)iref;
1733                 end = (unsigned long)ei + item_size;
1734                 if (ptr + size < end)
1735                         memmove_extent_buffer(leaf, ptr, ptr + size,
1736                                               end - ptr - size);
1737                 item_size -= size;
1738                 btrfs_truncate_item(trans, root, path, item_size, 1);
1739         }
1740         btrfs_mark_buffer_dirty(leaf);
1741 }
1742
1743 static noinline_for_stack
1744 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1745                                  struct btrfs_root *root,
1746                                  struct btrfs_path *path,
1747                                  u64 bytenr, u64 num_bytes, u64 parent,
1748                                  u64 root_objectid, u64 owner,
1749                                  u64 offset, int refs_to_add,
1750                                  struct btrfs_delayed_extent_op *extent_op)
1751 {
1752         struct btrfs_extent_inline_ref *iref;
1753         int ret;
1754
1755         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1756                                            bytenr, num_bytes, parent,
1757                                            root_objectid, owner, offset, 1);
1758         if (ret == 0) {
1759                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1760                 update_inline_extent_backref(trans, root, path, iref,
1761                                              refs_to_add, extent_op);
1762         } else if (ret == -ENOENT) {
1763                 setup_inline_extent_backref(trans, root, path, iref, parent,
1764                                             root_objectid, owner, offset,
1765                                             refs_to_add, extent_op);
1766                 ret = 0;
1767         }
1768         return ret;
1769 }
1770
1771 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1772                                  struct btrfs_root *root,
1773                                  struct btrfs_path *path,
1774                                  u64 bytenr, u64 parent, u64 root_objectid,
1775                                  u64 owner, u64 offset, int refs_to_add)
1776 {
1777         int ret;
1778         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1779                 BUG_ON(refs_to_add != 1);
1780                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1781                                             parent, root_objectid);
1782         } else {
1783                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1784                                              parent, root_objectid,
1785                                              owner, offset, refs_to_add);
1786         }
1787         return ret;
1788 }
1789
1790 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1791                                  struct btrfs_root *root,
1792                                  struct btrfs_path *path,
1793                                  struct btrfs_extent_inline_ref *iref,
1794                                  int refs_to_drop, int is_data)
1795 {
1796         int ret = 0;
1797
1798         BUG_ON(!is_data && refs_to_drop != 1);
1799         if (iref) {
1800                 update_inline_extent_backref(trans, root, path, iref,
1801                                              -refs_to_drop, NULL);
1802         } else if (is_data) {
1803                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1804         } else {
1805                 ret = btrfs_del_item(trans, root, path);
1806         }
1807         return ret;
1808 }
1809
1810 static int btrfs_issue_discard(struct block_device *bdev,
1811                                 u64 start, u64 len)
1812 {
1813         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1814 }
1815
1816 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1817                                 u64 num_bytes, u64 *actual_bytes)
1818 {
1819         int ret;
1820         u64 discarded_bytes = 0;
1821         struct btrfs_bio *bbio = NULL;
1822
1823
1824         /* Tell the block device(s) that the sectors can be discarded */
1825         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1826                               bytenr, &num_bytes, &bbio, 0);
1827         /* Error condition is -ENOMEM */
1828         if (!ret) {
1829                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1830                 int i;
1831
1832
1833                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1834                         if (!stripe->dev->can_discard)
1835                                 continue;
1836
1837                         ret = btrfs_issue_discard(stripe->dev->bdev,
1838                                                   stripe->physical,
1839                                                   stripe->length);
1840                         if (!ret)
1841                                 discarded_bytes += stripe->length;
1842                         else if (ret != -EOPNOTSUPP)
1843                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1844
1845                         /*
1846                          * Just in case we get back EOPNOTSUPP for some reason,
1847                          * just ignore the return value so we don't screw up
1848                          * people calling discard_extent.
1849                          */
1850                         ret = 0;
1851                 }
1852                 kfree(bbio);
1853         }
1854
1855         if (actual_bytes)
1856                 *actual_bytes = discarded_bytes;
1857
1858
1859         if (ret == -EOPNOTSUPP)
1860                 ret = 0;
1861         return ret;
1862 }
1863
1864 /* Can return -ENOMEM */
1865 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1866                          struct btrfs_root *root,
1867                          u64 bytenr, u64 num_bytes, u64 parent,
1868                          u64 root_objectid, u64 owner, u64 offset, int for_cow)
1869 {
1870         int ret;
1871         struct btrfs_fs_info *fs_info = root->fs_info;
1872
1873         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1874                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1875
1876         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1877                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1878                                         num_bytes,
1879                                         parent, root_objectid, (int)owner,
1880                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1881         } else {
1882                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1883                                         num_bytes,
1884                                         parent, root_objectid, owner, offset,
1885                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1886         }
1887         return ret;
1888 }
1889
1890 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1891                                   struct btrfs_root *root,
1892                                   u64 bytenr, u64 num_bytes,
1893                                   u64 parent, u64 root_objectid,
1894                                   u64 owner, u64 offset, int refs_to_add,
1895                                   struct btrfs_delayed_extent_op *extent_op)
1896 {
1897         struct btrfs_path *path;
1898         struct extent_buffer *leaf;
1899         struct btrfs_extent_item *item;
1900         u64 refs;
1901         int ret;
1902         int err = 0;
1903
1904         path = btrfs_alloc_path();
1905         if (!path)
1906                 return -ENOMEM;
1907
1908         path->reada = 1;
1909         path->leave_spinning = 1;
1910         /* this will setup the path even if it fails to insert the back ref */
1911         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1912                                            path, bytenr, num_bytes, parent,
1913                                            root_objectid, owner, offset,
1914                                            refs_to_add, extent_op);
1915         if (ret == 0)
1916                 goto out;
1917
1918         if (ret != -EAGAIN) {
1919                 err = ret;
1920                 goto out;
1921         }
1922
1923         leaf = path->nodes[0];
1924         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1925         refs = btrfs_extent_refs(leaf, item);
1926         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1927         if (extent_op)
1928                 __run_delayed_extent_op(extent_op, leaf, item);
1929
1930         btrfs_mark_buffer_dirty(leaf);
1931         btrfs_release_path(path);
1932
1933         path->reada = 1;
1934         path->leave_spinning = 1;
1935
1936         /* now insert the actual backref */
1937         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1938                                     path, bytenr, parent, root_objectid,
1939                                     owner, offset, refs_to_add);
1940         if (ret)
1941                 btrfs_abort_transaction(trans, root, ret);
1942 out:
1943         btrfs_free_path(path);
1944         return err;
1945 }
1946
1947 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1948                                 struct btrfs_root *root,
1949                                 struct btrfs_delayed_ref_node *node,
1950                                 struct btrfs_delayed_extent_op *extent_op,
1951                                 int insert_reserved)
1952 {
1953         int ret = 0;
1954         struct btrfs_delayed_data_ref *ref;
1955         struct btrfs_key ins;
1956         u64 parent = 0;
1957         u64 ref_root = 0;
1958         u64 flags = 0;
1959
1960         ins.objectid = node->bytenr;
1961         ins.offset = node->num_bytes;
1962         ins.type = BTRFS_EXTENT_ITEM_KEY;
1963
1964         ref = btrfs_delayed_node_to_data_ref(node);
1965         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1966                 parent = ref->parent;
1967         else
1968                 ref_root = ref->root;
1969
1970         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1971                 if (extent_op) {
1972                         BUG_ON(extent_op->update_key);
1973                         flags |= extent_op->flags_to_set;
1974                 }
1975                 ret = alloc_reserved_file_extent(trans, root,
1976                                                  parent, ref_root, flags,
1977                                                  ref->objectid, ref->offset,
1978                                                  &ins, node->ref_mod);
1979         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1980                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1981                                              node->num_bytes, parent,
1982                                              ref_root, ref->objectid,
1983                                              ref->offset, node->ref_mod,
1984                                              extent_op);
1985         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1986                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1987                                           node->num_bytes, parent,
1988                                           ref_root, ref->objectid,
1989                                           ref->offset, node->ref_mod,
1990                                           extent_op);
1991         } else {
1992                 BUG();
1993         }
1994         return ret;
1995 }
1996
1997 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1998                                     struct extent_buffer *leaf,
1999                                     struct btrfs_extent_item *ei)
2000 {
2001         u64 flags = btrfs_extent_flags(leaf, ei);
2002         if (extent_op->update_flags) {
2003                 flags |= extent_op->flags_to_set;
2004                 btrfs_set_extent_flags(leaf, ei, flags);
2005         }
2006
2007         if (extent_op->update_key) {
2008                 struct btrfs_tree_block_info *bi;
2009                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2010                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2011                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2012         }
2013 }
2014
2015 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2016                                  struct btrfs_root *root,
2017                                  struct btrfs_delayed_ref_node *node,
2018                                  struct btrfs_delayed_extent_op *extent_op)
2019 {
2020         struct btrfs_key key;
2021         struct btrfs_path *path;
2022         struct btrfs_extent_item *ei;
2023         struct extent_buffer *leaf;
2024         u32 item_size;
2025         int ret;
2026         int err = 0;
2027
2028         if (trans->aborted)
2029                 return 0;
2030
2031         path = btrfs_alloc_path();
2032         if (!path)
2033                 return -ENOMEM;
2034
2035         key.objectid = node->bytenr;
2036         key.type = BTRFS_EXTENT_ITEM_KEY;
2037         key.offset = node->num_bytes;
2038
2039         path->reada = 1;
2040         path->leave_spinning = 1;
2041         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2042                                 path, 0, 1);
2043         if (ret < 0) {
2044                 err = ret;
2045                 goto out;
2046         }
2047         if (ret > 0) {
2048                 err = -EIO;
2049                 goto out;
2050         }
2051
2052         leaf = path->nodes[0];
2053         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2054 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2055         if (item_size < sizeof(*ei)) {
2056                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2057                                              path, (u64)-1, 0);
2058                 if (ret < 0) {
2059                         err = ret;
2060                         goto out;
2061                 }
2062                 leaf = path->nodes[0];
2063                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2064         }
2065 #endif
2066         BUG_ON(item_size < sizeof(*ei));
2067         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2068         __run_delayed_extent_op(extent_op, leaf, ei);
2069
2070         btrfs_mark_buffer_dirty(leaf);
2071 out:
2072         btrfs_free_path(path);
2073         return err;
2074 }
2075
2076 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2077                                 struct btrfs_root *root,
2078                                 struct btrfs_delayed_ref_node *node,
2079                                 struct btrfs_delayed_extent_op *extent_op,
2080                                 int insert_reserved)
2081 {
2082         int ret = 0;
2083         struct btrfs_delayed_tree_ref *ref;
2084         struct btrfs_key ins;
2085         u64 parent = 0;
2086         u64 ref_root = 0;
2087
2088         ins.objectid = node->bytenr;
2089         ins.offset = node->num_bytes;
2090         ins.type = BTRFS_EXTENT_ITEM_KEY;
2091
2092         ref = btrfs_delayed_node_to_tree_ref(node);
2093         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2094                 parent = ref->parent;
2095         else
2096                 ref_root = ref->root;
2097
2098         BUG_ON(node->ref_mod != 1);
2099         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2100                 BUG_ON(!extent_op || !extent_op->update_flags ||
2101                        !extent_op->update_key);
2102                 ret = alloc_reserved_tree_block(trans, root,
2103                                                 parent, ref_root,
2104                                                 extent_op->flags_to_set,
2105                                                 &extent_op->key,
2106                                                 ref->level, &ins);
2107         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2108                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2109                                              node->num_bytes, parent, ref_root,
2110                                              ref->level, 0, 1, extent_op);
2111         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2112                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2113                                           node->num_bytes, parent, ref_root,
2114                                           ref->level, 0, 1, extent_op);
2115         } else {
2116                 BUG();
2117         }
2118         return ret;
2119 }
2120
2121 /* helper function to actually process a single delayed ref entry */
2122 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2123                                struct btrfs_root *root,
2124                                struct btrfs_delayed_ref_node *node,
2125                                struct btrfs_delayed_extent_op *extent_op,
2126                                int insert_reserved)
2127 {
2128         int ret = 0;
2129
2130         if (trans->aborted)
2131                 return 0;
2132
2133         if (btrfs_delayed_ref_is_head(node)) {
2134                 struct btrfs_delayed_ref_head *head;
2135                 /*
2136                  * we've hit the end of the chain and we were supposed
2137                  * to insert this extent into the tree.  But, it got
2138                  * deleted before we ever needed to insert it, so all
2139                  * we have to do is clean up the accounting
2140                  */
2141                 BUG_ON(extent_op);
2142                 head = btrfs_delayed_node_to_head(node);
2143                 if (insert_reserved) {
2144                         btrfs_pin_extent(root, node->bytenr,
2145                                          node->num_bytes, 1);
2146                         if (head->is_data) {
2147                                 ret = btrfs_del_csums(trans, root,
2148                                                       node->bytenr,
2149                                                       node->num_bytes);
2150                         }
2151                 }
2152                 return ret;
2153         }
2154
2155         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2156             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2157                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2158                                            insert_reserved);
2159         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2160                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2161                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2162                                            insert_reserved);
2163         else
2164                 BUG();
2165         return ret;
2166 }
2167
2168 static noinline struct btrfs_delayed_ref_node *
2169 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2170 {
2171         struct rb_node *node;
2172         struct btrfs_delayed_ref_node *ref;
2173         int action = BTRFS_ADD_DELAYED_REF;
2174 again:
2175         /*
2176          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2177          * this prevents ref count from going down to zero when
2178          * there still are pending delayed ref.
2179          */
2180         node = rb_prev(&head->node.rb_node);
2181         while (1) {
2182                 if (!node)
2183                         break;
2184                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2185                                 rb_node);
2186                 if (ref->bytenr != head->node.bytenr)
2187                         break;
2188                 if (ref->action == action)
2189                         return ref;
2190                 node = rb_prev(node);
2191         }
2192         if (action == BTRFS_ADD_DELAYED_REF) {
2193                 action = BTRFS_DROP_DELAYED_REF;
2194                 goto again;
2195         }
2196         return NULL;
2197 }
2198
2199 /*
2200  * Returns 0 on success or if called with an already aborted transaction.
2201  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2202  */
2203 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2204                                        struct btrfs_root *root,
2205                                        struct list_head *cluster)
2206 {
2207         struct btrfs_delayed_ref_root *delayed_refs;
2208         struct btrfs_delayed_ref_node *ref;
2209         struct btrfs_delayed_ref_head *locked_ref = NULL;
2210         struct btrfs_delayed_extent_op *extent_op;
2211         struct btrfs_fs_info *fs_info = root->fs_info;
2212         int ret;
2213         int count = 0;
2214         int must_insert_reserved = 0;
2215
2216         delayed_refs = &trans->transaction->delayed_refs;
2217         while (1) {
2218                 if (!locked_ref) {
2219                         /* pick a new head ref from the cluster list */
2220                         if (list_empty(cluster))
2221                                 break;
2222
2223                         locked_ref = list_entry(cluster->next,
2224                                      struct btrfs_delayed_ref_head, cluster);
2225
2226                         /* grab the lock that says we are going to process
2227                          * all the refs for this head */
2228                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2229
2230                         /*
2231                          * we may have dropped the spin lock to get the head
2232                          * mutex lock, and that might have given someone else
2233                          * time to free the head.  If that's true, it has been
2234                          * removed from our list and we can move on.
2235                          */
2236                         if (ret == -EAGAIN) {
2237                                 locked_ref = NULL;
2238                                 count++;
2239                                 continue;
2240                         }
2241                 }
2242
2243                 /*
2244                  * We need to try and merge add/drops of the same ref since we
2245                  * can run into issues with relocate dropping the implicit ref
2246                  * and then it being added back again before the drop can
2247                  * finish.  If we merged anything we need to re-loop so we can
2248                  * get a good ref.
2249                  */
2250                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2251                                          locked_ref);
2252
2253                 /*
2254                  * locked_ref is the head node, so we have to go one
2255                  * node back for any delayed ref updates
2256                  */
2257                 ref = select_delayed_ref(locked_ref);
2258
2259                 if (ref && ref->seq &&
2260                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2261                         /*
2262                          * there are still refs with lower seq numbers in the
2263                          * process of being added. Don't run this ref yet.
2264                          */
2265                         list_del_init(&locked_ref->cluster);
2266                         btrfs_delayed_ref_unlock(locked_ref);
2267                         locked_ref = NULL;
2268                         delayed_refs->num_heads_ready++;
2269                         spin_unlock(&delayed_refs->lock);
2270                         cond_resched();
2271                         spin_lock(&delayed_refs->lock);
2272                         continue;
2273                 }
2274
2275                 /*
2276                  * record the must insert reserved flag before we
2277                  * drop the spin lock.
2278                  */
2279                 must_insert_reserved = locked_ref->must_insert_reserved;
2280                 locked_ref->must_insert_reserved = 0;
2281
2282                 extent_op = locked_ref->extent_op;
2283                 locked_ref->extent_op = NULL;
2284
2285                 if (!ref) {
2286                         /* All delayed refs have been processed, Go ahead
2287                          * and send the head node to run_one_delayed_ref,
2288                          * so that any accounting fixes can happen
2289                          */
2290                         ref = &locked_ref->node;
2291
2292                         if (extent_op && must_insert_reserved) {
2293                                 btrfs_free_delayed_extent_op(extent_op);
2294                                 extent_op = NULL;
2295                         }
2296
2297                         if (extent_op) {
2298                                 spin_unlock(&delayed_refs->lock);
2299
2300                                 ret = run_delayed_extent_op(trans, root,
2301                                                             ref, extent_op);
2302                                 btrfs_free_delayed_extent_op(extent_op);
2303
2304                                 if (ret) {
2305                                         printk(KERN_DEBUG
2306                                                "btrfs: run_delayed_extent_op "
2307                                                "returned %d\n", ret);
2308                                         spin_lock(&delayed_refs->lock);
2309                                         btrfs_delayed_ref_unlock(locked_ref);
2310                                         return ret;
2311                                 }
2312
2313                                 goto next;
2314                         }
2315                 }
2316
2317                 ref->in_tree = 0;
2318                 rb_erase(&ref->rb_node, &delayed_refs->root);
2319                 delayed_refs->num_entries--;
2320                 if (!btrfs_delayed_ref_is_head(ref)) {
2321                         /*
2322                          * when we play the delayed ref, also correct the
2323                          * ref_mod on head
2324                          */
2325                         switch (ref->action) {
2326                         case BTRFS_ADD_DELAYED_REF:
2327                         case BTRFS_ADD_DELAYED_EXTENT:
2328                                 locked_ref->node.ref_mod -= ref->ref_mod;
2329                                 break;
2330                         case BTRFS_DROP_DELAYED_REF:
2331                                 locked_ref->node.ref_mod += ref->ref_mod;
2332                                 break;
2333                         default:
2334                                 WARN_ON(1);
2335                         }
2336                 }
2337                 spin_unlock(&delayed_refs->lock);
2338
2339                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2340                                           must_insert_reserved);
2341
2342                 btrfs_free_delayed_extent_op(extent_op);
2343                 if (ret) {
2344                         btrfs_delayed_ref_unlock(locked_ref);
2345                         btrfs_put_delayed_ref(ref);
2346                         printk(KERN_DEBUG
2347                                "btrfs: run_one_delayed_ref returned %d\n", ret);
2348                         spin_lock(&delayed_refs->lock);
2349                         return ret;
2350                 }
2351
2352                 /*
2353                  * If this node is a head, that means all the refs in this head
2354                  * have been dealt with, and we will pick the next head to deal
2355                  * with, so we must unlock the head and drop it from the cluster
2356                  * list before we release it.
2357                  */
2358                 if (btrfs_delayed_ref_is_head(ref)) {
2359                         list_del_init(&locked_ref->cluster);
2360                         btrfs_delayed_ref_unlock(locked_ref);
2361                         locked_ref = NULL;
2362                 }
2363                 btrfs_put_delayed_ref(ref);
2364                 count++;
2365 next:
2366                 cond_resched();
2367                 spin_lock(&delayed_refs->lock);
2368         }
2369         return count;
2370 }
2371
2372 #ifdef SCRAMBLE_DELAYED_REFS
2373 /*
2374  * Normally delayed refs get processed in ascending bytenr order. This
2375  * correlates in most cases to the order added. To expose dependencies on this
2376  * order, we start to process the tree in the middle instead of the beginning
      *
      * The "middle" is found by walking down from the root of the rbtree,
      * taking the left child first and then alternating right/left, and
      * returning the bytenr of the last node visited.  Returns 0 when the
      * tree is empty.
2377  */
2378 static u64 find_middle(struct rb_root *root)
2379 {
2380         struct rb_node *n = root->rb_node;
2381         struct btrfs_delayed_ref_node *entry;
2382         int alt = 1;
             /* start at 0 so an empty tree does not return an indeterminate value */
2383         u64 middle = 0;
2384         u64 first = 0, last = 0;
2385
             /* first/last are recorded here but are not used by the walk below */
2386         n = rb_first(root);
2387         if (n) {
2388                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2389                 first = entry->bytenr;
2390         }
2391         n = rb_last(root);
2392         if (n) {
2393                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2394                 last = entry->bytenr;
2395         }
2396         n = root->rb_node;
2397
             /* zig-zag down the tree; the last node reached supplies the result */
2398         while (n) {
2399                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2400                 WARN_ON(!entry->in_tree);
2401
2402                 middle = entry->bytenr;
2403
2404                 if (alt)
2405                         n = n->rb_left;
2406                 else
2407                         n = n->rb_right;
2408
2409                 alt = 1 - alt;
2410         }
2411         return middle;
2412 }
2413 #endif
2414
/*
 * Run the qgroup accounting updates queued on this transaction handle.
 *
 * Every entry on trans->qgroup_ref_list is unlinked and freed.  Entries are
 * handed to btrfs_qgroup_account_ref() until one call returns non-zero; the
 * entries after that one are freed without being accounted.  Once the list
 * is drained, btrfs_put_tree_mod_seq() is called on trans->delayed_ref_elem.
 *
 * Returns 0 when trans->delayed_ref_elem.seq is zero or when every entry
 * was accounted, otherwise the first non-zero btrfs_qgroup_account_ref()
 * result.  Hits BUG() when the list being empty and the seq being zero
 * disagree with each other.
 */
2415 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2416                                          struct btrfs_fs_info *fs_info)
2417 {
2418         struct qgroup_update *qgroup_update;
2419         int ret = 0;
2420
2421         if (list_empty(&trans->qgroup_ref_list) !=
2422             !trans->delayed_ref_elem.seq) {
2423                 /* list without seq or seq without list */
2424                 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2425                         list_empty(&trans->qgroup_ref_list) ? "" : " not",
2426                         trans->delayed_ref_elem.seq);
2427                 BUG();
2428         }
2429
             /* nothing was queued on this handle */
2430         if (!trans->delayed_ref_elem.seq)
2431                 return 0;
2432
2433         while (!list_empty(&trans->qgroup_ref_list)) {
2434                 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2435                                                  struct qgroup_update, list);
2436                 list_del(&qgroup_update->list);
                     /* after the first failure keep draining, but stop accounting */
2437                 if (!ret)
2438                         ret = btrfs_qgroup_account_ref(
2439                                         trans, fs_info, qgroup_update->node,
2440                                         qgroup_update->extent_op);
2441                 kfree(qgroup_update);
2442         }
2443
2444         btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2445
2446         return ret;
2447 }
2448
/*
 * Returns 1 when delayed_refs->ref_seq currently lies outside the window
 * [seq, seq + count), and 0 while it is still inside that window.
 */
2449 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2450                       int count)
2451 {
2452         int val = atomic_read(&delayed_refs->ref_seq);
2453
2454         if (val < seq || val >= seq + count)
2455                 return 1;
2456         return 0;
2457 }
2458
2459 /*
2460  * this starts processing the delayed reference count updates and
2461  * extent insertions we have queued up so far.  count can be
2462  * 0, which means to process everything in the tree at the start
2463  * of the run (but not newly added entries), or it can be some target
2464  * number you'd like to process.
2465  *
2466  * Returns 0 on success or if called with an aborted transaction
2467  * Returns <0 on error and aborts the transaction
2468  */
2469 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2470                            struct btrfs_root *root, unsigned long count)
2471 {
2472         struct rb_node *node;
2473         struct btrfs_delayed_ref_root *delayed_refs;
2474         struct btrfs_delayed_ref_node *ref;
2475         struct list_head cluster;
2476         int ret;
2477         u64 delayed_start;
2478         int run_all = count == (unsigned long)-1;
2479         int run_most = 0;
2480         int loops;
2481
2482         /* We'll clean this up in btrfs_cleanup_transaction */
2483         if (trans->aborted)
2484                 return 0;
2485
2486         if (root == root->fs_info->extent_root)
2487                 root = root->fs_info->tree_root;
2488
2489         btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2490
2491         delayed_refs = &trans->transaction->delayed_refs;
2492         INIT_LIST_HEAD(&cluster);
2493         if (count == 0) {
2494                 count = delayed_refs->num_entries * 2;
2495                 run_most = 1;
2496         }
2497
2498         if (!run_all && !run_most) {
2499                 int old;
2500                 int seq = atomic_read(&delayed_refs->ref_seq);
2501
2502 progress:
2503                 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2504                 if (old) {
2505                         DEFINE_WAIT(__wait);
2506                         if (delayed_refs->num_entries < 16348)
2507                                 return 0;
2508
2509                         prepare_to_wait(&delayed_refs->wait, &__wait,
2510                                         TASK_UNINTERRUPTIBLE);
2511
2512                         old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2513                         if (old) {
2514                                 schedule();
2515                                 finish_wait(&delayed_refs->wait, &__wait);
2516
2517                                 if (!refs_newer(delayed_refs, seq, 256))
2518                                         goto progress;
2519                                 else
2520                                         return 0;
2521                         } else {
2522                                 finish_wait(&delayed_refs->wait, &__wait);
2523                                 goto again;
2524                         }
2525                 }
2526
2527         } else {
2528                 atomic_inc(&delayed_refs->procs_running_refs);
2529         }
2530
2531 again:
2532         loops = 0;
2533         spin_lock(&delayed_refs->lock);
2534
2535 #ifdef SCRAMBLE_DELAYED_REFS
2536         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2537 #endif
2538
2539         while (1) {
2540                 if (!(run_all || run_most) &&
2541                     delayed_refs->num_heads_ready < 64)
2542                         break;
2543
2544                 /*
2545                  * go find something we can process in the rbtree.  We start at
2546                  * the beginning of the tree, and then build a cluster
2547                  * of refs to process starting at the first one we are able to
2548                  * lock
2549                  */
2550                 delayed_start = delayed_refs->run_delayed_start;
2551                 ret = btrfs_find_ref_cluster(trans, &cluster,
2552                                              delayed_refs->run_delayed_start);
2553                 if (ret)
2554                         break;
2555
2556                 ret = run_clustered_refs(trans, root, &cluster);
2557                 if (ret < 0) {
2558                         btrfs_release_ref_cluster(&cluster);
2559                         spin_unlock(&delayed_refs->lock);
2560                         btrfs_abort_transaction(trans, root, ret);
2561                         atomic_dec(&delayed_refs->procs_running_refs);
2562                         return ret;
2563                 }
2564
2565                 atomic_add(ret, &delayed_refs->ref_seq);
2566
2567                 count -= min_t(unsigned long, ret, count);
2568
2569                 if (count == 0)
2570                         break;
2571
2572                 if (delayed_start >= delayed_refs->run_delayed_start) {
2573                         if (loops == 0) {
2574                                 /*
2575                                  * btrfs_find_ref_cluster looped. let's do one
2576                                  * more cycle. if we don't run any delayed ref
2577                                  * during that cycle (because we can't because
2578                                  * all of them are blocked), bail out.
2579                                  */
2580                                 loops = 1;
2581                         } else {
2582                                 /*
2583                                  * no runnable refs left, stop trying
2584                                  */
2585                                 BUG_ON(run_all);
2586                                 break;
2587                         }
2588                 }
2589                 if (ret) {
2590                         /* refs were run, let's reset staleness detection */
2591                         loops = 0;
2592                 }
2593         }
2594
2595         if (run_all) {
2596                 if (!list_empty(&trans->new_bgs)) {
2597                         spin_unlock(&delayed_refs->lock);
2598                         btrfs_create_pending_block_groups(trans, root);
2599                         spin_lock(&delayed_refs->lock);
2600                 }
2601
2602                 node = rb_first(&delayed_refs->root);
2603                 if (!node)
2604                         goto out;
2605                 count = (unsigned long)-1;
2606
2607                 while (node) {
2608                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2609                                        rb_node);
2610                         if (btrfs_delayed_ref_is_head(ref)) {
2611                                 struct btrfs_delayed_ref_head *head;
2612
2613                                 head = btrfs_delayed_node_to_head(ref);
2614                                 atomic_inc(&ref->refs);
2615
2616                                 spin_unlock(&delayed_refs->lock);
2617                                 /*
2618                                  * Mutex was contended, block until it's
2619                                  * released and try again
2620                                  */
2621                                 mutex_lock(&head->mutex);
2622                                 mutex_unlock(&head->mutex);
2623
2624                                 btrfs_put_delayed_ref(ref);
2625                                 cond_resched();
2626                                 goto again;
2627                         }
2628                         node = rb_next(node);
2629                 }
2630                 spin_unlock(&delayed_refs->lock);
2631                 schedule_timeout(1);
2632                 goto again;
2633         }
2634 out:
2635         atomic_dec(&delayed_refs->procs_running_refs);
2636         smp_mb();
2637         if (waitqueue_active(&delayed_refs->wait))
2638                 wake_up(&delayed_refs->wait);
2639
2640         spin_unlock(&delayed_refs->lock);
2641         assert_qgroups_uptodate(trans);
2642         return 0;
2643 }
2644
/*
 * Queue a delayed extent op that sets @flags on the extent described by
 * @bytenr/@num_bytes.  The op is marked as a flags update only
 * (update_flags = 1, update_key = 0) and @is_data is stored as 0 or 1.
 *
 * Returns -ENOMEM when the extent op cannot be allocated, otherwise the
 * result of btrfs_add_delayed_extent_op().  The op is freed here when that
 * call returns non-zero.
 */
2645 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2646                                 struct btrfs_root *root,
2647                                 u64 bytenr, u64 num_bytes, u64 flags,
2648                                 int is_data)
2649 {
2650         struct btrfs_delayed_extent_op *extent_op;
2651         int ret;
2652
2653         extent_op = btrfs_alloc_delayed_extent_op();
2654         if (!extent_op)
2655                 return -ENOMEM;
2656
2657         extent_op->flags_to_set = flags;
2658         extent_op->update_flags = 1;
2659         extent_op->update_key = 0;
2660         extent_op->is_data = is_data ? 1 : 0;
2661
2662         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2663                                           num_bytes, extent_op);
2664         if (ret)
2665                 btrfs_free_delayed_extent_op(extent_op);
2666         return ret;
2667 }
2668
/*
 * Inspect the delayed refs queued in this transaction for the extent at
 * @bytenr and check them against the single expected data reference
 * (@root, @objectid, @offset).
 *
 * Returns:
 *   0        the ref right before the head is a data ref for @bytenr whose
 *            root/objectid/offset match, and the ref before that one (if
 *            any) does not share both its bytenr and its seq
 *   1        a ref for @bytenr is queued but fails one of those checks
 *   -ENOENT  there is no head for @bytenr, or no ref for @bytenr sits
 *            before the head in the rbtree
 *   -EAGAIN  the head mutex was contended; @path has been released, the
 *            mutex holder has been waited for, and the caller should retry
 */
2669 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2670                                       struct btrfs_root *root,
2671                                       struct btrfs_path *path,
2672                                       u64 objectid, u64 offset, u64 bytenr)
2673 {
2674         struct btrfs_delayed_ref_head *head;
2675         struct btrfs_delayed_ref_node *ref;
2676         struct btrfs_delayed_data_ref *data_ref;
2677         struct btrfs_delayed_ref_root *delayed_refs;
2678         struct rb_node *node;
2679         int ret = 0;
2680
2681         ret = -ENOENT;
2682         delayed_refs = &trans->transaction->delayed_refs;
2683         spin_lock(&delayed_refs->lock);
2684         head = btrfs_find_delayed_ref_head(trans, bytenr);
2685         if (!head)
2686                 goto out;
2687
2688         if (!mutex_trylock(&head->mutex)) {
                     /* pin the head so it stays valid once the spin lock is dropped */
2689                 atomic_inc(&head->node.refs);
2690                 spin_unlock(&delayed_refs->lock);
2691
2692                 btrfs_release_path(path);
2693
2694                 /*
2695                  * Mutex was contended, block until it's released and let
2696                  * caller try again
2697                  */
2698                 mutex_lock(&head->mutex);
2699                 mutex_unlock(&head->mutex);
2700                 btrfs_put_delayed_ref(&head->node);
2701                 return -EAGAIN;
2702         }
2703
             /* the entry just before the head in the rbtree, if there is one */
2704         node = rb_prev(&head->node.rb_node);
2705         if (!node)
2706                 goto out_unlock;
2707
2708         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2709
2710         if (ref->bytenr != bytenr)
2711                 goto out_unlock;
2712
             /* from here on a ref for this extent exists; assume a conflict */
2713         ret = 1;
2714         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2715                 goto out_unlock;
2716
2717         data_ref = btrfs_delayed_node_to_data_ref(ref);
2718
2719         node = rb_prev(node);
2720         if (node) {
2721                 int seq = ref->seq;
2722
2723                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2724                 if (ref->bytenr == bytenr && ref->seq == seq)
2725                         goto out_unlock;
2726         }
2727
2728         if (data_ref->root != root->root_key.objectid ||
2729             data_ref->objectid != objectid || data_ref->offset != offset)
2730                 goto out_unlock;
2731
2732         ret = 0;
2733 out_unlock:
2734         mutex_unlock(&head->mutex);
2735 out:
2736         spin_unlock(&delayed_refs->lock);
2737         return ret;
2738 }
2739
/*
 * Look up the extent item for @bytenr in the extent tree and check whether
 * it carries exactly one inline data ref that matches
 * (@root, @objectid, @offset).
 *
 * Returns:
 *   0        the extent item holds exactly one inline data ref, its count
 *            equals the extent's total refs, it matches root/objectid/offset
 *            and the extent generation is newer than the root's last snapshot
 *   1        an extent item for @bytenr exists but fails one of those checks
 *   -ENOENT  no extent item for @bytenr precedes the search position
 *   <0       error returned by btrfs_search_slot()
 *
 * @path is not released here.  @trans is not used by this function; the
 * search is issued with a NULL transaction handle.
 */
2740 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2741                                         struct btrfs_root *root,
2742                                         struct btrfs_path *path,
2743                                         u64 objectid, u64 offset, u64 bytenr)
2744 {
2745         struct btrfs_root *extent_root = root->fs_info->extent_root;
2746         struct extent_buffer *leaf;
2747         struct btrfs_extent_data_ref *ref;
2748         struct btrfs_extent_inline_ref *iref;
2749         struct btrfs_extent_item *ei;
2750         struct btrfs_key key;
2751         u32 item_size;
2752         int ret;
2753
             /* search with the largest offset so the wanted item is the previous slot */
2754         key.objectid = bytenr;
2755         key.offset = (u64)-1;
2756         key.type = BTRFS_EXTENT_ITEM_KEY;
2757
2758         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2759         if (ret < 0)
2760                 goto out;
2761         BUG_ON(ret == 0); /* Corruption */
2762
2763         ret = -ENOENT;
2764         if (path->slots[0] == 0)
2765                 goto out;
2766
2767         path->slots[0]--;
2768         leaf = path->nodes[0];
2769         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2770
2771         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2772                 goto out;
2773
             /* an extent item exists; report a conflict unless every check passes */
2774         ret = 1;
2775         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2776 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2777         if (item_size < sizeof(*ei)) {
2778                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2779                 goto out;
2780         }
2781 #endif
2782         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2783
             /* the item must be exactly one extent item plus one inline data ref */
2784         if (item_size != sizeof(*ei) +
2785             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2786                 goto out;
2787
2788         if (btrfs_extent_generation(leaf, ei) <=
2789             btrfs_root_last_snapshot(&root->root_item))
2790                 goto out;
2791
2792         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2793         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2794             BTRFS_EXTENT_DATA_REF_KEY)
2795                 goto out;
2796
2797         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2798         if (btrfs_extent_refs(leaf, ei) !=
2799             btrfs_extent_data_ref_count(leaf, ref) ||
2800             btrfs_extent_data_ref_root(leaf, ref) !=
2801             root->root_key.objectid ||
2802             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2803             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2804                 goto out;
2805
2806         ret = 0;
2807 out:
2808         return ret;
2809 }
2810
/*
 * Check whether the data extent at @bytenr may be referenced by anything
 * other than (@root, @objectid, @offset), looking at both the committed
 * extent tree and the delayed refs of the running transaction.
 *
 * Returns:
 *   0        neither check found a conflicting reference and at least one
 *            of them found the extent
 *   >0       one of the checks found a reference that does not match
 *   -ENOENT  neither the extent tree nor the delayed refs know the extent
 *   -ENOMEM  the path could not be allocated
 *   <0       other error from check_committed_ref()
 *
 * Any non-zero result means the caller cannot rely on the extent being
 * unshared.
 */
2811 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2812                           struct btrfs_root *root,
2813                           u64 objectid, u64 offset, u64 bytenr)
2814 {
2815         struct btrfs_path *path;
2816         int ret;
2817         int ret2;
2818
2819         path = btrfs_alloc_path();
             /* report allocation failure as such; -ENOENT means "not found" below */
2820         if (!path)
2821                 return -ENOMEM;
2822
             /* check_delayed_ref() returns -EAGAIN after waiting on a busy head */
2823         do {
2824                 ret = check_committed_ref(trans, root, path, objectid,
2825                                           offset, bytenr);
2826                 if (ret && ret != -ENOENT)
2827                         goto out;
2828
2829                 ret2 = check_delayed_ref(trans, root, path, objectid,
2830                                          offset, bytenr);
2831         } while (ret2 == -EAGAIN);
2832
2833         if (ret2 && ret2 != -ENOENT) {
2834                 ret = ret2;
2835                 goto out;
2836         }
2837
             /* both are 0 or -ENOENT here; -ENOENT survives only if both missed */
2838         if (ret != -ENOENT || ret2 != -ENOENT)
2839                 ret = 0;
2840 out:
2841         btrfs_free_path(path);
2842         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2843                 WARN_ON(ret > 0);
2844         return ret;
2845 }
2846
/*
 * Add or drop one reference on everything the tree block @buf points to.
 *
 * For a leaf (level 0) this covers each file extent item that is not inline
 * and has a non-zero disk bytenr; for a node it covers every child block
 * pointer.  @inc selects btrfs_inc_extent_ref() (non-zero) or
 * btrfs_free_extent() (zero).  With @full_backref set, buf->start is passed
 * as the parent, otherwise 0.  @for_cow is forwarded unchanged.
 *
 * Returns 0 right away for a leaf when root->ref_cows is not set.
 * Otherwise returns 0 on success, or the first non-zero result from the
 * selected function; items after the failing one are not processed.
 */
2847 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2848                            struct btrfs_root *root,
2849                            struct extent_buffer *buf,
2850                            int full_backref, int inc, int for_cow)
2851 {
2852         u64 bytenr;
2853         u64 num_bytes;
2854         u64 parent;
2855         u64 ref_root;
2856         u32 nritems;
2857         struct btrfs_key key;
2858         struct btrfs_file_extent_item *fi;
2859         int i;
2860         int level;
2861         int ret = 0;
2862         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2863                             u64, u64, u64, u64, u64, u64, int);
2864
2865         ref_root = btrfs_header_owner(buf);
2866         nritems = btrfs_header_nritems(buf);
2867         level = btrfs_header_level(buf);
2868
2869         if (!root->ref_cows && level == 0)
2870                 return 0;
2871
2872         if (inc)
2873                 process_func = btrfs_inc_extent_ref;
2874         else
2875                 process_func = btrfs_free_extent;
2876
2877         if (full_backref)
2878                 parent = buf->start;
2879         else
2880                 parent = 0;
2881
2882         for (i = 0; i < nritems; i++) {
2883                 if (level == 0) {
2884                         btrfs_item_key_to_cpu(buf, &key, i);
2885                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2886                                 continue;
2887                         fi = btrfs_item_ptr(buf, i,
2888                                             struct btrfs_file_extent_item);
2889                         if (btrfs_file_extent_type(buf, fi) ==
2890                             BTRFS_FILE_EXTENT_INLINE)
2891                                 continue;
2892                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2893                         if (bytenr == 0)
2894                                 continue;
2895
2896                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
                             /* pass the key offset minus the item's extent offset */
2897                         key.offset -= btrfs_file_extent_offset(buf, fi);
2898                         ret = process_func(trans, root, bytenr, num_bytes,
2899                                            parent, ref_root, key.objectid,
2900                                            key.offset, for_cow);
2901                         if (ret)
2902                                 goto fail;
2903                 } else {
2904                         bytenr = btrfs_node_blockptr(buf, i);
2905                         num_bytes = btrfs_level_size(root, level - 1);
2906                         ret = process_func(trans, root, bytenr, num_bytes,
2907                                            parent, ref_root, level - 1, 0,
2908                                            for_cow);
2909                         if (ret)
2910                                 goto fail;
2911                 }
2912         }
2913         return 0;
2914 fail:
2915         return ret;
2916 }
2917
/*
 * Add one reference to everything @buf points to.  Thin wrapper around
 * __btrfs_mod_ref() with inc = 1; returns its result.
 */
2918 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2919                   struct extent_buffer *buf, int full_backref, int for_cow)
2920 {
2921         return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2922 }
2923
/*
 * Drop one reference from everything @buf points to.  Thin wrapper around
 * __btrfs_mod_ref() with inc = 0; returns its result.
 */
2924 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2925                   struct extent_buffer *buf, int full_backref, int for_cow)
2926 {
2927         return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2928 }
2929
/*
 * Copy the in-memory block group item of @cache over its existing item in
 * the extent tree and mark the leaf dirty.
 *
 * The item is looked up by cache->key.  A negative search result aborts the
 * transaction and is returned; in that case @path is not released here.
 * A positive search result (key not found) hits BUG_ON().
 *
 * Returns 0 on success.
 */
2930 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2931                                  struct btrfs_root *root,
2932                                  struct btrfs_path *path,
2933                                  struct btrfs_block_group_cache *cache)
2934 {
2935         int ret;
2936         struct btrfs_root *extent_root = root->fs_info->extent_root;
2937         unsigned long bi;
2938         struct extent_buffer *leaf;
2939
2940         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2941         if (ret < 0)
2942                 goto fail;
2943         BUG_ON(ret); /* Corruption */
2944
2945         leaf = path->nodes[0];
2946         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2947         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2948         btrfs_mark_buffer_dirty(leaf);
2949         btrfs_release_path(path);
2950 fail:
             /* ret is 0 when falling through from the success path above */
2951         if (ret) {
2952                 btrfs_abort_transaction(trans, root, ret);
2953                 return ret;
2954         }
2955         return 0;
2956
2957 }
2958
/*
 * Step from @cache to the next block group in the rbtree, under
 * fs_info->block_group_cache_lock.
 *
 * The reference on @cache is dropped in every case.  When a next entry
 * exists, a reference is taken on it and it is returned; otherwise NULL is
 * returned.
 */
2959 static struct btrfs_block_group_cache *
2960 next_block_group(struct btrfs_root *root,
2961                  struct btrfs_block_group_cache *cache)
2962 {
2963         struct rb_node *node;
2964         spin_lock(&root->fs_info->block_group_cache_lock);
             /* fetch the successor before dropping our reference on @cache */
2965         node = rb_next(&cache->cache_node);
2966         btrfs_put_block_group(cache);
2967         if (node) {
2968                 cache = rb_entry(node, struct btrfs_block_group_cache,
2969                                  cache_node);
2970                 btrfs_get_block_group(cache);
2971         } else
2972                 cache = NULL;
2973         spin_unlock(&root->fs_info->block_group_cache_lock);
2974         return cache;
2975 }
2976
/*
 * Prepare the free space cache inode of @block_group for being written in
 * this transaction and record the outcome in block_group->disk_cache_state.
 *
 * Outcomes visible in this function:
 *   - block groups smaller than 100MB are marked BTRFS_DC_WRITTEN and
 *     nothing else is done
 *   - a missing cache inode is created once and the lookup is retried,
 *     unless the block group is read-only
 *   - if the cache was already set up in this transaction and the inode has
 *     a non-zero size, the state is BTRFS_DC_SETUP
 *   - if the block group is not fully cached or the SPACE_CACHE mount option
 *     is off, the state is BTRFS_DC_WRITTEN
 *   - otherwise the inode is truncated if needed and space is preallocated;
 *     the state is BTRFS_DC_SETUP if that succeeds
 *   - every other exit leaves the state at BTRFS_DC_ERROR
 *
 * Returns 0 or a negative errno from the failing helper.  Note that a
 * return of 0 does not imply BTRFS_DC_SETUP.
 */
2977 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2978                             struct btrfs_trans_handle *trans,
2979                             struct btrfs_path *path)
2980 {
2981         struct btrfs_root *root = block_group->fs_info->tree_root;
2982         struct inode *inode = NULL;
2983         u64 alloc_hint = 0;
2984         int dcs = BTRFS_DC_ERROR;
2985         int num_pages = 0;
2986         int retries = 0;
2987         int ret = 0;
2988
2989         /*
2990          * If this block group is smaller than 100 megs don't bother caching the
2991          * block group.
2992          */
2993         if (block_group->key.offset < (100 * 1024 * 1024)) {
2994                 spin_lock(&block_group->lock);
2995                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2996                 spin_unlock(&block_group->lock);
2997                 return 0;
2998         }
2999
3000 again:
3001         inode = lookup_free_space_inode(root, block_group, path);
3002         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3003                 ret = PTR_ERR(inode);
3004                 btrfs_release_path(path);
3005                 goto out;
3006         }
3007
             /* only -ENOENT reaches this branch: the cache inode does not exist yet */
3008         if (IS_ERR(inode)) {
3009                 BUG_ON(retries);
3010                 retries++;
3011
3012                 if (block_group->ro)
3013                         goto out_free;
3014
3015                 ret = create_free_space_inode(root, trans, block_group, path);
3016                 if (ret)
3017                         goto out_free;
3018                 goto again;
3019         }
3020
3021         /* We've already setup this transaction, go ahead and exit */
3022         if (block_group->cache_generation == trans->transid &&
3023             i_size_read(inode)) {
3024                 dcs = BTRFS_DC_SETUP;
3025                 goto out_put;
3026         }
3027
3028         /*
3029          * We want to set the generation to 0, that way if anything goes wrong
3030          * from here on out we know not to trust this cache when we load up next
3031          * time.
3032          */
3033         BTRFS_I(inode)->generation = 0;
3034         ret = btrfs_update_inode(trans, root, inode);
             /* a failure here is only warned about; setup carries on */
3035         WARN_ON(ret);
3036
3037         if (i_size_read(inode) > 0) {
3038                 ret = btrfs_truncate_free_space_cache(root, trans, path,
3039                                                       inode);
3040                 if (ret)
3041                         goto out_put;
3042         }
3043
3044         spin_lock(&block_group->lock);
3045         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3046             !btrfs_test_opt(root, SPACE_CACHE)) {
3047                 /*
3048                  * don't bother trying to write stuff out _if_
3049                  * a) we're not cached,
3050                  * b) we're with nospace_cache mount option.
3051                  */
3052                 dcs = BTRFS_DC_WRITTEN;
3053                 spin_unlock(&block_group->lock);
3054                 goto out_put;
3055         }
3056         spin_unlock(&block_group->lock);
3057
3058         /*
3059          * Try to preallocate enough space based on how big the block group is.
3060          * Keep in mind this has to include any pinned space which could end up
3061          * taking up quite a bit since it's not folded into the other space
3062          * cache.
3063          */
3064         num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3065         if (!num_pages)
3066                 num_pages = 1;
3067
             /*
              * 16 pages per 256MB unit.  After the second multiply num_pages
              * holds a byte count, despite its name.
              */
3068         num_pages *= 16;
3069         num_pages *= PAGE_CACHE_SIZE;
3070
3071         ret = btrfs_check_data_free_space(inode, num_pages);
3072         if (ret)
3073                 goto out_put;
3074
3075         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3076                                               num_pages, num_pages,
3077                                               &alloc_hint);
3078         if (!ret)
3079                 dcs = BTRFS_DC_SETUP;
3080         btrfs_free_reserved_data_space(inode, num_pages);
3081
3082 out_put:
3083         iput(inode);
3084 out_free:
3085         btrfs_release_path(path);
3086 out:
             /* publish the outcome; the generation is stamped only on a clean setup */
3087         spin_lock(&block_group->lock);
3088         if (!ret && dcs == BTRFS_DC_SETUP)
3089                 block_group->cache_generation = trans->transid;
3090         block_group->disk_cache_state = dcs;
3091         spin_unlock(&block_group->lock);
3092
3093         return ret;
3094 }
3095
3096 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3097                                    struct btrfs_root *root)
3098 {
3099         struct btrfs_block_group_cache *cache;
3100         int err = 0;
3101         struct btrfs_path *path;
3102         u64 last = 0;
3103
3104         path = btrfs_alloc_path();
3105         if (!path)
3106                 return -ENOMEM;
3107
3108 again:
3109         while (1) {
3110                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3111                 while (cache) {
3112                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3113                                 break;
3114                         cache = next_block_group(root, cache);
3115                 }
3116                 if (!cache) {
3117                         if (last == 0)
3118                                 break;
3119                         last = 0;
3120                         continue;
3121                 }
3122                 err = cache_save_setup(cache, trans, path);
3123                 last = cache->key.objectid + cache->key.offset;
3124                 btrfs_put_block_group(cache);
3125         }
3126
3127         while (1) {
3128                 if (last == 0) {
3129                         err = btrfs_run_delayed_refs(trans, root,
3130                                                      (unsigned long)-1);
3131                         if (err) /* File system offline */
3132                                 goto out;
3133                 }
3134
3135                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3136                 while (cache) {
3137                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3138                                 btrfs_put_block_group(cache);
3139                                 goto again;
3140                         }
3141
3142                         if (cache->dirty)
3143                                 break;
3144                         cache = next_block_group(root, cache);
3145                 }
3146                 if (!cache) {
3147                         if (last == 0)
3148                                 break;
3149                         last = 0;
3150                         continue;
3151                 }
3152
3153                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3154                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3155                 cache->dirty = 0;
3156                 last = cache->key.objectid + cache->key.offset;
3157
3158                 err = write_one_cache_group(trans, root, path, cache);
3159                 if (err) /* File system offline */
3160                         goto out;
3161
3162                 btrfs_put_block_group(cache);
3163         }
3164
3165         while (1) {
3166                 /*
3167                  * I don't think this is needed since we're just marking our
3168                  * preallocated extent as written, but just in case it can't
3169                  * hurt.
3170                  */
3171                 if (last == 0) {
3172                         err = btrfs_run_delayed_refs(trans, root,
3173                                                      (unsigned long)-1);
3174                         if (err) /* File system offline */
3175                                 goto out;
3176                 }
3177
3178                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3179                 while (cache) {
3180                         /*
3181                          * Really this shouldn't happen, but it could if we
3182                          * couldn't write the entire preallocated extent and
3183                          * splitting the extent resulted in a new block.
3184                          */
3185                         if (cache->dirty) {
3186                                 btrfs_put_block_group(cache);
3187                                 goto again;
3188                         }
3189                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3190                                 break;
3191                         cache = next_block_group(root, cache);
3192                 }
3193                 if (!cache) {
3194                         if (last == 0)
3195                                 break;
3196                         last = 0;
3197                         continue;
3198                 }
3199
3200                 err = btrfs_write_out_cache(root, trans, cache, path);
3201
3202                 /*
3203                  * If we didn't have an error then the cache state is still
3204                  * NEED_WRITE, so we can set it to WRITTEN.
3205                  */
3206                 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3207                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
3208                 last = cache->key.objectid + cache->key.offset;
3209                 btrfs_put_block_group(cache);
3210         }
3211 out:
3212
3213         btrfs_free_path(path);
3214         return err;
3215 }
3216
3217 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3218 {
3219         struct btrfs_block_group_cache *block_group;
3220         int readonly = 0;
3221
3222         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3223         if (!block_group || block_group->ro)
3224                 readonly = 1;
3225         if (block_group)
3226                 btrfs_put_block_group(block_group);
3227         return readonly;
3228 }
3229
3230 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3231                              u64 total_bytes, u64 bytes_used,
3232                              struct btrfs_space_info **space_info)
3233 {
3234         struct btrfs_space_info *found;
3235         int i;
3236         int factor;
3237
3238         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3239                      BTRFS_BLOCK_GROUP_RAID10))
3240                 factor = 2;
3241         else
3242                 factor = 1;
3243
3244         found = __find_space_info(info, flags);
3245         if (found) {
3246                 spin_lock(&found->lock);
3247                 found->total_bytes += total_bytes;
3248                 found->disk_total += total_bytes * factor;
3249                 found->bytes_used += bytes_used;
3250                 found->disk_used += bytes_used * factor;
3251                 found->full = 0;
3252                 spin_unlock(&found->lock);
3253                 *space_info = found;
3254                 return 0;
3255         }
3256         found = kzalloc(sizeof(*found), GFP_NOFS);
3257         if (!found)
3258                 return -ENOMEM;
3259
3260         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3261                 INIT_LIST_HEAD(&found->block_groups[i]);
3262         init_rwsem(&found->groups_sem);
3263         spin_lock_init(&found->lock);
3264         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3265         found->total_bytes = total_bytes;
3266         found->disk_total = total_bytes * factor;
3267         found->bytes_used = bytes_used;
3268         found->disk_used = bytes_used * factor;
3269         found->bytes_pinned = 0;
3270         found->bytes_reserved = 0;
3271         found->bytes_readonly = 0;
3272         found->bytes_may_use = 0;
3273         found->full = 0;
3274         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3275         found->chunk_alloc = 0;
3276         found->flush = 0;
3277         init_waitqueue_head(&found->wait);
3278         *space_info = found;
3279         list_add_rcu(&found->list, &info->space_info);
3280         if (flags & BTRFS_BLOCK_GROUP_DATA)
3281                 info->data_sinfo = found;
3282         return 0;
3283 }
3284
3285 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3286 {
3287         u64 extra_flags = chunk_to_extended(flags) &
3288                                 BTRFS_EXTENDED_PROFILE_MASK;
3289
3290         write_seqlock(&fs_info->profiles_lock);
3291         if (flags & BTRFS_BLOCK_GROUP_DATA)
3292                 fs_info->avail_data_alloc_bits |= extra_flags;
3293         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3294                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3295         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3296                 fs_info->avail_system_alloc_bits |= extra_flags;
3297         write_sequnlock(&fs_info->profiles_lock);
3298 }
3299
3300 /*
3301  * returns target flags in extended format or 0 if restripe for this
3302  * chunk_type is not in progress
3303  *
3304  * should be called with either volume_mutex or balance_lock held
3305  */
3306 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3307 {
3308         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3309         u64 target = 0;
3310
3311         if (!bctl)
3312                 return 0;
3313
3314         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3315             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3316                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3317         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3318                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3319                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3320         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3321                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3322                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3323         }
3324
3325         return target;
3326 }
3327
3328 /*
3329  * @flags: available profiles in extended format (see ctree.h)
3330  *
3331  * Returns reduced profile in chunk format.  If profile changing is in
3332  * progress (either running or paused) picks the target profile (if it's
3333  * already available), otherwise falls back to plain reducing.
3334  */
3335 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3336 {
3337         /*
3338          * we add in the count of missing devices because we want
3339          * to make sure that any RAID levels on a degraded FS
3340          * continue to be honored.
3341          */
3342         u64 num_devices = root->fs_info->fs_devices->rw_devices +
3343                 root->fs_info->fs_devices->missing_devices;
3344         u64 target;
3345         u64 tmp;
3346
3347         /*
3348          * see if restripe for this chunk_type is in progress, if so
3349          * try to reduce to the target profile
3350          */
3351         spin_lock(&root->fs_info->balance_lock);
3352         target = get_restripe_target(root->fs_info, flags);
3353         if (target) {
3354                 /* pick target profile only if it's already available */
3355                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3356                         spin_unlock(&root->fs_info->balance_lock);
3357                         return extended_to_chunk(target);
3358                 }
3359         }
3360         spin_unlock(&root->fs_info->balance_lock);
3361
3362         /* First, mask out the RAID levels which aren't possible */
3363         if (num_devices == 1)
3364                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3365                            BTRFS_BLOCK_GROUP_RAID5);
3366         if (num_devices < 3)
3367                 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3368         if (num_devices < 4)
3369                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3370
3371         tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3372                        BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3373                        BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3374         flags &= ~tmp;
3375
3376         if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3377                 tmp = BTRFS_BLOCK_GROUP_RAID6;
3378         else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3379                 tmp = BTRFS_BLOCK_GROUP_RAID5;
3380         else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3381                 tmp = BTRFS_BLOCK_GROUP_RAID10;
3382         else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3383                 tmp = BTRFS_BLOCK_GROUP_RAID1;
3384         else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3385                 tmp = BTRFS_BLOCK_GROUP_RAID0;
3386
3387         return extended_to_chunk(flags | tmp);
3388 }
3389
3390 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3391 {
3392         unsigned seq;
3393
3394         do {
3395                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3396
3397                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3398                         flags |= root->fs_info->avail_data_alloc_bits;
3399                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3400                         flags |= root->fs_info->avail_system_alloc_bits;
3401                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3402                         flags |= root->fs_info->avail_metadata_alloc_bits;
3403         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3404
3405         return btrfs_reduce_alloc_profile(root, flags);
3406 }
3407
3408 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3409 {
3410         u64 flags;
3411         u64 ret;
3412
3413         if (data)
3414                 flags = BTRFS_BLOCK_GROUP_DATA;
3415         else if (root == root->fs_info->chunk_root)
3416                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3417         else
3418                 flags = BTRFS_BLOCK_GROUP_METADATA;
3419
3420         ret = get_alloc_profile(root, flags);
3421         return ret;
3422 }
3423
3424 /*
3425  * This will check the space that the inode allocates from to make sure we have
3426  * enough space for bytes.
3427  */
3428 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3429 {
3430         struct btrfs_space_info *data_sinfo;
3431         struct btrfs_root *root = BTRFS_I(inode)->root;
3432         struct btrfs_fs_info *fs_info = root->fs_info;
3433         u64 used;
3434         int ret = 0, committed = 0, alloc_chunk = 1;
3435
3436         /* make sure bytes are sectorsize aligned */
3437         bytes = ALIGN(bytes, root->sectorsize);
3438
3439         if (root == root->fs_info->tree_root ||
3440             BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3441                 alloc_chunk = 0;
3442                 committed = 1;
3443         }
3444
3445         data_sinfo = fs_info->data_sinfo;
3446         if (!data_sinfo)
3447                 goto alloc;
3448
3449 again:
3450         /* make sure we have enough space to handle the data first */
3451         spin_lock(&data_sinfo->lock);
3452         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3453                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3454                 data_sinfo->bytes_may_use;
3455
3456         if (used + bytes > data_sinfo->total_bytes) {
3457                 struct btrfs_trans_handle *trans;
3458
3459                 /*
3460                  * if we don't have enough free bytes in this space then we need
3461                  * to alloc a new chunk.
3462                  */
3463                 if (!data_sinfo->full && alloc_chunk) {
3464                         u64 alloc_target;
3465
3466                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3467                         spin_unlock(&data_sinfo->lock);
3468 alloc:
3469                         alloc_target = btrfs_get_alloc_profile(root, 1);
3470                         trans = btrfs_join_transaction(root);
3471                         if (IS_ERR(trans))
3472                                 return PTR_ERR(trans);
3473
3474                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3475                                              alloc_target,
3476                                              CHUNK_ALLOC_NO_FORCE);
3477                         btrfs_end_transaction(trans, root);
3478                         if (ret < 0) {
3479                                 if (ret != -ENOSPC)
3480                                         return ret;
3481                                 else
3482                                         goto commit_trans;
3483                         }
3484
3485                         if (!data_sinfo)
3486                                 data_sinfo = fs_info->data_sinfo;
3487
3488                         goto again;
3489                 }
3490
3491                 /*
3492                  * If we have less pinned bytes than we want to allocate then
3493                  * don't bother committing the transaction, it won't help us.
3494                  */
3495                 if (data_sinfo->bytes_pinned < bytes)
3496                         committed = 1;
3497                 spin_unlock(&data_sinfo->lock);
3498
3499                 /* commit the current transaction and try again */
3500 commit_trans:
3501                 if (!committed &&
3502                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
3503                         committed = 1;
3504                         trans = btrfs_join_transaction(root);
3505                         if (IS_ERR(trans))
3506                                 return PTR_ERR(trans);
3507                         ret = btrfs_commit_transaction(trans, root);
3508                         if (ret)
3509                                 return ret;
3510                         goto again;
3511                 }
3512
3513                 return -ENOSPC;
3514         }
3515         data_sinfo->bytes_may_use += bytes;
3516         trace_btrfs_space_reservation(root->fs_info, "space_info",
3517                                       data_sinfo->flags, bytes, 1);
3518         spin_unlock(&data_sinfo->lock);
3519
3520         return 0;
3521 }
3522
3523 /*
3524  * Called if we need to clear a data reservation for this inode.
3525  */
3526 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3527 {
3528         struct btrfs_root *root = BTRFS_I(inode)->root;
3529         struct btrfs_space_info *data_sinfo;
3530
3531         /* make sure bytes are sectorsize aligned */
3532         bytes = ALIGN(bytes, root->sectorsize);
3533
3534         data_sinfo = root->fs_info->data_sinfo;
3535         spin_lock(&data_sinfo->lock);
3536         data_sinfo->bytes_may_use -= bytes;
3537         trace_btrfs_space_reservation(root->fs_info, "space_info",
3538                                       data_sinfo->flags, bytes, 0);
3539         spin_unlock(&data_sinfo->lock);
3540 }
3541
3542 static void force_metadata_allocation(struct btrfs_fs_info *info)
3543 {
3544         struct list_head *head = &info->space_info;
3545         struct btrfs_space_info *found;
3546
3547         rcu_read_lock();
3548         list_for_each_entry_rcu(found, head, list) {
3549                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3550                         found->force_alloc = CHUNK_ALLOC_FORCE;
3551         }
3552         rcu_read_unlock();
3553 }
3554
3555 static int should_alloc_chunk(struct btrfs_root *root,
3556                               struct btrfs_space_info *sinfo, int force)
3557 {
3558         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3559         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3560         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3561         u64 thresh;
3562
3563         if (force == CHUNK_ALLOC_FORCE)
3564                 return 1;
3565
3566         /*
3567          * We need to take into account the global rsv because for all intents
3568          * and purposes it's used space.  Don't worry about locking the
3569          * global_rsv, it doesn't change except when the transaction commits.
3570          */
3571         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3572                 num_allocated += global_rsv->size;
3573
3574         /*
3575          * in limited mode, we want to have some free space up to
3576          * about 1% of the FS size.
3577          */
3578         if (force == CHUNK_ALLOC_LIMITED) {
3579                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3580                 thresh = max_t(u64, 64 * 1024 * 1024,
3581                                div_factor_fine(thresh, 1));
3582
3583                 if (num_bytes - num_allocated < thresh)
3584                         return 1;
3585         }
3586
3587         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3588                 return 0;
3589         return 1;
3590 }
3591
3592 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3593 {
3594         u64 num_dev;
3595
3596         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3597                     BTRFS_BLOCK_GROUP_RAID0 |
3598                     BTRFS_BLOCK_GROUP_RAID5 |
3599                     BTRFS_BLOCK_GROUP_RAID6))
3600                 num_dev = root->fs_info->fs_devices->rw_devices;
3601         else if (type & BTRFS_BLOCK_GROUP_RAID1)
3602                 num_dev = 2;
3603         else
3604                 num_dev = 1;    /* DUP or single */
3605
3606         /* metadata for updaing devices and chunk tree */
3607         return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3608 }
3609
3610 static void check_system_chunk(struct btrfs_trans_handle *trans,
3611                                struct btrfs_root *root, u64 type)
3612 {
3613         struct btrfs_space_info *info;
3614         u64 left;
3615         u64 thresh;
3616
3617         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3618         spin_lock(&info->lock);
3619         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3620                 info->bytes_reserved - info->bytes_readonly;
3621         spin_unlock(&info->lock);
3622
3623         thresh = get_system_chunk_thresh(root, type);
3624         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3625                 printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3626                        left, thresh, type);
3627                 dump_space_info(info, 0, 0);
3628         }
3629
3630         if (left < thresh) {
3631                 u64 flags;
3632
3633                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3634                 btrfs_alloc_chunk(trans, root, flags);
3635         }
3636 }
3637
3638 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3639                           struct btrfs_root *extent_root, u64 flags, int force)
3640 {
3641         struct btrfs_space_info *space_info;
3642         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3643         int wait_for_alloc = 0;
3644         int ret = 0;
3645
3646         /* Don't re-enter if we're already allocating a chunk */
3647         if (trans->allocating_chunk)
3648                 return -ENOSPC;
3649
3650         space_info = __find_space_info(extent_root->fs_info, flags);
3651         if (!space_info) {
3652                 ret = update_space_info(extent_root->fs_info, flags,
3653                                         0, 0, &space_info);
3654                 BUG_ON(ret); /* -ENOMEM */
3655         }
3656         BUG_ON(!space_info); /* Logic error */
3657
3658 again:
3659         spin_lock(&space_info->lock);
3660         if (force < space_info->force_alloc)
3661                 force = space_info->force_alloc;
3662         if (space_info->full) {
3663                 spin_unlock(&space_info->lock);
3664                 return 0;
3665         }
3666
3667         if (!should_alloc_chunk(extent_root, space_info, force)) {
3668                 spin_unlock(&space_info->lock);
3669                 return 0;
3670         } else if (space_info->chunk_alloc) {
3671                 wait_for_alloc = 1;
3672         } else {
3673                 space_info->chunk_alloc = 1;
3674         }
3675
3676         spin_unlock(&space_info->lock);
3677
3678         mutex_lock(&fs_info->chunk_mutex);
3679
3680         /*
3681          * The chunk_mutex is held throughout the entirety of a chunk
3682          * allocation, so once we've acquired the chunk_mutex we know that the
3683          * other guy is done and we need to recheck and see if we should
3684          * allocate.
3685          */
3686         if (wait_for_alloc) {
3687                 mutex_unlock(&fs_info->chunk_mutex);
3688                 wait_for_alloc = 0;
3689                 goto again;
3690         }
3691
3692         trans->allocating_chunk = true;
3693
3694         /*
3695          * If we have mixed data/metadata chunks we want to make sure we keep
3696          * allocating mixed chunks instead of individual chunks.
3697          */
3698         if (btrfs_mixed_space_info(space_info))
3699                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3700
3701         /*
3702          * if we're doing a data chunk, go ahead and make sure that
3703          * we keep a reasonable number of metadata chunks allocated in the
3704          * FS as well.
3705          */
3706         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3707                 fs_info->data_chunk_allocations++;
3708                 if (!(fs_info->data_chunk_allocations %
3709                       fs_info->metadata_ratio))
3710                         force_metadata_allocation(fs_info);
3711         }
3712
3713         /*
3714          * Check if we have enough space in SYSTEM chunk because we may need
3715          * to update devices.
3716          */
3717         check_system_chunk(trans, extent_root, flags);
3718
3719         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3720         trans->allocating_chunk = false;
3721
3722         spin_lock(&space_info->lock);
3723         if (ret < 0 && ret != -ENOSPC)
3724                 goto out;
3725         if (ret)
3726                 space_info->full = 1;
3727         else
3728                 ret = 1;
3729
3730         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3731 out:
3732         space_info->chunk_alloc = 0;
3733         spin_unlock(&space_info->lock);
3734         mutex_unlock(&fs_info->chunk_mutex);
3735         return ret;
3736 }
3737
3738 static int can_overcommit(struct btrfs_root *root,
3739                           struct btrfs_space_info *space_info, u64 bytes,
3740                           enum btrfs_reserve_flush_enum flush)
3741 {
3742         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3743         u64 profile = btrfs_get_alloc_profile(root, 0);
3744         u64 rsv_size = 0;
3745         u64 avail;
3746         u64 used;
3747         u64 to_add;
3748
3749         used = space_info->bytes_used + space_info->bytes_reserved +
3750                 space_info->bytes_pinned + space_info->bytes_readonly;
3751
3752         spin_lock(&global_rsv->lock);
3753         rsv_size = global_rsv->size;
3754         spin_unlock(&global_rsv->lock);
3755
3756         /*
3757          * We only want to allow over committing if we have lots of actual space
3758          * free, but if we don't have enough space to handle the global reserve
3759          * space then we could end up having a real enospc problem when trying
3760          * to allocate a chunk or some other such important allocation.
3761          */
3762         rsv_size <<= 1;
3763         if (used + rsv_size >= space_info->total_bytes)
3764                 return 0;
3765
3766         used += space_info->bytes_may_use;
3767
3768         spin_lock(&root->fs_info->free_chunk_lock);
3769         avail = root->fs_info->free_chunk_space;
3770         spin_unlock(&root->fs_info->free_chunk_lock);
3771
3772         /*
3773          * If we have dup, raid1 or raid10 then only half of the free
3774          * space is actually useable.  For raid56, the space info used
3775          * doesn't include the parity drive, so we don't have to
3776          * change the math
3777          */
3778         if (profile & (BTRFS_BLOCK_GROUP_DUP |
3779                        BTRFS_BLOCK_GROUP_RAID1 |
3780                        BTRFS_BLOCK_GROUP_RAID10))
3781                 avail >>= 1;
3782
3783         to_add = space_info->total_bytes;
3784
3785         /*
3786          * If we aren't flushing all things, let us overcommit up to
3787          * 1/2th of the space. If we can flush, don't let us overcommit
3788          * too much, let it overcommit up to 1/8 of the space.
3789          */
3790         if (flush == BTRFS_RESERVE_FLUSH_ALL)
3791                 to_add >>= 3;
3792         else
3793                 to_add >>= 1;
3794
3795         /*
3796          * Limit the overcommit to the amount of free space we could possibly
3797          * allocate for chunks.
3798          */
3799         to_add = min(avail, to_add);
3800
3801         if (used + bytes < space_info->total_bytes + to_add)
3802                 return 1;
3803         return 0;
3804 }
3805
3806 static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3807                                                       unsigned long nr_pages,
3808                                                       enum wb_reason reason)
3809 {
3810         /* the flusher is dealing with the dirty inodes now. */
3811         if (writeback_in_progress(sb->s_bdi))
3812                 return 1;
3813
3814         if (down_read_trylock(&sb->s_umount)) {
3815                 writeback_inodes_sb_nr(sb, nr_pages, reason);
3816                 up_read(&sb->s_umount);
3817                 return 1;
3818         }
3819
3820         return 0;
3821 }
3822
3823 void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3824                                   unsigned long nr_pages)
3825 {
3826         struct super_block *sb = root->fs_info->sb;
3827         int started;
3828
3829         /* If we can not start writeback, just sync all the delalloc file. */
3830         started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages,
3831                                                       WB_REASON_FS_FREE_SPACE);
3832         if (!started) {
3833                 /*
3834                  * We needn't worry the filesystem going from r/w to r/o though
3835                  * we don't acquire ->s_umount mutex, because the filesystem
3836                  * should guarantee the delalloc inodes list be empty after
3837                  * the filesystem is readonly(all dirty pages are written to
3838                  * the disk).
3839                  */
3840                 btrfs_start_delalloc_inodes(root, 0);
3841                 btrfs_wait_ordered_extents(root, 0);
3842         }
3843 }
3844
3845 /*
3846  * shrink metadata reservation for delalloc
3847  */
3848 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3849                             bool wait_ordered)
3850 {
3851         struct btrfs_block_rsv *block_rsv;
3852         struct btrfs_space_info *space_info;
3853         struct btrfs_trans_handle *trans;
3854         u64 delalloc_bytes;
3855         u64 max_reclaim;
3856         long time_left;
3857         unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3858         int loops = 0;
3859         enum btrfs_reserve_flush_enum flush;
3860
3861         trans = (struct btrfs_trans_handle *)current->journal_info;
3862         block_rsv = &root->fs_info->delalloc_block_rsv;
3863         space_info = block_rsv->space_info;
3864
3865         smp_mb();
3866         delalloc_bytes = percpu_counter_sum_positive(
3867                                                 &root->fs_info->delalloc_bytes);
3868         if (delalloc_bytes == 0) {
3869                 if (trans)
3870                         return;
3871                 btrfs_wait_ordered_extents(root, 0);
3872                 return;
3873         }
3874
3875         while (delalloc_bytes && loops < 3) {
3876                 max_reclaim = min(delalloc_bytes, to_reclaim);
3877                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3878                 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3879                 /*
3880                  * We need to wait for the async pages to actually start before
3881                  * we do anything.
3882                  */
3883                 wait_event(root->fs_info->async_submit_wait,
3884                            !atomic_read(&root->fs_info->async_delalloc_pages));
3885
3886                 if (!trans)
3887                         flush = BTRFS_RESERVE_FLUSH_ALL;
3888                 else
3889                         flush = BTRFS_RESERVE_NO_FLUSH;
3890                 spin_lock(&space_info->lock);
3891                 if (can_overcommit(root, space_info, orig, flush)) {
3892                         spin_unlock(&space_info->lock);
3893                         break;
3894                 }
3895                 spin_unlock(&space_info->lock);
3896
3897                 loops++;
3898                 if (wait_ordered && !trans) {
3899                         btrfs_wait_ordered_extents(root, 0);
3900                 } else {
3901                         time_left = schedule_timeout_killable(1);
3902                         if (time_left)
3903                                 break;
3904                 }
3905                 smp_mb();
3906                 delalloc_bytes = percpu_counter_sum_positive(
3907                                                 &root->fs_info->delalloc_bytes);
3908         }
3909 }
3910
3911 /**
3912  * maybe_commit_transaction - possibly commit the transaction if its ok to
3913  * @root - the root we're allocating for
3914  * @bytes - the number of bytes we want to reserve
3915  * @force - force the commit
3916  *
3917  * This will check to make sure that committing the transaction will actually
3918  * get us somewhere and then commit the transaction if it does.  Otherwise it
3919  * will return -ENOSPC.
3920  */
3921 static int may_commit_transaction(struct btrfs_root *root,
3922                                   struct btrfs_space_info *space_info,
3923                                   u64 bytes, int force)
3924 {
3925         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3926         struct btrfs_trans_handle *trans;
3927
3928         trans = (struct btrfs_trans_handle *)current->journal_info;
3929         if (trans)
3930                 return -EAGAIN;
3931
3932         if (force)
3933                 goto commit;
3934
3935         /* See if there is enough pinned space to make this reservation */
3936         spin_lock(&space_info->lock);
3937         if (space_info->bytes_pinned >= bytes) {
3938                 spin_unlock(&space_info->lock);
3939                 goto commit;
3940         }
3941         spin_unlock(&space_info->lock);
3942
3943         /*
3944          * See if there is some space in the delayed insertion reservation for
3945          * this reservation.
3946          */
3947         if (space_info != delayed_rsv->space_info)
3948                 return -ENOSPC;
3949
3950         spin_lock(&space_info->lock);
3951         spin_lock(&delayed_rsv->lock);
3952         if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3953                 spin_unlock(&delayed_rsv->lock);
3954                 spin_unlock(&space_info->lock);
3955                 return -ENOSPC;
3956         }
3957         spin_unlock(&delayed_rsv->lock);
3958         spin_unlock(&space_info->lock);
3959
3960 commit:
3961         trans = btrfs_join_transaction(root);
3962         if (IS_ERR(trans))
3963                 return -ENOSPC;
3964
3965         return btrfs_commit_transaction(trans, root);
3966 }
3967
/*
 * Steps understood by flush_space().  reserve_metadata_bytes() walks through
 * them with ++, so the values must stay consecutive and in this order.
 */
enum flush_state {
        FLUSH_DELAYED_ITEMS_NR = 1,
        FLUSH_DELAYED_ITEMS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
        ALLOC_CHUNK,
        COMMIT_TRANS,
};
3976
3977 static int flush_space(struct btrfs_root *root,
3978                        struct btrfs_space_info *space_info, u64 num_bytes,
3979                        u64 orig_bytes, int state)
3980 {
3981         struct btrfs_trans_handle *trans;
3982         int nr;
3983         int ret = 0;
3984
3985         switch (state) {
3986         case FLUSH_DELAYED_ITEMS_NR:
3987         case FLUSH_DELAYED_ITEMS:
3988                 if (state == FLUSH_DELAYED_ITEMS_NR) {
3989                         u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3990
3991                         nr = (int)div64_u64(num_bytes, bytes);
3992                         if (!nr)
3993                                 nr = 1;
3994                         nr *= 2;
3995                 } else {
3996                         nr = -1;
3997                 }
3998                 trans = btrfs_join_transaction(root);
3999                 if (IS_ERR(trans)) {
4000                         ret = PTR_ERR(trans);
4001                         break;
4002                 }
4003                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4004                 btrfs_end_transaction(trans, root);
4005                 break;
4006         case FLUSH_DELALLOC:
4007         case FLUSH_DELALLOC_WAIT:
4008                 shrink_delalloc(root, num_bytes, orig_bytes,
4009                                 state == FLUSH_DELALLOC_WAIT);
4010                 break;
4011         case ALLOC_CHUNK:
4012                 trans = btrfs_join_transaction(root);
4013                 if (IS_ERR(trans)) {
4014                         ret = PTR_ERR(trans);
4015                         break;
4016                 }
4017                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4018                                      btrfs_get_alloc_profile(root, 0),
4019                                      CHUNK_ALLOC_NO_FORCE);
4020                 btrfs_end_transaction(trans, root);
4021                 if (ret == -ENOSPC)
4022                         ret = 0;
4023                 break;
4024         case COMMIT_TRANS:
4025                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4026                 break;
4027         default:
4028                 ret = -ENOSPC;
4029                 break;
4030         }
4031
4032         return ret;
4033 }
/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is
 * BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations will be made
 * and this will fail if there is not enough space already.
 *
 * Returns 0 on success.  Returns -EAGAIN if another task is flushing and we
 * hold a transaction handle, -EINTR if we were killed while waiting for that
 * flusher, and otherwise -ENOSPC or the error of the last flush_space() step.
 */
static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
                                  u64 orig_bytes,
                                  enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
        u64 num_bytes = orig_bytes;
        int flush_state = FLUSH_DELAYED_ITEMS_NR;
        int ret = 0;
        /* true once this task has set space_info->flush and must clear it */
        bool flushing = false;

again:
        ret = 0;
        spin_lock(&space_info->lock);
        /*
         * We only want to wait if somebody other than us is flushing and we
         * are actually allowed to flush all things.
         */
        while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
               space_info->flush) {
                spin_unlock(&space_info->lock);
                /*
                 * If we have a trans handle we can't wait because the flusher
                 * may have to commit the transaction, which would mean we would
                 * deadlock since we are waiting for the flusher to finish, but
                 * hold the current transaction open.
                 */
                if (current->journal_info)
                        return -EAGAIN;
                ret = wait_event_killable(space_info->wait, !space_info->flush);
                /* Must have been killed, return */
                if (ret)
                        return -EINTR;

                spin_lock(&space_info->lock);
        }

        ret = -ENOSPC;
        used = space_info->bytes_used + space_info->bytes_reserved +
                space_info->bytes_pinned + space_info->bytes_readonly +
                space_info->bytes_may_use;

        /*
         * The idea here is that if we've not already over-reserved the block
         * group then we can go ahead and save our reservation first and then
         * start flushing if we need to.  Otherwise if we've already
         * overcommitted lets start flushing stuff first and then come back and
         * try to make our reservation.
         */
        if (used <= space_info->total_bytes) {
                if (used + orig_bytes <= space_info->total_bytes) {
                        space_info->bytes_may_use += orig_bytes;
                        trace_btrfs_space_reservation(root->fs_info,
                                "space_info", space_info->flags, orig_bytes, 1);
                        ret = 0;
                } else {
                        /*
                         * Ok set num_bytes to orig_bytes since we aren't
                         * overcommitted, this way we only try and reclaim what
                         * we need.
                         */
                        num_bytes = orig_bytes;
                }
        } else {
                /*
                 * Ok we're over committed, set num_bytes to the overcommitted
                 * amount plus twice the amount of bytes that we need for this
                 * reservation.
                 */
                num_bytes = used - space_info->total_bytes +
                        (orig_bytes * 2);
        }

        /* No room in the accounted space: see if overcommitting is allowed. */
        if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
                space_info->bytes_may_use += orig_bytes;
                trace_btrfs_space_reservation(root->fs_info, "space_info",
                                              space_info->flags, orig_bytes,
                                              1);
                ret = 0;
        }

        /*
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * stealing it from us.
         *
         * The flag is set for any flushing mode, but only tasks that can
         * flush all things wait on it (see the loop above).
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                flushing = true;
                space_info->flush = 1;
        }

        spin_unlock(&space_info->lock);

        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                goto out;

        /* Run the current flushing step, then advance to the next one. */
        ret = flush_space(root, space_info, num_bytes, orig_bytes,
                          flush_state);
        flush_state++;

        /*
         * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
         * would happen. So skip delalloc flush.
         */
        if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
            (flush_state == FLUSH_DELALLOC ||
             flush_state == FLUSH_DELALLOC_WAIT))
                flush_state = ALLOC_CHUNK;

        /*
         * Retry the reservation after a successful step, or while there are
         * still steps left for this flushing mode (FLUSH_LIMIT never reaches
         * COMMIT_TRANS).
         */
        if (!ret)
                goto again;
        else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
                 flush_state < COMMIT_TRANS)
                goto again;
        else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
                 flush_state <= COMMIT_TRANS)
                goto again;

out:
        /*
         * During orphan cleanup, fall back to taking the bytes out of the
         * global block reserve rather than failing with -ENOSPC.
         */
        if (ret == -ENOSPC &&
            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
                struct btrfs_block_rsv *global_rsv =
                        &root->fs_info->global_block_rsv;

                if (block_rsv != global_rsv &&
                    !block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
        /* We were the flusher: clear the flag and wake up the waiters. */
        if (flushing) {
                spin_lock(&space_info->lock);
                space_info->flush = 0;
                wake_up_all(&space_info->wait);
                spin_unlock(&space_info->lock);
        }
        return ret;
}
4188
4189 static struct btrfs_block_rsv *get_block_rsv(
4190                                         const struct btrfs_trans_handle *trans,
4191                                         const struct btrfs_root *root)
4192 {
4193         struct btrfs_block_rsv *block_rsv = NULL;
4194
4195         if (root->ref_cows)
4196                 block_rsv = trans->block_rsv;
4197
4198         if (root == root->fs_info->csum_root && trans->adding_csums)
4199                 block_rsv = trans->block_rsv;
4200
4201         if (!block_rsv)
4202                 block_rsv = root->block_rsv;
4203
4204         if (!block_rsv)
4205                 block_rsv = &root->fs_info->empty_block_rsv;
4206
4207         return block_rsv;
4208 }
4209
4210 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4211                                u64 num_bytes)
4212 {
4213         int ret = -ENOSPC;
4214         spin_lock(&block_rsv->lock);
4215         if (block_rsv->reserved >= num_bytes) {
4216                 block_rsv->reserved -= num_bytes;
4217                 if (block_rsv->reserved < block_rsv->size)
4218                         block_rsv->full = 0;
4219                 ret = 0;
4220         }
4221         spin_unlock(&block_rsv->lock);
4222         return ret;
4223 }
4224
4225 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4226                                 u64 num_bytes, int update_size)
4227 {
4228         spin_lock(&block_rsv->lock);
4229         block_rsv->reserved += num_bytes;
4230         if (update_size)
4231                 block_rsv->size += num_bytes;
4232         else if (block_rsv->reserved >= block_rsv->size)
4233                 block_rsv->full = 1;
4234         spin_unlock(&block_rsv->lock);
4235 }
4236
4237 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4238                                     struct btrfs_block_rsv *block_rsv,
4239                                     struct btrfs_block_rsv *dest, u64 num_bytes)
4240 {
4241         struct btrfs_space_info *space_info = block_rsv->space_info;
4242
4243         spin_lock(&block_rsv->lock);
4244         if (num_bytes == (u64)-1)
4245                 num_bytes = block_rsv->size;
4246         block_rsv->size -= num_bytes;
4247         if (block_rsv->reserved >= block_rsv->size) {
4248                 num_bytes = block_rsv->reserved - block_rsv->size;
4249                 block_rsv->reserved = block_rsv->size;
4250                 block_rsv->full = 1;
4251         } else {
4252                 num_bytes = 0;
4253         }
4254         spin_unlock(&block_rsv->lock);
4255
4256         if (num_bytes > 0) {
4257                 if (dest) {
4258                         spin_lock(&dest->lock);
4259                         if (!dest->full) {
4260                                 u64 bytes_to_add;
4261
4262                                 bytes_to_add = dest->size - dest->reserved;
4263                                 bytes_to_add = min(num_bytes, bytes_to_add);
4264                                 dest->reserved += bytes_to_add;
4265                                 if (dest->reserved >= dest->size)
4266                                         dest->full = 1;
4267                                 num_bytes -= bytes_to_add;
4268                         }
4269                         spin_unlock(&dest->lock);
4270                 }
4271                 if (num_bytes) {
4272                         spin_lock(&space_info->lock);
4273                         space_info->bytes_may_use -= num_bytes;
4274                         trace_btrfs_space_reservation(fs_info, "space_info",
4275                                         space_info->flags, num_bytes, 0);
4276                         space_info->reservation_progress++;
4277                         spin_unlock(&space_info->lock);
4278                 }
4279         }
4280 }
4281
4282 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4283                                    struct btrfs_block_rsv *dst, u64 num_bytes)
4284 {
4285         int ret;
4286
4287         ret = block_rsv_use_bytes(src, num_bytes);
4288         if (ret)
4289                 return ret;
4290
4291         block_rsv_add_bytes(dst, num_bytes, 1);
4292         return 0;
4293 }
4294
4295 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4296 {
4297         memset(rsv, 0, sizeof(*rsv));
4298         spin_lock_init(&rsv->lock);
4299         rsv->type = type;
4300 }
4301
4302 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4303                                               unsigned short type)
4304 {
4305         struct btrfs_block_rsv *block_rsv;
4306         struct btrfs_fs_info *fs_info = root->fs_info;
4307
4308         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4309         if (!block_rsv)
4310                 return NULL;
4311
4312         btrfs_init_block_rsv(block_rsv, type);
4313         block_rsv->space_info = __find_space_info(fs_info,
4314                                                   BTRFS_BLOCK_GROUP_METADATA);
4315         return block_rsv;
4316 }
4317
4318 void btrfs_free_block_rsv(struct btrfs_root *root,
4319                           struct btrfs_block_rsv *rsv)
4320 {
4321         if (!rsv)
4322                 return;
4323         btrfs_block_rsv_release(root, rsv, (u64)-1);
4324         kfree(rsv);
4325 }
4326
4327 int btrfs_block_rsv_add(struct btrfs_root *root,
4328                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4329                         enum btrfs_reserve_flush_enum flush)
4330 {
4331         int ret;
4332
4333         if (num_bytes == 0)
4334                 return 0;
4335
4336         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4337         if (!ret) {
4338                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
4339                 return 0;
4340         }
4341
4342         return ret;
4343 }
4344
4345 int btrfs_block_rsv_check(struct btrfs_root *root,
4346                           struct btrfs_block_rsv *block_rsv, int min_factor)
4347 {
4348         u64 num_bytes = 0;
4349         int ret = -ENOSPC;
4350
4351         if (!block_rsv)
4352                 return 0;
4353
4354         spin_lock(&block_rsv->lock);
4355         num_bytes = div_factor(block_rsv->size, min_factor);
4356         if (block_rsv->reserved >= num_bytes)
4357                 ret = 0;
4358         spin_unlock(&block_rsv->lock);
4359
4360         return ret;
4361 }
4362
4363 int btrfs_block_rsv_refill(struct btrfs_root *root,
4364                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4365                            enum btrfs_reserve_flush_enum flush)
4366 {
4367         u64 num_bytes = 0;
4368         int ret = -ENOSPC;
4369
4370         if (!block_rsv)
4371                 return 0;
4372
4373         spin_lock(&block_rsv->lock);
4374         num_bytes = min_reserved;
4375         if (block_rsv->reserved >= num_bytes)
4376                 ret = 0;
4377         else
4378                 num_bytes -= block_rsv->reserved;
4379         spin_unlock(&block_rsv->lock);
4380
4381         if (!ret)
4382                 return 0;
4383
4384         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4385         if (!ret) {
4386                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
4387                 return 0;
4388         }
4389
4390         return ret;
4391 }
4392
/*
 * Exported wrapper around block_rsv_migrate_bytes(): move @num_bytes of
 * reservation from @src_rsv to @dst_rsv and return that helper's result.
 */
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes)
{
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
4399
4400 void btrfs_block_rsv_release(struct btrfs_root *root,
4401                              struct btrfs_block_rsv *block_rsv,
4402                              u64 num_bytes)
4403 {
4404         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4405         if (global_rsv->full || global_rsv == block_rsv ||
4406             block_rsv->space_info != global_rsv->space_info)
4407                 global_rsv = NULL;
4408         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4409                                 num_bytes);
4410 }
4411
4412 /*
4413  * helper to calculate size of global block reservation.
4414  * the desired value is sum of space used by extent tree,
4415  * checksum tree and root tree
4416  */
4417 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4418 {
4419         struct btrfs_space_info *sinfo;
4420         u64 num_bytes;
4421         u64 meta_used;
4422         u64 data_used;
4423         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4424
4425         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4426         spin_lock(&sinfo->lock);
4427         data_used = sinfo->bytes_used;
4428         spin_unlock(&sinfo->lock);
4429
4430         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4431         spin_lock(&sinfo->lock);
4432         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4433                 data_used = 0;
4434         meta_used = sinfo->bytes_used;
4435         spin_unlock(&sinfo->lock);
4436
4437         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4438                     csum_size * 2;
4439         num_bytes += div64_u64(data_used + meta_used, 50);
4440
4441         if (num_bytes * 3 > meta_used)
4442                 num_bytes = div64_u64(meta_used, 3);
4443
4444         return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4445 }
4446
4447 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4448 {
4449         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4450         struct btrfs_space_info *sinfo = block_rsv->space_info;
4451         u64 num_bytes;
4452
4453         num_bytes = calc_global_metadata_size(fs_info);
4454
4455         spin_lock(&sinfo->lock);
4456         spin_lock(&block_rsv->lock);
4457
4458         block_rsv->size = num_bytes;
4459
4460         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4461                     sinfo->bytes_reserved + sinfo->bytes_readonly +
4462                     sinfo->bytes_may_use;
4463
4464         if (sinfo->total_bytes > num_bytes) {
4465                 num_bytes = sinfo->total_bytes - num_bytes;
4466                 block_rsv->reserved += num_bytes;
4467                 sinfo->bytes_may_use += num_bytes;
4468                 trace_btrfs_space_reservation(fs_info, "space_info",
4469                                       sinfo->flags, num_bytes, 1);
4470         }
4471
4472         if (block_rsv->reserved >= block_rsv->size) {
4473                 num_bytes = block_rsv->reserved - block_rsv->size;
4474                 sinfo->bytes_may_use -= num_bytes;
4475                 trace_btrfs_space_reservation(fs_info, "space_info",
4476                                       sinfo->flags, num_bytes, 0);
4477                 sinfo->reservation_progress++;
4478                 block_rsv->reserved = block_rsv->size;
4479                 block_rsv->full = 1;
4480         }
4481
4482         spin_unlock(&block_rsv->lock);
4483         spin_unlock(&sinfo->lock);
4484 }
4485
4486 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4487 {
4488         struct btrfs_space_info *space_info;
4489
4490         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4491         fs_info->chunk_block_rsv.space_info = space_info;
4492
4493         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4494         fs_info->global_block_rsv.space_info = space_info;
4495         fs_info->delalloc_block_rsv.space_info = space_info;
4496         fs_info->trans_block_rsv.space_info = space_info;
4497         fs_info->empty_block_rsv.space_info = space_info;
4498         fs_info->delayed_block_rsv.space_info = space_info;
4499
4500         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4501         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4502         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4503         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4504         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4505
4506         update_global_block_rsv(fs_info);
4507 }
4508
4509 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4510 {
4511         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4512                                 (u64)-1);
4513         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4514         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4515         WARN_ON(fs_info->trans_block_rsv.size > 0);
4516         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4517         WARN_ON(fs_info->chunk_block_rsv.size > 0);
4518         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4519         WARN_ON(fs_info->delayed_block_rsv.size > 0);
4520         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4521 }
4522
4523 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4524                                   struct btrfs_root *root)
4525 {
4526         if (!trans->block_rsv)
4527                 return;
4528
4529         if (!trans->bytes_reserved)
4530                 return;
4531
4532         trace_btrfs_space_reservation(root->fs_info, "transaction",
4533                                       trans->transid, trans->bytes_reserved, 0);
4534         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4535         trans->bytes_reserved = 0;
4536 }
4537
4538 /* Can only return 0 or -ENOSPC */
4539 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4540                                   struct inode *inode)
4541 {
4542         struct btrfs_root *root = BTRFS_I(inode)->root;
4543         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4544         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4545
4546         /*
4547          * We need to hold space in order to delete our orphan item once we've
4548          * added it, so this takes the reservation so we can release it later
4549          * when we are truly done with the orphan item.
4550          */
4551         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4552         trace_btrfs_space_reservation(root->fs_info, "orphan",
4553                                       btrfs_ino(inode), num_bytes, 1);
4554         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4555 }
4556
4557 void btrfs_orphan_release_metadata(struct inode *inode)
4558 {
4559         struct btrfs_root *root = BTRFS_I(inode)->root;
4560         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4561         trace_btrfs_space_reservation(root->fs_info, "orphan",
4562                                       btrfs_ino(inode), num_bytes, 0);
4563         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4564 }
4565
4566 /*
4567  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4568  * root: the root of the parent directory
4569  * rsv: block reservation
4570  * items: the number of items that we need do reservation
4571  * qgroup_reserved: used to return the reserved size in qgroup
4572  *
4573  * This function is used to reserve the space for snapshot/subvolume
4574  * creation and deletion. Those operations are different with the
4575  * common file/directory operations, they change two fs/file trees
4576  * and root tree, the number of items that the qgroup reserves is
4577  * different with the free space reservation. So we can not use
4578  * the space reseravtion mechanism in start_transaction().
4579  */
4580 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4581                                      struct btrfs_block_rsv *rsv,
4582                                      int items,
4583                                      u64 *qgroup_reserved)
4584 {
4585         u64 num_bytes;
4586         int ret;
4587
4588         if (root->fs_info->quota_enabled) {
4589                 /* One for parent inode, two for dir entries */
4590                 num_bytes = 3 * root->leafsize;
4591                 ret = btrfs_qgroup_reserve(root, num_bytes);
4592                 if (ret)
4593                         return ret;
4594         } else {
4595                 num_bytes = 0;
4596         }
4597
4598         *qgroup_reserved = num_bytes;
4599
4600         num_bytes = btrfs_calc_trans_metadata_size(root, items);
4601         rsv->space_info = __find_space_info(root->fs_info,
4602                                             BTRFS_BLOCK_GROUP_METADATA);
4603         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4604                                   BTRFS_RESERVE_FLUSH_ALL);
4605         if (ret) {
4606                 if (*qgroup_reserved)
4607                         btrfs_qgroup_free(root, *qgroup_reserved);
4608         }
4609
4610         return ret;
4611 }
4612
4613 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4614                                       struct btrfs_block_rsv *rsv,
4615                                       u64 qgroup_reserved)
4616 {
4617         btrfs_block_rsv_release(root, rsv, (u64)-1);
4618         if (qgroup_reserved)
4619                 btrfs_qgroup_free(root, qgroup_reserved);
4620 }
4621
4622 /**
4623  * drop_outstanding_extent - drop an outstanding extent
4624  * @inode: the inode we're dropping the extent for
4625  *
4626  * This is called when we are freeing up an outstanding extent, either called
4627  * after an error or after an extent is written.  This will return the number of
4628  * reserved extents that need to be freed.  This must be called with
4629  * BTRFS_I(inode)->lock held.
4630  */
4631 static unsigned drop_outstanding_extent(struct inode *inode)
4632 {
4633         unsigned drop_inode_space = 0;
4634         unsigned dropped_extents = 0;
4635
4636         BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4637         BTRFS_I(inode)->outstanding_extents--;
4638
4639         if (BTRFS_I(inode)->outstanding_extents == 0 &&
4640             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4641                                &BTRFS_I(inode)->runtime_flags))
4642                 drop_inode_space = 1;
4643
4644         /*
4645          * If we have more or the same amount of outsanding extents than we have
4646          * reserved then we need to leave the reserved extents count alone.
4647          */
4648         if (BTRFS_I(inode)->outstanding_extents >=
4649             BTRFS_I(inode)->reserved_extents)
4650                 return drop_inode_space;
4651
4652         dropped_extents = BTRFS_I(inode)->reserved_extents -
4653                 BTRFS_I(inode)->outstanding_extents;
4654         BTRFS_I(inode)->reserved_extents -= dropped_extents;
4655         return dropped_extents + drop_inode_space;
4656 }
4657
4658 /**
4659  * calc_csum_metadata_size - return the amount of metada space that must be
4660  *      reserved/free'd for the given bytes.
4661  * @inode: the inode we're manipulating
4662  * @num_bytes: the number of bytes in question
4663  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4664  *
4665  * This adjusts the number of csum_bytes in the inode and then returns the
4666  * correct amount of metadata that must either be reserved or freed.  We
4667  * calculate how many checksums we can fit into one leaf and then divide the
4668  * number of bytes that will need to be checksumed by this value to figure out
4669  * how many checksums will be required.  If we are adding bytes then the number
4670  * may go up and we will return the number of additional bytes that must be
4671  * reserved.  If it is going down we will return the number of bytes that must
4672  * be freed.
4673  *
4674  * This must be called with BTRFS_I(inode)->lock held.
4675  */
4676 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4677                                    int reserve)
4678 {
4679         struct btrfs_root *root = BTRFS_I(inode)->root;
4680         u64 csum_size;
4681         int num_csums_per_leaf;
4682         int num_csums;
4683         int old_csums;
4684
4685         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4686             BTRFS_I(inode)->csum_bytes == 0)
4687                 return 0;
4688
4689         old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4690         if (reserve)
4691                 BTRFS_I(inode)->csum_bytes += num_bytes;
4692         else
4693                 BTRFS_I(inode)->csum_bytes -= num_bytes;
4694         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4695         num_csums_per_leaf = (int)div64_u64(csum_size,
4696                                             sizeof(struct btrfs_csum_item) +
4697                                             sizeof(struct btrfs_disk_key));
4698         num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4699         num_csums = num_csums + num_csums_per_leaf - 1;
4700         num_csums = num_csums / num_csums_per_leaf;
4701
4702         old_csums = old_csums + num_csums_per_leaf - 1;
4703         old_csums = old_csums / num_csums_per_leaf;
4704
4705         /* No change, no need to reserve more */
4706         if (old_csums == num_csums)
4707                 return 0;
4708
4709         if (reserve)
4710                 return btrfs_calc_trans_metadata_size(root,
4711                                                       num_csums - old_csums);
4712
4713         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4714 }
4715
4716 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4717 {
4718         struct btrfs_root *root = BTRFS_I(inode)->root;
4719         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4720         u64 to_reserve = 0;
4721         u64 csum_bytes;
4722         unsigned nr_extents = 0;
4723         int extra_reserve = 0;
4724         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4725         int ret = 0;
4726         bool delalloc_lock = true;
4727         u64 to_free = 0;
4728         unsigned dropped;
4729
4730         /* If we are a free space inode we need to not flush since we will be in
4731          * the middle of a transaction commit.  We also don't need the delalloc
4732          * mutex since we won't race with anybody.  We need this mostly to make
4733          * lockdep shut its filthy mouth.
4734          */
4735         if (btrfs_is_free_space_inode(inode)) {
4736                 flush = BTRFS_RESERVE_NO_FLUSH;
4737                 delalloc_lock = false;
4738         }
4739
4740         if (flush != BTRFS_RESERVE_NO_FLUSH &&
4741             btrfs_transaction_in_commit(root->fs_info))
4742                 schedule_timeout(1);
4743
4744         if (delalloc_lock)
4745                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4746
4747         num_bytes = ALIGN(num_bytes, root->sectorsize);
4748
4749         spin_lock(&BTRFS_I(inode)->lock);
4750         BTRFS_I(inode)->outstanding_extents++;
4751
4752         if (BTRFS_I(inode)->outstanding_extents >
4753             BTRFS_I(inode)->reserved_extents)
4754                 nr_extents = BTRFS_I(inode)->outstanding_extents -
4755                         BTRFS_I(inode)->reserved_extents;
4756
4757         /*
4758          * Add an item to reserve for updating the inode when we complete the
4759          * delalloc io.
4760          */
4761         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4762                       &BTRFS_I(inode)->runtime_flags)) {
4763                 nr_extents++;
4764                 extra_reserve = 1;
4765         }
4766
4767         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4768         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4769         csum_bytes = BTRFS_I(inode)->csum_bytes;
4770         spin_unlock(&BTRFS_I(inode)->lock);
4771
4772         if (root->fs_info->quota_enabled) {
4773                 ret = btrfs_qgroup_reserve(root, num_bytes +
4774                                            nr_extents * root->leafsize);
4775                 if (ret)
4776                         goto out_fail;
4777         }
4778
4779         ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4780         if (unlikely(ret)) {
4781                 if (root->fs_info->quota_enabled)
4782                         btrfs_qgroup_free(root, num_bytes +
4783                                                 nr_extents * root->leafsize);
4784                 goto out_fail;
4785         }
4786
4787         spin_lock(&BTRFS_I(inode)->lock);
4788         if (extra_reserve) {
4789                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4790                         &BTRFS_I(inode)->runtime_flags);
4791                 nr_extents--;
4792         }
4793         BTRFS_I(inode)->reserved_extents += nr_extents;
4794         spin_unlock(&BTRFS_I(inode)->lock);
4795
4796         if (delalloc_lock)
4797                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4798
4799         if (to_reserve)
4800                 trace_btrfs_space_reservation(root->fs_info,"delalloc",
4801                                               btrfs_ino(inode), to_reserve, 1);
4802         block_rsv_add_bytes(block_rsv, to_reserve, 1);
4803
4804         return 0;
4805
4806 out_fail:
4807         spin_lock(&BTRFS_I(inode)->lock);
4808         dropped = drop_outstanding_extent(inode);
4809         /*
4810          * If the inodes csum_bytes is the same as the original
4811          * csum_bytes then we know we haven't raced with any free()ers
4812          * so we can just reduce our inodes csum bytes and carry on.
4813          * Otherwise we have to do the normal free thing to account for
4814          * the case that the free side didn't free up its reserve
4815          * because of this outstanding reservation.
4816          */
4817         if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4818                 calc_csum_metadata_size(inode, num_bytes, 0);
4819         else
4820                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4821         spin_unlock(&BTRFS_I(inode)->lock);
4822         if (dropped)
4823                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4824
4825         if (to_free) {
4826                 btrfs_block_rsv_release(root, block_rsv, to_free);
4827                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4828                                               btrfs_ino(inode), to_free, 0);
4829         }
4830         if (delalloc_lock)
4831                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4832         return ret;
4833 }
4834
4835 /**
4836  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4837  * @inode: the inode to release the reservation for
4838  * @num_bytes: the number of bytes we're releasing
4839  *
4840  * This will release the metadata reservation for an inode.  This can be called
4841  * once we complete IO for a given set of bytes to release their metadata
4842  * reservations.
4843  */
4844 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4845 {
4846         struct btrfs_root *root = BTRFS_I(inode)->root;
4847         u64 to_free = 0;
4848         unsigned dropped;
4849
4850         num_bytes = ALIGN(num_bytes, root->sectorsize);
4851         spin_lock(&BTRFS_I(inode)->lock);
4852         dropped = drop_outstanding_extent(inode);
4853
4854         if (num_bytes)
4855                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4856         spin_unlock(&BTRFS_I(inode)->lock);
4857         if (dropped > 0)
4858                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4859
4860         trace_btrfs_space_reservation(root->fs_info, "delalloc",
4861                                       btrfs_ino(inode), to_free, 0);
4862         if (root->fs_info->quota_enabled) {
4863                 btrfs_qgroup_free(root, num_bytes +
4864                                         dropped * root->leafsize);
4865         }
4866
4867         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4868                                 to_free);
4869 }
4870
4871 /**
4872  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4873  * @inode: inode we're writing to
4874  * @num_bytes: the number of bytes we want to allocate
4875  *
4876  * This will do the following things
4877  *
4878  * o reserve space in the data space info for num_bytes
4879  * o reserve space in the metadata space info based on number of outstanding
4880  *   extents and how much csums will be needed
4881  * o add to the inodes ->delalloc_bytes
4882  * o add it to the fs_info's delalloc inodes list.
4883  *
4884  * This will return 0 for success and -ENOSPC if there is no space left.
4885  */
4886 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4887 {
4888         int ret;
4889
4890         ret = btrfs_check_data_free_space(inode, num_bytes);
4891         if (ret)
4892                 return ret;
4893
4894         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4895         if (ret) {
4896                 btrfs_free_reserved_data_space(inode, num_bytes);
4897                 return ret;
4898         }
4899
4900         return 0;
4901 }
4902
4903 /**
4904  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4905  * @inode: inode we're releasing space for
4906  * @num_bytes: the number of bytes we want to free up
4907  *
4908  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4909  * called in the case that we don't need the metadata AND data reservations
4910  * anymore.  So if there is an error or we insert an inline extent.
4911  *
4912  * This function will release the metadata space that was not used and will
4913  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4914  * list if there are no delalloc bytes left.
4915  */
4916 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4917 {
4918         btrfs_delalloc_release_metadata(inode, num_bytes);
4919         btrfs_free_reserved_data_space(inode, num_bytes);
4920 }
4921
4922 static int update_block_group(struct btrfs_root *root,
4923                               u64 bytenr, u64 num_bytes, int alloc)
4924 {
4925         struct btrfs_block_group_cache *cache = NULL;
4926         struct btrfs_fs_info *info = root->fs_info;
4927         u64 total = num_bytes;
4928         u64 old_val;
4929         u64 byte_in_group;
4930         int factor;
4931
4932         /* block accounting for super block */
4933         spin_lock(&info->delalloc_lock);
4934         old_val = btrfs_super_bytes_used(info->super_copy);
4935         if (alloc)
4936                 old_val += num_bytes;
4937         else
4938                 old_val -= num_bytes;
4939         btrfs_set_super_bytes_used(info->super_copy, old_val);
4940         spin_unlock(&info->delalloc_lock);
4941
4942         while (total) {
4943                 cache = btrfs_lookup_block_group(info, bytenr);
4944                 if (!cache)
4945                         return -ENOENT;
4946                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4947                                     BTRFS_BLOCK_GROUP_RAID1 |
4948                                     BTRFS_BLOCK_GROUP_RAID10))
4949                         factor = 2;
4950                 else
4951                         factor = 1;
4952                 /*
4953                  * If this block group has free space cache written out, we
4954                  * need to make sure to load it if we are removing space.  This
4955                  * is because we need the unpinning stage to actually add the
4956                  * space back to the block group, otherwise we will leak space.
4957                  */
4958                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4959                         cache_block_group(cache, 1);
4960
4961                 byte_in_group = bytenr - cache->key.objectid;
4962                 WARN_ON(byte_in_group > cache->key.offset);
4963
4964                 spin_lock(&cache->space_info->lock);
4965                 spin_lock(&cache->lock);
4966
4967                 if (btrfs_test_opt(root, SPACE_CACHE) &&
4968                     cache->disk_cache_state < BTRFS_DC_CLEAR)
4969                         cache->disk_cache_state = BTRFS_DC_CLEAR;
4970
4971                 cache->dirty = 1;
4972                 old_val = btrfs_block_group_used(&cache->item);
4973                 num_bytes = min(total, cache->key.offset - byte_in_group);
4974                 if (alloc) {
4975                         old_val += num_bytes;
4976                         btrfs_set_block_group_used(&cache->item, old_val);
4977                         cache->reserved -= num_bytes;
4978                         cache->space_info->bytes_reserved -= num_bytes;
4979                         cache->space_info->bytes_used += num_bytes;
4980                         cache->space_info->disk_used += num_bytes * factor;
4981                         spin_unlock(&cache->lock);
4982                         spin_unlock(&cache->space_info->lock);
4983                 } else {
4984                         old_val -= num_bytes;
4985                         btrfs_set_block_group_used(&cache->item, old_val);
4986                         cache->pinned += num_bytes;
4987                         cache->space_info->bytes_pinned += num_bytes;
4988                         cache->space_info->bytes_used -= num_bytes;
4989                         cache->space_info->disk_used -= num_bytes * factor;
4990                         spin_unlock(&cache->lock);
4991                         spin_unlock(&cache->space_info->lock);
4992
4993                         set_extent_dirty(info->pinned_extents,
4994                                          bytenr, bytenr + num_bytes - 1,
4995                                          GFP_NOFS | __GFP_NOFAIL);
4996                 }
4997                 btrfs_put_block_group(cache);
4998                 total -= num_bytes;
4999                 bytenr += num_bytes;
5000         }
5001         return 0;
5002 }
5003
5004 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5005 {
5006         struct btrfs_block_group_cache *cache;
5007         u64 bytenr;
5008
5009         spin_lock(&root->fs_info->block_group_cache_lock);
5010         bytenr = root->fs_info->first_logical_byte;
5011         spin_unlock(&root->fs_info->block_group_cache_lock);
5012
5013         if (bytenr < (u64)-1)
5014                 return bytenr;
5015
5016         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5017         if (!cache)
5018                 return 0;
5019
5020         bytenr = cache->key.objectid;
5021         btrfs_put_block_group(cache);
5022
5023         return bytenr;
5024 }
5025
5026 static int pin_down_extent(struct btrfs_root *root,
5027                            struct btrfs_block_group_cache *cache,
5028                            u64 bytenr, u64 num_bytes, int reserved)
5029 {
5030         spin_lock(&cache->space_info->lock);
5031         spin_lock(&cache->lock);
5032         cache->pinned += num_bytes;
5033         cache->space_info->bytes_pinned += num_bytes;
5034         if (reserved) {
5035                 cache->reserved -= num_bytes;
5036                 cache->space_info->bytes_reserved -= num_bytes;
5037         }
5038         spin_unlock(&cache->lock);
5039         spin_unlock(&cache->space_info->lock);
5040
5041         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5042                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5043         return 0;
5044 }
5045
5046 /*
5047  * this function must be called within transaction
5048  */
5049 int btrfs_pin_extent(struct btrfs_root *root,
5050                      u64 bytenr, u64 num_bytes, int reserved)
5051 {
5052         struct btrfs_block_group_cache *cache;
5053
5054         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5055         BUG_ON(!cache); /* Logic error */
5056
5057         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5058
5059         btrfs_put_block_group(cache);
5060         return 0;
5061 }
5062
5063 /*
5064  * this function must be called within transaction
5065  */
5066 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5067                                     u64 bytenr, u64 num_bytes)
5068 {
5069         struct btrfs_block_group_cache *cache;
5070
5071         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5072         BUG_ON(!cache); /* Logic error */
5073
5074         /*
5075          * pull in the free space cache (if any) so that our pin
5076          * removes the free space from the cache.  We have load_only set
5077          * to one because the slow code to read in the free extents does check
5078          * the pinned extents.
5079          */
5080         cache_block_group(cache, 1);
5081
5082         pin_down_extent(root, cache, bytenr, num_bytes, 0);
5083
5084         /* remove us from the free space cache (if we're there at all) */
5085         btrfs_remove_free_space(cache, bytenr, num_bytes);
5086         btrfs_put_block_group(cache);
5087         return 0;
5088 }
5089
5090 /**
5091  * btrfs_update_reserved_bytes - update the block_group and space info counters
5092  * @cache:      The cache we are manipulating
5093  * @num_bytes:  The number of bytes in question
5094  * @reserve:    One of the reservation enums
5095  *
5096  * This is called by the allocator when it reserves space, or by somebody who is
5097  * freeing space that was never actually used on disk.  For example if you
5098  * reserve some space for a new leaf in transaction A and before transaction A
5099  * commits you free that leaf, you call this with reserve set to 0 in order to
5100  * clear the reservation.
5101  *
5102  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
5103  * ENOSPC accounting.  For data we handle the reservation through clearing the
5104  * delalloc bits in the io_tree.  We have to do this since we could end up
5105  * allocating less disk space for the amount of data we have reserved in the
5106  * case of compression.
5107  *
5108  * If this is a reservation and the block group has become read only we cannot
5109  * make the reservation and return -EAGAIN, otherwise this function always
5110  * succeeds.
5111  */
5112 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5113                                        u64 num_bytes, int reserve)
5114 {
5115         struct btrfs_space_info *space_info = cache->space_info;
5116         int ret = 0;
5117
5118         spin_lock(&space_info->lock);
5119         spin_lock(&cache->lock);
5120         if (reserve != RESERVE_FREE) {
5121                 if (cache->ro) {
5122                         ret = -EAGAIN;
5123                 } else {
5124                         cache->reserved += num_bytes;
5125                         space_info->bytes_reserved += num_bytes;
5126                         if (reserve == RESERVE_ALLOC) {
5127                                 trace_btrfs_space_reservation(cache->fs_info,
5128                                                 "space_info", space_info->flags,
5129                                                 num_bytes, 0);
5130                                 space_info->bytes_may_use -= num_bytes;
5131                         }
5132                 }
5133         } else {
5134                 if (cache->ro)
5135                         space_info->bytes_readonly += num_bytes;
5136                 cache->reserved -= num_bytes;
5137                 space_info->bytes_reserved -= num_bytes;
5138                 space_info->reservation_progress++;
5139         }
5140         spin_unlock(&cache->lock);
5141         spin_unlock(&space_info->lock);
5142         return ret;
5143 }
5144
5145 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5146                                 struct btrfs_root *root)
5147 {
5148         struct btrfs_fs_info *fs_info = root->fs_info;
5149         struct btrfs_caching_control *next;
5150         struct btrfs_caching_control *caching_ctl;
5151         struct btrfs_block_group_cache *cache;
5152
5153         down_write(&fs_info->extent_commit_sem);
5154
5155         list_for_each_entry_safe(caching_ctl, next,
5156                                  &fs_info->caching_block_groups, list) {
5157                 cache = caching_ctl->block_group;
5158                 if (block_group_cache_done(cache)) {
5159                         cache->last_byte_to_unpin = (u64)-1;
5160                         list_del_init(&caching_ctl->list);
5161                         put_caching_control(caching_ctl);
5162                 } else {
5163                         cache->last_byte_to_unpin = caching_ctl->progress;
5164                 }
5165         }
5166
5167         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5168                 fs_info->pinned_extents = &fs_info->freed_extents[1];
5169         else
5170                 fs_info->pinned_extents = &fs_info->freed_extents[0];
5171
5172         up_write(&fs_info->extent_commit_sem);
5173
5174         update_global_block_rsv(fs_info);
5175 }
5176
5177 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5178 {
5179         struct btrfs_fs_info *fs_info = root->fs_info;
5180         struct btrfs_block_group_cache *cache = NULL;
5181         struct btrfs_space_info *space_info;
5182         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5183         u64 len;
5184         bool readonly;
5185
5186         while (start <= end) {
5187                 readonly = false;
5188                 if (!cache ||
5189                     start >= cache->key.objectid + cache->key.offset) {
5190                         if (cache)
5191                                 btrfs_put_block_group(cache);
5192                         cache = btrfs_lookup_block_group(fs_info, start);
5193                         BUG_ON(!cache); /* Logic error */
5194                 }
5195
5196                 len = cache->key.objectid + cache->key.offset - start;
5197                 len = min(len, end + 1 - start);
5198
5199                 if (start < cache->last_byte_to_unpin) {
5200                         len = min(len, cache->last_byte_to_unpin - start);
5201                         btrfs_add_free_space(cache, start, len);
5202                 }
5203
5204                 start += len;
5205                 space_info = cache->space_info;
5206
5207                 spin_lock(&space_info->lock);
5208                 spin_lock(&cache->lock);
5209                 cache->pinned -= len;
5210                 space_info->bytes_pinned -= len;
5211                 if (cache->ro) {
5212                         space_info->bytes_readonly += len;
5213                         readonly = true;
5214                 }
5215                 spin_unlock(&cache->lock);
5216                 if (!readonly && global_rsv->space_info == space_info) {
5217                         spin_lock(&global_rsv->lock);
5218                         if (!global_rsv->full) {
5219                                 len = min(len, global_rsv->size -
5220                                           global_rsv->reserved);
5221                                 global_rsv->reserved += len;
5222                                 space_info->bytes_may_use += len;
5223                                 if (global_rsv->reserved >= global_rsv->size)
5224                                         global_rsv->full = 1;
5225                         }
5226                         spin_unlock(&global_rsv->lock);
5227                 }
5228                 spin_unlock(&space_info->lock);
5229         }
5230
5231         if (cache)
5232                 btrfs_put_block_group(cache);
5233         return 0;
5234 }
5235
5236 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5237                                struct btrfs_root *root)
5238 {
5239         struct btrfs_fs_info *fs_info = root->fs_info;
5240         struct extent_io_tree *unpin;
5241         u64 start;
5242         u64 end;
5243         int ret;
5244
5245         if (trans->aborted)
5246                 return 0;
5247
5248         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5249                 unpin = &fs_info->freed_extents[1];
5250         else
5251                 unpin = &fs_info->freed_extents[0];
5252
5253         while (1) {
5254                 ret = find_first_extent_bit(unpin, 0, &start, &end,
5255                                             EXTENT_DIRTY, NULL);
5256                 if (ret)
5257                         break;
5258
5259                 if (btrfs_test_opt(root, DISCARD))
5260                         ret = btrfs_discard_extent(root, start,
5261                                                    end + 1 - start, NULL);
5262
5263                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5264                 unpin_extent_range(root, start, end);
5265                 cond_resched();
5266         }
5267
5268         return 0;
5269 }
5270
5271 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5272                                 struct btrfs_root *root,
5273                                 u64 bytenr, u64 num_bytes, u64 parent,
5274                                 u64 root_objectid, u64 owner_objectid,
5275                                 u64 owner_offset, int refs_to_drop,
5276                                 struct btrfs_delayed_extent_op *extent_op)
5277 {
5278         struct btrfs_key key;
5279         struct btrfs_path *path;
5280         struct btrfs_fs_info *info = root->fs_info;
5281         struct btrfs_root *extent_root = info->extent_root;
5282         struct extent_buffer *leaf;
5283         struct btrfs_extent_item *ei;
5284         struct btrfs_extent_inline_ref *iref;
5285         int ret;
5286         int is_data;
5287         int extent_slot = 0;
5288         int found_extent = 0;
5289         int num_to_del = 1;
5290         u32 item_size;
5291         u64 refs;
5292
5293         path = btrfs_alloc_path();
5294         if (!path)
5295                 return -ENOMEM;
5296
5297         path->reada = 1;
5298         path->leave_spinning = 1;
5299
5300         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5301         BUG_ON(!is_data && refs_to_drop != 1);
5302
5303         ret = lookup_extent_backref(trans, extent_root, path, &iref,
5304                                     bytenr, num_bytes, parent,
5305                                     root_objectid, owner_objectid,
5306                                     owner_offset);
5307         if (ret == 0) {
5308                 extent_slot = path->slots[0];
5309                 while (extent_slot >= 0) {
5310                         btrfs_item_key_to_cpu(path->nodes[0], &key,
5311                                               extent_slot);
5312                         if (key.objectid != bytenr)
5313                                 break;
5314                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5315                             key.offset == num_bytes) {
5316                                 found_extent = 1;
5317                                 break;
5318                         }
5319                         if (path->slots[0] - extent_slot > 5)
5320                                 break;
5321                         extent_slot--;
5322                 }
5323 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5324                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5325                 if (found_extent && item_size < sizeof(*ei))
5326                         found_extent = 0;
5327 #endif
5328                 if (!found_extent) {
5329                         BUG_ON(iref);
5330                         ret = remove_extent_backref(trans, extent_root, path,
5331                                                     NULL, refs_to_drop,
5332                                                     is_data);
5333                         if (ret) {
5334                                 btrfs_abort_transaction(trans, extent_root, ret);
5335                                 goto out;
5336                         }
5337                         btrfs_release_path(path);
5338                         path->leave_spinning = 1;
5339
5340                         key.objectid = bytenr;
5341                         key.type = BTRFS_EXTENT_ITEM_KEY;
5342                         key.offset = num_bytes;
5343
5344                         ret = btrfs_search_slot(trans, extent_root,
5345                                                 &key, path, -1, 1);
5346                         if (ret) {
5347                                 printk(KERN_ERR "umm, got %d back from search"
5348                                        ", was looking for %llu\n", ret,
5349                                        (unsigned long long)bytenr);
5350                                 if (ret > 0)
5351                                         btrfs_print_leaf(extent_root,
5352                                                          path->nodes[0]);
5353                         }
5354                         if (ret < 0) {
5355                                 btrfs_abort_transaction(trans, extent_root, ret);
5356                                 goto out;
5357                         }
5358                         extent_slot = path->slots[0];
5359                 }
5360         } else if (ret == -ENOENT) {
5361                 btrfs_print_leaf(extent_root, path->nodes[0]);
5362                 WARN_ON(1);
5363                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5364                        "parent %llu root %llu  owner %llu offset %llu\n",
5365                        (unsigned long long)bytenr,
5366                        (unsigned long long)parent,
5367                        (unsigned long long)root_objectid,
5368                        (unsigned long long)owner_objectid,
5369                        (unsigned long long)owner_offset);
5370         } else {
5371                 btrfs_abort_transaction(trans, extent_root, ret);
5372                 goto out;
5373         }
5374
5375         leaf = path->nodes[0];
5376         item_size = btrfs_item_size_nr(leaf, extent_slot);
5377 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5378         if (item_size < sizeof(*ei)) {
5379                 BUG_ON(found_extent || extent_slot != path->slots[0]);
5380                 ret = convert_extent_item_v0(trans, extent_root, path,
5381                                              owner_objectid, 0);
5382                 if (ret < 0) {
5383                         btrfs_abort_transaction(trans, extent_root, ret);
5384                         goto out;
5385                 }
5386
5387                 btrfs_release_path(path);
5388                 path->leave_spinning = 1;
5389
5390                 key.objectid = bytenr;
5391                 key.type = BTRFS_EXTENT_ITEM_KEY;
5392                 key.offset = num_bytes;
5393
5394                 ret = btrfs_search_slot(trans, extent_root, &key, path,
5395                                         -1, 1);
5396                 if (ret) {
5397                         printk(KERN_ERR "umm, got %d back from search"
5398                                ", was looking for %llu\n", ret,
5399                                (unsigned long long)bytenr);
5400                         btrfs_print_leaf(extent_root, path->nodes[0]);
5401                 }
5402                 if (ret < 0) {
5403                         btrfs_abort_transaction(trans, extent_root, ret);
5404                         goto out;
5405                 }
5406
5407                 extent_slot = path->slots[0];
5408                 leaf = path->nodes[0];
5409                 item_size = btrfs_item_size_nr(leaf, extent_slot);
5410         }
5411 #endif
5412         BUG_ON(item_size < sizeof(*ei));
5413         ei = btrfs_item_ptr(leaf, extent_slot,
5414                             struct btrfs_extent_item);
5415         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5416                 struct btrfs_tree_block_info *bi;
5417                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5418                 bi = (struct btrfs_tree_block_info *)(ei + 1);
5419                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5420         }
5421
5422         refs = btrfs_extent_refs(leaf, ei);
5423         BUG_ON(refs < refs_to_drop);
5424         refs -= refs_to_drop;
5425
5426         if (refs > 0) {
5427                 if (extent_op)
5428                         __run_delayed_extent_op(extent_op, leaf, ei);
5429                 /*
5430                  * In the case of inline back ref, reference count will
5431                  * be updated by remove_extent_backref
5432                  */
5433                 if (iref) {
5434                         BUG_ON(!found_extent);
5435                 } else {
5436                         btrfs_set_extent_refs(leaf, ei, refs);
5437                         btrfs_mark_buffer_dirty(leaf);
5438                 }
5439                 if (found_extent) {
5440                         ret = remove_extent_backref(trans, extent_root, path,
5441                                                     iref, refs_to_drop,
5442                                                     is_data);
5443                         if (ret) {
5444                                 btrfs_abort_transaction(trans, extent_root, ret);
5445                                 goto out;
5446                         }
5447                 }
5448         } else {
5449                 if (found_extent) {
5450                         BUG_ON(is_data && refs_to_drop !=
5451                                extent_data_ref_count(root, path, iref));
5452                         if (iref) {
5453                                 BUG_ON(path->slots[0] != extent_slot);
5454                         } else {
5455                                 BUG_ON(path->slots[0] != extent_slot + 1);
5456                                 path->slots[0] = extent_slot;
5457                                 num_to_del = 2;
5458                         }
5459                 }
5460
5461                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5462                                       num_to_del);
5463                 if (ret) {
5464                         btrfs_abort_transaction(trans, extent_root, ret);
5465                         goto out;
5466                 }
5467                 btrfs_release_path(path);
5468
5469                 if (is_data) {
5470                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5471                         if (ret) {
5472                                 btrfs_abort_transaction(trans, extent_root, ret);
5473                                 goto out;
5474                         }
5475                 }
5476
5477                 ret = update_block_group(root, bytenr, num_bytes, 0);
5478                 if (ret) {
5479                         btrfs_abort_transaction(trans, extent_root, ret);
5480                         goto out;
5481                 }
5482         }
5483 out:
5484         btrfs_free_path(path);
5485         return ret;
5486 }
5487
5488 /*
5489  * when we free an block, it is possible (and likely) that we free the last
5490  * delayed ref for that extent as well.  This searches the delayed ref tree for
5491  * a given extent, and if there are no other delayed refs to be processed, it
5492  * removes it from the tree.
5493  */
5494 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5495                                       struct btrfs_root *root, u64 bytenr)
5496 {
5497         struct btrfs_delayed_ref_head *head;
5498         struct btrfs_delayed_ref_root *delayed_refs;
5499         struct btrfs_delayed_ref_node *ref;
5500         struct rb_node *node;
5501         int ret = 0;
5502
5503         delayed_refs = &trans->transaction->delayed_refs;
5504         spin_lock(&delayed_refs->lock);
5505         head = btrfs_find_delayed_ref_head(trans, bytenr);
5506         if (!head)
5507                 goto out;
5508
5509         node = rb_prev(&head->node.rb_node);
5510         if (!node)
5511                 goto out;
5512
5513         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5514
5515         /* there are still entries for this ref, we can't drop it */
5516         if (ref->bytenr == bytenr)
5517                 goto out;
5518
5519         if (head->extent_op) {
5520                 if (!head->must_insert_reserved)
5521                         goto out;
5522                 btrfs_free_delayed_extent_op(head->extent_op);
5523                 head->extent_op = NULL;
5524         }
5525
5526         /*
5527          * waiting for the lock here would deadlock.  If someone else has it
5528          * locked they are already in the process of dropping it anyway
5529          */
5530         if (!mutex_trylock(&head->mutex))
5531                 goto out;
5532
5533         /*
5534          * at this point we have a head with no other entries.  Go
5535          * ahead and process it.
5536          */
5537         head->node.in_tree = 0;
5538         rb_erase(&head->node.rb_node, &delayed_refs->root);
5539
5540         delayed_refs->num_entries--;
5541
5542         /*
5543          * we don't take a ref on the node because we're removing it from the
5544          * tree, so we just steal the ref the tree was holding.
5545          */
5546         delayed_refs->num_heads--;
5547         if (list_empty(&head->cluster))
5548                 delayed_refs->num_heads_ready--;
5549
5550         list_del_init(&head->cluster);
5551         spin_unlock(&delayed_refs->lock);
5552
5553         BUG_ON(head->extent_op);
5554         if (head->must_insert_reserved)
5555                 ret = 1;
5556
5557         mutex_unlock(&head->mutex);
5558         btrfs_put_delayed_ref(&head->node);
5559         return ret;
5560 out:
5561         spin_unlock(&delayed_refs->lock);
5562         return 0;
5563 }
5564
5565 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5566                            struct btrfs_root *root,
5567                            struct extent_buffer *buf,
5568                            u64 parent, int last_ref)
5569 {
5570         struct btrfs_block_group_cache *cache = NULL;
5571         int ret;
5572
5573         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5574                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5575                                         buf->start, buf->len,
5576                                         parent, root->root_key.objectid,
5577                                         btrfs_header_level(buf),
5578                                         BTRFS_DROP_DELAYED_REF, NULL, 0);
5579                 BUG_ON(ret); /* -ENOMEM */
5580         }
5581
5582         if (!last_ref)
5583                 return;
5584
5585         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5586
5587         if (btrfs_header_generation(buf) == trans->transid) {
5588                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5589                         ret = check_ref_cleanup(trans, root, buf->start);
5590                         if (!ret)
5591                                 goto out;
5592                 }
5593
5594                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5595                         pin_down_extent(root, cache, buf->start, buf->len, 1);
5596                         goto out;
5597                 }
5598
5599                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5600
5601                 btrfs_add_free_space(cache, buf->start, buf->len);
5602                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5603         }
5604 out:
5605         /*
5606          * Deleting the buffer, clear the corrupt flag since it doesn't matter
5607          * anymore.
5608          */
5609         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5610         btrfs_put_block_group(cache);
5611 }
5612
5613 /* Can return -ENOMEM */
5614 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5615                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5616                       u64 owner, u64 offset, int for_cow)
5617 {
5618         int ret;
5619         struct btrfs_fs_info *fs_info = root->fs_info;
5620
5621         /*
5622          * tree log blocks never actually go into the extent allocation
5623          * tree, just update pinning info and exit early.
5624          */
5625         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5626                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5627                 /* unlocks the pinned mutex */
5628                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5629                 ret = 0;
5630         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5631                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5632                                         num_bytes,
5633                                         parent, root_objectid, (int)owner,
5634                                         BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5635         } else {
5636                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5637                                                 num_bytes,
5638                                                 parent, root_objectid, owner,
5639                                                 offset, BTRFS_DROP_DELAYED_REF,
5640                                                 NULL, for_cow);
5641         }
5642         return ret;
5643 }
5644
5645 static u64 stripe_align(struct btrfs_root *root,
5646                         struct btrfs_block_group_cache *cache,
5647                         u64 val, u64 num_bytes)
5648 {
5649         u64 ret = ALIGN(val, root->stripesize);
5650         return ret;
5651 }
5652
5653 /*
5654  * when we wait for progress in the block group caching, its because
5655  * our allocation attempt failed at least once.  So, we must sleep
5656  * and let some progress happen before we try again.
5657  *
5658  * This function will sleep at least once waiting for new free space to
5659  * show up, and then it will check the block group free space numbers
5660  * for our min num_bytes.  Another option is to have it go ahead
5661  * and look in the rbtree for a free extent of a given size, but this
5662  * is a good start.
5663  */
5664 static noinline int
5665 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5666                                 u64 num_bytes)
5667 {
5668         struct btrfs_caching_control *caching_ctl;
5669
5670         caching_ctl = get_caching_control(cache);
5671         if (!caching_ctl)
5672                 return 0;
5673
5674         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5675                    (cache->free_space_ctl->free_space >= num_bytes));
5676
5677         put_caching_control(caching_ctl);
5678         return 0;
5679 }
5680
5681 static noinline int
5682 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5683 {
5684         struct btrfs_caching_control *caching_ctl;
5685
5686         caching_ctl = get_caching_control(cache);
5687         if (!caching_ctl)
5688                 return 0;
5689
5690         wait_event(caching_ctl->wait, block_group_cache_done(cache));
5691
5692         put_caching_control(caching_ctl);
5693         return 0;
5694 }
5695
5696 int __get_raid_index(u64 flags)
5697 {
5698         if (flags & BTRFS_BLOCK_GROUP_RAID10)
5699                 return BTRFS_RAID_RAID10;
5700         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5701                 return BTRFS_RAID_RAID1;
5702         else if (flags & BTRFS_BLOCK_GROUP_DUP)
5703                 return BTRFS_RAID_DUP;
5704         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5705                 return BTRFS_RAID_RAID0;
5706         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5707                 return BTRFS_RAID_RAID5;
5708         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5709                 return BTRFS_RAID_RAID6;
5710
5711         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5712 }
5713
5714 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5715 {
5716         return __get_raid_index(cache->flags);
5717 }
5718
/*
 * Stages of the allocation loop in find_free_extent().
 *
 * The numeric order matters: the loop counter is compared against these
 * values with '>' and '>=', so later stages must keep larger values.
 */
enum btrfs_loop_type {
        /* waiting on block group caching only happens above this stage */
        LOOP_CACHING_NOWAIT = 0,
        LOOP_CACHING_WAIT,
        /* NOTE(review): presumably the stage that allocates a new chunk */
        LOOP_ALLOC_CHUNK,
        /* from this stage on no new cluster is set up */
        LOOP_NO_EMPTY_SIZE,
};
5725
5726 /*
5727  * walks the btree of allocated extents and find a hole of a given size.
5728  * The key ins is changed to record the hole:
5729  * ins->objectid == block start
5730  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5731  * ins->offset == number of blocks
5732  * Any available blocks before search_start are skipped.
5733  */
5734 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5735                                      struct btrfs_root *orig_root,
5736                                      u64 num_bytes, u64 empty_size,
5737                                      u64 hint_byte, struct btrfs_key *ins,
5738                                      u64 data)
5739 {
5740         int ret = 0;
5741         struct btrfs_root *root = orig_root->fs_info->extent_root;
5742         struct btrfs_free_cluster *last_ptr = NULL;
5743         struct btrfs_block_group_cache *block_group = NULL;
5744         struct btrfs_block_group_cache *used_block_group;
5745         u64 search_start = 0;
5746         int empty_cluster = 2 * 1024 * 1024;
5747         struct btrfs_space_info *space_info;
5748         int loop = 0;
5749         int index = __get_raid_index(data);
5750         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5751                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5752         bool found_uncached_bg = false;
5753         bool failed_cluster_refill = false;
5754         bool failed_alloc = false;
5755         bool use_cluster = true;
5756         bool have_caching_bg = false;
5757
5758         WARN_ON(num_bytes < root->sectorsize);
5759         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5760         ins->objectid = 0;
5761         ins->offset = 0;
5762
5763         trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5764
5765         space_info = __find_space_info(root->fs_info, data);
5766         if (!space_info) {
5767                 printk(KERN_ERR "No space info for %llu\n", data);
5768                 return -ENOSPC;
5769         }
5770
5771         /*
5772          * If the space info is for both data and metadata it means we have a
5773          * small filesystem and we can't use the clustering stuff.
5774          */
5775         if (btrfs_mixed_space_info(space_info))
5776                 use_cluster = false;
5777
5778         if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5779                 last_ptr = &root->fs_info->meta_alloc_cluster;
5780                 if (!btrfs_test_opt(root, SSD))
5781                         empty_cluster = 64 * 1024;
5782         }
5783
5784         if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5785             btrfs_test_opt(root, SSD)) {
5786                 last_ptr = &root->fs_info->data_alloc_cluster;
5787         }
5788
5789         if (last_ptr) {
5790                 spin_lock(&last_ptr->lock);
5791                 if (last_ptr->block_group)
5792                         hint_byte = last_ptr->window_start;
5793                 spin_unlock(&last_ptr->lock);
5794         }
5795
5796         search_start = max(search_start, first_logical_byte(root, 0));
5797         search_start = max(search_start, hint_byte);
5798
5799         if (!last_ptr)
5800                 empty_cluster = 0;
5801
5802         if (search_start == hint_byte) {
5803                 block_group = btrfs_lookup_block_group(root->fs_info,
5804                                                        search_start);
5805                 used_block_group = block_group;
5806                 /*
5807                  * we don't want to use the block group if it doesn't match our
5808                  * allocation bits, or if its not cached.
5809                  *
5810                  * However if we are re-searching with an ideal block group
5811                  * picked out then we don't care that the block group is cached.
5812                  */
5813                 if (block_group && block_group_bits(block_group, data) &&
5814                     block_group->cached != BTRFS_CACHE_NO) {
5815                         down_read(&space_info->groups_sem);
5816                         if (list_empty(&block_group->list) ||
5817                             block_group->ro) {
5818                                 /*
5819                                  * someone is removing this block group,
5820                                  * we can't jump into the have_block_group
5821                                  * target because our list pointers are not
5822                                  * valid
5823                                  */
5824                                 btrfs_put_block_group(block_group);
5825                                 up_read(&space_info->groups_sem);
5826                         } else {
5827                                 index = get_block_group_index(block_group);
5828                                 goto have_block_group;
5829                         }
5830                 } else if (block_group) {
5831                         btrfs_put_block_group(block_group);
5832                 }
5833         }
5834 search:
5835         have_caching_bg = false;
5836         down_read(&space_info->groups_sem);
5837         list_for_each_entry(block_group, &space_info->block_groups[index],
5838                             list) {
5839                 u64 offset;
5840                 int cached;
5841
5842                 used_block_group = block_group;
5843                 btrfs_get_block_group(block_group);
5844                 search_start = block_group->key.objectid;
5845
5846                 /*
5847                  * this can happen if we end up cycling through all the
5848                  * raid types, but we want to make sure we only allocate
5849                  * for the proper type.
5850                  */
5851                 if (!block_group_bits(block_group, data)) {
5852                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
5853                                 BTRFS_BLOCK_GROUP_RAID1 |
5854                                 BTRFS_BLOCK_GROUP_RAID5 |
5855                                 BTRFS_BLOCK_GROUP_RAID6 |
5856                                 BTRFS_BLOCK_GROUP_RAID10;
5857
5858                         /*
5859                          * if they asked for extra copies and this block group
5860                          * doesn't provide them, bail.  This does allow us to
5861                          * fill raid0 from raid1.
5862                          */
5863                         if ((data & extra) && !(block_group->flags & extra))
5864                                 goto loop;
5865                 }
5866
5867 have_block_group:
5868                 cached = block_group_cache_done(block_group);
5869                 if (unlikely(!cached)) {
5870                         found_uncached_bg = true;
5871                         ret = cache_block_group(block_group, 0);
5872                         BUG_ON(ret < 0);
5873                         ret = 0;
5874                 }
5875
5876                 if (unlikely(block_group->ro))
5877                         goto loop;
5878
5879                 /*
5880                  * Ok we want to try and use the cluster allocator, so
5881                  * lets look there
5882                  */
5883                 if (last_ptr) {
5884                         unsigned long aligned_cluster;
5885                         /*
5886                          * the refill lock keeps out other
5887                          * people trying to start a new cluster
5888                          */
5889                         spin_lock(&last_ptr->refill_lock);
5890                         used_block_group = last_ptr->block_group;
5891                         if (used_block_group != block_group &&
5892                             (!used_block_group ||
5893                              used_block_group->ro ||
5894                              !block_group_bits(used_block_group, data))) {
5895                                 used_block_group = block_group;
5896                                 goto refill_cluster;
5897                         }
5898
5899                         if (used_block_group != block_group)
5900                                 btrfs_get_block_group(used_block_group);
5901
5902                         offset = btrfs_alloc_from_cluster(used_block_group,
5903                           last_ptr, num_bytes, used_block_group->key.objectid);
5904                         if (offset) {
5905                                 /* we have a block, we're done */
5906                                 spin_unlock(&last_ptr->refill_lock);
5907                                 trace_btrfs_reserve_extent_cluster(root,
5908                                         block_group, search_start, num_bytes);
5909                                 goto checks;
5910                         }
5911
5912                         WARN_ON(last_ptr->block_group != used_block_group);
5913                         if (used_block_group != block_group) {
5914                                 btrfs_put_block_group(used_block_group);
5915                                 used_block_group = block_group;
5916                         }
5917 refill_cluster:
5918                         BUG_ON(used_block_group != block_group);
5919                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5920                          * set up a new clusters, so lets just skip it
5921                          * and let the allocator find whatever block
5922                          * it can find.  If we reach this point, we
5923                          * will have tried the cluster allocator
5924                          * plenty of times and not have found
5925                          * anything, so we are likely way too
5926                          * fragmented for the clustering stuff to find
5927                          * anything.
5928                          *
5929                          * However, if the cluster is taken from the
5930                          * current block group, release the cluster
5931                          * first, so that we stand a better chance of
5932                          * succeeding in the unclustered
5933                          * allocation.  */
5934                         if (loop >= LOOP_NO_EMPTY_SIZE &&
5935                             last_ptr->block_group != block_group) {
5936                                 spin_unlock(&last_ptr->refill_lock);
5937                                 goto unclustered_alloc;
5938                         }
5939
5940                         /*
5941                          * this cluster didn't work out, free it and
5942                          * start over
5943                          */
5944                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5945
5946                         if (loop >= LOOP_NO_EMPTY_SIZE) {
5947                                 spin_unlock(&last_ptr->refill_lock);
5948                                 goto unclustered_alloc;
5949                         }
5950
5951                         aligned_cluster = max_t(unsigned long,
5952                                                 empty_cluster + empty_size,
5953                                               block_group->full_stripe_len);
5954
5955                         /* allocate a cluster in this block group */
5956                         ret = btrfs_find_space_cluster(trans, root,
5957                                                block_group, last_ptr,
5958                                                search_start, num_bytes,
5959                                                aligned_cluster);
5960                         if (ret == 0) {
5961                                 /*
5962                                  * now pull our allocation out of this
5963                                  * cluster
5964                                  */
5965                                 offset = btrfs_alloc_from_cluster(block_group,
5966                                                   last_ptr, num_bytes,
5967                                                   search_start);
5968                                 if (offset) {
5969                                         /* we found one, proceed */
5970                                         spin_unlock(&last_ptr->refill_lock);
5971                                         trace_btrfs_reserve_extent_cluster(root,
5972                                                 block_group, search_start,
5973                                                 num_bytes);
5974                                         goto checks;
5975                                 }
5976                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
5977                                    && !failed_cluster_refill) {
5978                                 spin_unlock(&last_ptr->refill_lock);
5979
5980                                 failed_cluster_refill = true;
5981                                 wait_block_group_cache_progress(block_group,
5982                                        num_bytes + empty_cluster + empty_size);
5983                                 goto have_block_group;
5984                         }
5985
5986                         /*
5987                          * at this point we either didn't find a cluster
5988                          * or we weren't able to allocate a block from our
5989                          * cluster.  Free the cluster we've been trying
5990                          * to use, and go to the next block group
5991                          */
5992                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5993                         spin_unlock(&last_ptr->refill_lock);
5994                         goto loop;
5995                 }
5996
5997 unclustered_alloc:
5998                 spin_lock(&block_group->free_space_ctl->tree_lock);
5999                 if (cached &&
6000                     block_group->free_space_ctl->free_space <
6001                     num_bytes + empty_cluster + empty_size) {
6002                         spin_unlock(&block_group->free_space_ctl->tree_lock);
6003                         goto loop;
6004                 }
6005                 spin_unlock(&block_group->free_space_ctl->tree_lock);
6006
6007                 offset = btrfs_find_space_for_alloc(block_group, search_start,
6008                                                     num_bytes, empty_size);
6009                 /*
6010                  * If we didn't find a chunk, and we haven't failed on this
6011                  * block group before, and this block group is in the middle of
6012                  * caching and we are ok with waiting, then go ahead and wait
6013                  * for progress to be made, and set failed_alloc to true.
6014                  *
6015                  * If failed_alloc is true then we've already waited on this
6016                  * block group once and should move on to the next block group.
6017                  */
6018                 if (!offset && !failed_alloc && !cached &&
6019                     loop > LOOP_CACHING_NOWAIT) {
6020                         wait_block_group_cache_progress(block_group,
6021                                                 num_bytes + empty_size);
6022                         failed_alloc = true;
6023                         goto have_block_group;
6024                 } else if (!offset) {
6025                         if (!cached)
6026                                 have_caching_bg = true;
6027                         goto loop;
6028                 }
6029 checks:
6030                 search_start = stripe_align(root, used_block_group,
6031                                             offset, num_bytes);
6032
6033                 /* move on to the next group */
6034                 if (search_start + num_bytes >
6035                     used_block_group->key.objectid + used_block_group->key.offset) {
6036                         btrfs_add_free_space(used_block_group, offset, num_bytes);
6037                         goto loop;
6038                 }
6039
6040                 if (offset < search_start)
6041                         btrfs_add_free_space(used_block_group, offset,
6042                                              search_start - offset);
6043                 BUG_ON(offset > search_start);
6044
6045                 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
6046                                                   alloc_type);
6047                 if (ret == -EAGAIN) {
6048                         btrfs_add_free_space(used_block_group, offset, num_bytes);
6049                         goto loop;
6050                 }
6051
6052                 /* we are all good, lets return */
6053                 ins->objectid = search_start;
6054                 ins->offset = num_bytes;
6055
6056                 trace_btrfs_reserve_extent(orig_root, block_group,
6057                                            search_start, num_bytes);
6058                 if (used_block_group != block_group)
6059                         btrfs_put_block_group(used_block_group);
6060                 btrfs_put_block_group(block_group);
6061                 break;
6062 loop:
6063                 failed_cluster_refill = false;
6064                 failed_alloc = false;
6065                 BUG_ON(index != get_block_group_index(block_group));
6066                 if (used_block_group != block_group)
6067                         btrfs_put_block_group(used_block_group);
6068                 btrfs_put_block_group(block_group);
6069         }
6070         up_read(&space_info->groups_sem);
6071
6072         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
6073                 goto search;
6074
6075         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
6076                 goto search;
6077
6078         /*
6079          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6080          *                      caching kthreads as we move along
6081          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6082          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6083          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6084          *                      again
6085          */
6086         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
6087                 index = 0;
6088                 loop++;
6089                 if (loop == LOOP_ALLOC_CHUNK) {
6090                         ret = do_chunk_alloc(trans, root, data,
6091                                              CHUNK_ALLOC_FORCE);
6092                         /*
6093                          * Do not bail out on ENOSPC since we
6094                          * can do more things.
6095                          */
6096                         if (ret < 0 && ret != -ENOSPC) {
6097                                 btrfs_abort_transaction(trans,
6098                                                         root, ret);
6099                                 goto out;
6100                         }
6101                 }
6102
6103                 if (loop == LOOP_NO_EMPTY_SIZE) {
6104                         empty_size = 0;
6105                         empty_cluster = 0;
6106                 }
6107
6108                 goto search;
6109         } else if (!ins->objectid) {
6110                 ret = -ENOSPC;
6111         } else if (ins->objectid) {
6112                 ret = 0;
6113         }
6114 out:
6115
6116         return ret;
6117 }
6118
6119 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6120                             int dump_block_groups)
6121 {
6122         struct btrfs_block_group_cache *cache;
6123         int index = 0;
6124
6125         spin_lock(&info->lock);
6126         printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
6127                (unsigned long long)info->flags,
6128                (unsigned long long)(info->total_bytes - info->bytes_used -
6129                                     info->bytes_pinned - info->bytes_reserved -
6130                                     info->bytes_readonly),
6131                (info->full) ? "" : "not ");
6132         printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
6133                "reserved=%llu, may_use=%llu, readonly=%llu\n",
6134                (unsigned long long)info->total_bytes,
6135                (unsigned long long)info->bytes_used,
6136                (unsigned long long)info->bytes_pinned,
6137                (unsigned long long)info->bytes_reserved,
6138                (unsigned long long)info->bytes_may_use,
6139                (unsigned long long)info->bytes_readonly);
6140         spin_unlock(&info->lock);
6141
6142         if (!dump_block_groups)
6143                 return;
6144
6145         down_read(&info->groups_sem);
6146 again:
6147         list_for_each_entry(cache, &info->block_groups[index], list) {
6148                 spin_lock(&cache->lock);
6149                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
6150                        (unsigned long long)cache->key.objectid,
6151                        (unsigned long long)cache->key.offset,
6152                        (unsigned long long)btrfs_block_group_used(&cache->item),
6153                        (unsigned long long)cache->pinned,
6154                        (unsigned long long)cache->reserved,
6155                        cache->ro ? "[readonly]" : "");
6156                 btrfs_dump_free_space(cache, bytes);
6157                 spin_unlock(&cache->lock);
6158         }
6159         if (++index < BTRFS_NR_RAID_TYPES)
6160                 goto again;
6161         up_read(&info->groups_sem);
6162 }
6163
6164 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
6165                          struct btrfs_root *root,
6166                          u64 num_bytes, u64 min_alloc_size,
6167                          u64 empty_size, u64 hint_byte,
6168                          struct btrfs_key *ins, u64 data)
6169 {
6170         bool final_tried = false;
6171         int ret;
6172
6173         data = btrfs_get_alloc_profile(root, data);
6174 again:
6175         WARN_ON(num_bytes < root->sectorsize);
6176         ret = find_free_extent(trans, root, num_bytes, empty_size,
6177                                hint_byte, ins, data);
6178
6179         if (ret == -ENOSPC) {
6180                 if (!final_tried) {
6181                         num_bytes = num_bytes >> 1;
6182                         num_bytes = round_down(num_bytes, root->sectorsize);
6183                         num_bytes = max(num_bytes, min_alloc_size);
6184                         if (num_bytes == min_alloc_size)
6185                                 final_tried = true;
6186                         goto again;
6187                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6188                         struct btrfs_space_info *sinfo;
6189
6190                         sinfo = __find_space_info(root->fs_info, data);
6191                         printk(KERN_ERR "btrfs allocation failed flags %llu, "
6192                                "wanted %llu\n", (unsigned long long)data,
6193                                (unsigned long long)num_bytes);
6194                         if (sinfo)
6195                                 dump_space_info(sinfo, num_bytes, 1);
6196                 }
6197         }
6198
6199         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6200
6201         return ret;
6202 }
6203
6204 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6205                                         u64 start, u64 len, int pin)
6206 {
6207         struct btrfs_block_group_cache *cache;
6208         int ret = 0;
6209
6210         cache = btrfs_lookup_block_group(root->fs_info, start);
6211         if (!cache) {
6212                 printk(KERN_ERR "Unable to find block group for %llu\n",
6213                        (unsigned long long)start);
6214                 return -ENOSPC;
6215         }
6216
6217         if (btrfs_test_opt(root, DISCARD))
6218                 ret = btrfs_discard_extent(root, start, len, NULL);
6219
6220         if (pin)
6221                 pin_down_extent(root, cache, start, len, 1);
6222         else {
6223                 btrfs_add_free_space(cache, start, len);
6224                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6225         }
6226         btrfs_put_block_group(cache);
6227
6228         trace_btrfs_reserved_extent_free(root, start, len);
6229
6230         return ret;
6231 }
6232
6233 int btrfs_free_reserved_extent(struct btrfs_root *root,
6234                                         u64 start, u64 len)
6235 {
6236         return __btrfs_free_reserved_extent(root, start, len, 0);
6237 }
6238
6239 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6240                                        u64 start, u64 len)
6241 {
6242         return __btrfs_free_reserved_extent(root, start, len, 1);
6243 }
6244
6245 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6246                                       struct btrfs_root *root,
6247                                       u64 parent, u64 root_objectid,
6248                                       u64 flags, u64 owner, u64 offset,
6249                                       struct btrfs_key *ins, int ref_mod)
6250 {
6251         int ret;
6252         struct btrfs_fs_info *fs_info = root->fs_info;
6253         struct btrfs_extent_item *extent_item;
6254         struct btrfs_extent_inline_ref *iref;
6255         struct btrfs_path *path;
6256         struct extent_buffer *leaf;
6257         int type;
6258         u32 size;
6259
6260         if (parent > 0)
6261                 type = BTRFS_SHARED_DATA_REF_KEY;
6262         else
6263                 type = BTRFS_EXTENT_DATA_REF_KEY;
6264
6265         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6266
6267         path = btrfs_alloc_path();
6268         if (!path)
6269                 return -ENOMEM;
6270
6271         path->leave_spinning = 1;
6272         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6273                                       ins, size);
6274         if (ret) {
6275                 btrfs_free_path(path);
6276                 return ret;
6277         }
6278
6279         leaf = path->nodes[0];
6280         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6281                                      struct btrfs_extent_item);
6282         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6283         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6284         btrfs_set_extent_flags(leaf, extent_item,
6285                                flags | BTRFS_EXTENT_FLAG_DATA);
6286
6287         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6288         btrfs_set_extent_inline_ref_type(leaf, iref, type);
6289         if (parent > 0) {
6290                 struct btrfs_shared_data_ref *ref;
6291                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6292                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6293                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6294         } else {
6295                 struct btrfs_extent_data_ref *ref;
6296                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6297                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6298                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6299                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6300                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6301         }
6302
6303         btrfs_mark_buffer_dirty(path->nodes[0]);
6304         btrfs_free_path(path);
6305
6306         ret = update_block_group(root, ins->objectid, ins->offset, 1);
6307         if (ret) { /* -ENOENT, logic error */
6308                 printk(KERN_ERR "btrfs update block group failed for %llu "
6309                        "%llu\n", (unsigned long long)ins->objectid,
6310                        (unsigned long long)ins->offset);
6311                 BUG();
6312         }
6313         return ret;
6314 }
6315
6316 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6317                                      struct btrfs_root *root,
6318                                      u64 parent, u64 root_objectid,
6319                                      u64 flags, struct btrfs_disk_key *key,
6320                                      int level, struct btrfs_key *ins)
6321 {
6322         int ret;
6323         struct btrfs_fs_info *fs_info = root->fs_info;
6324         struct btrfs_extent_item *extent_item;
6325         struct btrfs_tree_block_info *block_info;
6326         struct btrfs_extent_inline_ref *iref;
6327         struct btrfs_path *path;
6328         struct extent_buffer *leaf;
6329         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6330
6331         path = btrfs_alloc_path();
6332         if (!path)
6333                 return -ENOMEM;
6334
6335         path->leave_spinning = 1;
6336         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6337                                       ins, size);
6338         if (ret) {
6339                 btrfs_free_path(path);
6340                 return ret;
6341         }
6342
6343         leaf = path->nodes[0];
6344         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6345                                      struct btrfs_extent_item);
6346         btrfs_set_extent_refs(leaf, extent_item, 1);
6347         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6348         btrfs_set_extent_flags(leaf, extent_item,
6349                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6350         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6351
6352         btrfs_set_tree_block_key(leaf, block_info, key);
6353         btrfs_set_tree_block_level(leaf, block_info, level);
6354
6355         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6356         if (parent > 0) {
6357                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6358                 btrfs_set_extent_inline_ref_type(leaf, iref,
6359                                                  BTRFS_SHARED_BLOCK_REF_KEY);
6360                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6361         } else {
6362                 btrfs_set_extent_inline_ref_type(leaf, iref,
6363                                                  BTRFS_TREE_BLOCK_REF_KEY);
6364                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6365         }
6366
6367         btrfs_mark_buffer_dirty(leaf);
6368         btrfs_free_path(path);
6369
6370         ret = update_block_group(root, ins->objectid, ins->offset, 1);
6371         if (ret) { /* -ENOENT, logic error */
6372                 printk(KERN_ERR "btrfs update block group failed for %llu "
6373                        "%llu\n", (unsigned long long)ins->objectid,
6374                        (unsigned long long)ins->offset);
6375                 BUG();
6376         }
6377         return ret;
6378 }
6379
6380 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6381                                      struct btrfs_root *root,
6382                                      u64 root_objectid, u64 owner,
6383                                      u64 offset, struct btrfs_key *ins)
6384 {
6385         int ret;
6386
6387         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6388
6389         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6390                                          ins->offset, 0,
6391                                          root_objectid, owner, offset,
6392                                          BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6393         return ret;
6394 }
6395
6396 /*
6397  * this is used by the tree logging recovery code.  It records that
6398  * an extent has been allocated and makes sure to clear the free
6399  * space cache bits as well
6400  */
6401 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6402                                    struct btrfs_root *root,
6403                                    u64 root_objectid, u64 owner, u64 offset,
6404                                    struct btrfs_key *ins)
6405 {
6406         int ret;
6407         struct btrfs_block_group_cache *block_group;
6408         struct btrfs_caching_control *caching_ctl;
6409         u64 start = ins->objectid;
6410         u64 num_bytes = ins->offset;
6411
6412         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6413         cache_block_group(block_group, 0);
6414         caching_ctl = get_caching_control(block_group);
6415
6416         if (!caching_ctl) {
6417                 BUG_ON(!block_group_cache_done(block_group));
6418                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6419                 BUG_ON(ret); /* -ENOMEM */
6420         } else {
6421                 mutex_lock(&caching_ctl->mutex);
6422
6423                 if (start >= caching_ctl->progress) {
6424                         ret = add_excluded_extent(root, start, num_bytes);
6425                         BUG_ON(ret); /* -ENOMEM */
6426                 } else if (start + num_bytes <= caching_ctl->progress) {
6427                         ret = btrfs_remove_free_space(block_group,
6428                                                       start, num_bytes);
6429                         BUG_ON(ret); /* -ENOMEM */
6430                 } else {
6431                         num_bytes = caching_ctl->progress - start;
6432                         ret = btrfs_remove_free_space(block_group,
6433                                                       start, num_bytes);
6434                         BUG_ON(ret); /* -ENOMEM */
6435
6436                         start = caching_ctl->progress;
6437                         num_bytes = ins->objectid + ins->offset -
6438                                     caching_ctl->progress;
6439                         ret = add_excluded_extent(root, start, num_bytes);
6440                         BUG_ON(ret); /* -ENOMEM */
6441                 }
6442
6443                 mutex_unlock(&caching_ctl->mutex);
6444                 put_caching_control(caching_ctl);
6445         }
6446
6447         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6448                                           RESERVE_ALLOC_NO_ACCOUNT);
6449         BUG_ON(ret); /* logic error */
6450         btrfs_put_block_group(block_group);
6451         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6452                                          0, owner, offset, ins, 1);
6453         return ret;
6454 }
6455
6456 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6457                                             struct btrfs_root *root,
6458                                             u64 bytenr, u32 blocksize,
6459                                             int level)
6460 {
6461         struct extent_buffer *buf;
6462
6463         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6464         if (!buf)
6465                 return ERR_PTR(-ENOMEM);
6466         btrfs_set_header_generation(buf, trans->transid);
6467         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6468         btrfs_tree_lock(buf);
6469         clean_tree_block(trans, root, buf);
6470         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6471
6472         btrfs_set_lock_blocking(buf);
6473         btrfs_set_buffer_uptodate(buf);
6474
6475         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6476                 /*
6477                  * we allow two log transactions at a time, use different
6478                  * EXENT bit to differentiate dirty pages.
6479                  */
6480                 if (root->log_transid % 2 == 0)
6481                         set_extent_dirty(&root->dirty_log_pages, buf->start,
6482                                         buf->start + buf->len - 1, GFP_NOFS);
6483                 else
6484                         set_extent_new(&root->dirty_log_pages, buf->start,
6485                                         buf->start + buf->len - 1, GFP_NOFS);
6486         } else {
6487                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6488                          buf->start + buf->len - 1, GFP_NOFS);
6489         }
6490         trans->blocks_used++;
6491         /* this returns a buffer locked for blocking */
6492         return buf;
6493 }
6494
6495 static struct btrfs_block_rsv *
6496 use_block_rsv(struct btrfs_trans_handle *trans,
6497               struct btrfs_root *root, u32 blocksize)
6498 {
6499         struct btrfs_block_rsv *block_rsv;
6500         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6501         int ret;
6502
6503         block_rsv = get_block_rsv(trans, root);
6504
6505         if (block_rsv->size == 0) {
6506                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6507                                              BTRFS_RESERVE_NO_FLUSH);
6508                 /*
6509                  * If we couldn't reserve metadata bytes try and use some from
6510                  * the global reserve.
6511                  */
6512                 if (ret && block_rsv != global_rsv) {
6513                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6514                         if (!ret)
6515                                 return global_rsv;
6516                         return ERR_PTR(ret);
6517                 } else if (ret) {
6518                         return ERR_PTR(ret);
6519                 }
6520                 return block_rsv;
6521         }
6522
6523         ret = block_rsv_use_bytes(block_rsv, blocksize);
6524         if (!ret)
6525                 return block_rsv;
6526         if (ret && !block_rsv->failfast) {
6527                 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6528                         static DEFINE_RATELIMIT_STATE(_rs,
6529                                         DEFAULT_RATELIMIT_INTERVAL * 10,
6530                                         /*DEFAULT_RATELIMIT_BURST*/ 1);
6531                         if (__ratelimit(&_rs))
6532                                 WARN(1, KERN_DEBUG
6533                                         "btrfs: block rsv returned %d\n", ret);
6534                 }
6535                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6536                                              BTRFS_RESERVE_NO_FLUSH);
6537                 if (!ret) {
6538                         return block_rsv;
6539                 } else if (ret && block_rsv != global_rsv) {
6540                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6541                         if (!ret)
6542                                 return global_rsv;
6543                 }
6544         }
6545
6546         return ERR_PTR(-ENOSPC);
6547 }
6548
6549 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6550                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
6551 {
6552         block_rsv_add_bytes(block_rsv, blocksize, 0);
6553         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6554 }
6555
6556 /*
6557  * finds a free extent and does all the dirty work required for allocation
6558  * returns the key for the extent through ins, and a tree buffer for
6559  * the first block of the extent through buf.
6560  *
6561  * returns the tree buffer or NULL.
6562  */
6563 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6564                                         struct btrfs_root *root, u32 blocksize,
6565                                         u64 parent, u64 root_objectid,
6566                                         struct btrfs_disk_key *key, int level,
6567                                         u64 hint, u64 empty_size)
6568 {
6569         struct btrfs_key ins;
6570         struct btrfs_block_rsv *block_rsv;
6571         struct extent_buffer *buf;
6572         u64 flags = 0;
6573         int ret;
6574
6575
6576         block_rsv = use_block_rsv(trans, root, blocksize);
6577         if (IS_ERR(block_rsv))
6578                 return ERR_CAST(block_rsv);
6579
6580         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6581                                    empty_size, hint, &ins, 0);
6582         if (ret) {
6583                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6584                 return ERR_PTR(ret);
6585         }
6586
6587         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6588                                     blocksize, level);
6589         BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6590
6591         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6592                 if (parent == 0)
6593                         parent = ins.objectid;
6594                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6595         } else
6596                 BUG_ON(parent > 0);
6597
6598         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6599                 struct btrfs_delayed_extent_op *extent_op;
6600                 extent_op = btrfs_alloc_delayed_extent_op();
6601                 BUG_ON(!extent_op); /* -ENOMEM */
6602                 if (key)
6603                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
6604                 else
6605                         memset(&extent_op->key, 0, sizeof(extent_op->key));
6606                 extent_op->flags_to_set = flags;
6607                 extent_op->update_key = 1;
6608                 extent_op->update_flags = 1;
6609                 extent_op->is_data = 0;
6610
6611                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6612                                         ins.objectid,
6613                                         ins.offset, parent, root_objectid,
6614                                         level, BTRFS_ADD_DELAYED_EXTENT,
6615                                         extent_op, 0);
6616                 BUG_ON(ret); /* -ENOMEM */
6617         }
6618         return buf;
6619 }
6620
6621 struct walk_control {
6622         u64 refs[BTRFS_MAX_LEVEL];
6623         u64 flags[BTRFS_MAX_LEVEL];
6624         struct btrfs_key update_progress;
6625         int stage;
6626         int level;
6627         int shared_level;
6628         int update_ref;
6629         int keep_locks;
6630         int reada_slot;
6631         int reada_count;
6632         int for_reloc;
6633 };
6634
6635 #define DROP_REFERENCE  1
6636 #define UPDATE_BACKREF  2
6637
6638 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6639                                      struct btrfs_root *root,
6640                                      struct walk_control *wc,
6641                                      struct btrfs_path *path)
6642 {
6643         u64 bytenr;
6644         u64 generation;
6645         u64 refs;
6646         u64 flags;
6647         u32 nritems;
6648         u32 blocksize;
6649         struct btrfs_key key;
6650         struct extent_buffer *eb;
6651         int ret;
6652         int slot;
6653         int nread = 0;
6654
6655         if (path->slots[wc->level] < wc->reada_slot) {
6656                 wc->reada_count = wc->reada_count * 2 / 3;
6657                 wc->reada_count = max(wc->reada_count, 2);
6658         } else {
6659                 wc->reada_count = wc->reada_count * 3 / 2;
6660                 wc->reada_count = min_t(int, wc->reada_count,
6661                                         BTRFS_NODEPTRS_PER_BLOCK(root));
6662         }
6663
6664         eb = path->nodes[wc->level];
6665         nritems = btrfs_header_nritems(eb);
6666         blocksize = btrfs_level_size(root, wc->level - 1);
6667
6668         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6669                 if (nread >= wc->reada_count)
6670                         break;
6671
6672                 cond_resched();
6673                 bytenr = btrfs_node_blockptr(eb, slot);
6674                 generation = btrfs_node_ptr_generation(eb, slot);
6675
6676                 if (slot == path->slots[wc->level])
6677                         goto reada;
6678
6679                 if (wc->stage == UPDATE_BACKREF &&
6680                     generation <= root->root_key.offset)
6681                         continue;
6682
6683                 /* We don't lock the tree block, it's OK to be racy here */
6684                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6685                                                &refs, &flags);
6686                 /* We don't care about errors in readahead. */
6687                 if (ret < 0)
6688                         continue;
6689                 BUG_ON(refs == 0);
6690
6691                 if (wc->stage == DROP_REFERENCE) {
6692                         if (refs == 1)
6693                                 goto reada;
6694
6695                         if (wc->level == 1 &&
6696                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6697                                 continue;
6698                         if (!wc->update_ref ||
6699                             generation <= root->root_key.offset)
6700                                 continue;
6701                         btrfs_node_key_to_cpu(eb, &key, slot);
6702                         ret = btrfs_comp_cpu_keys(&key,
6703                                                   &wc->update_progress);
6704                         if (ret < 0)
6705                                 continue;
6706                 } else {
6707                         if (wc->level == 1 &&
6708                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6709                                 continue;
6710                 }
6711 reada:
6712                 ret = readahead_tree_block(root, bytenr, blocksize,
6713                                            generation);
6714                 if (ret)
6715                         break;
6716                 nread++;
6717         }
6718         wc->reada_slot = slot;
6719 }
6720
6721 /*
6722  * helper to process tree block while walking down the tree.
6723  *
6724  * when wc->stage == UPDATE_BACKREF, this function updates
6725  * back refs for pointers in the block.
6726  *
6727  * NOTE: return value 1 means we should stop walking down.
6728  */
6729 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6730                                    struct btrfs_root *root,
6731                                    struct btrfs_path *path,
6732                                    struct walk_control *wc, int lookup_info)
6733 {
6734         int level = wc->level;
6735         struct extent_buffer *eb = path->nodes[level];
6736         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6737         int ret;
6738
6739         if (wc->stage == UPDATE_BACKREF &&
6740             btrfs_header_owner(eb) != root->root_key.objectid)
6741                 return 1;
6742
6743         /*
6744          * when reference count of tree block is 1, it won't increase
6745          * again. once full backref flag is set, we never clear it.
6746          */
6747         if (lookup_info &&
6748             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6749              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6750                 BUG_ON(!path->locks[level]);
6751                 ret = btrfs_lookup_extent_info(trans, root,
6752                                                eb->start, eb->len,
6753                                                &wc->refs[level],
6754                                                &wc->flags[level]);
6755                 BUG_ON(ret == -ENOMEM);
6756                 if (ret)
6757                         return ret;
6758                 BUG_ON(wc->refs[level] == 0);
6759         }
6760
6761         if (wc->stage == DROP_REFERENCE) {
6762                 if (wc->refs[level] > 1)
6763                         return 1;
6764
6765                 if (path->locks[level] && !wc->keep_locks) {
6766                         btrfs_tree_unlock_rw(eb, path->locks[level]);
6767                         path->locks[level] = 0;
6768                 }
6769                 return 0;
6770         }
6771
6772         /* wc->stage == UPDATE_BACKREF */
6773         if (!(wc->flags[level] & flag)) {
6774                 BUG_ON(!path->locks[level]);
6775                 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6776                 BUG_ON(ret); /* -ENOMEM */
6777                 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6778                 BUG_ON(ret); /* -ENOMEM */
6779                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6780                                                   eb->len, flag, 0);
6781                 BUG_ON(ret); /* -ENOMEM */
6782                 wc->flags[level] |= flag;
6783         }
6784
6785         /*
6786          * the block is shared by multiple trees, so it's not good to
6787          * keep the tree lock
6788          */
6789         if (path->locks[level] && level > 0) {
6790                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6791                 path->locks[level] = 0;
6792         }
6793         return 0;
6794 }
6795
6796 /*
6797  * hepler to process tree block pointer.
6798  *
6799  * when wc->stage == DROP_REFERENCE, this function checks
6800  * reference count of the block pointed to. if the block
6801  * is shared and we need update back refs for the subtree
6802  * rooted at the block, this function changes wc->stage to
6803  * UPDATE_BACKREF. if the block is shared and there is no
6804  * need to update back, this function drops the reference
6805  * to the block.
6806  *
6807  * NOTE: return value 1 means we should stop walking down.
6808  */
6809 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6810                                  struct btrfs_root *root,
6811                                  struct btrfs_path *path,
6812                                  struct walk_control *wc, int *lookup_info)
6813 {
6814         u64 bytenr;
6815         u64 generation;
6816         u64 parent;
6817         u32 blocksize;
6818         struct btrfs_key key;
6819         struct extent_buffer *next;
6820         int level = wc->level;
6821         int reada = 0;
6822         int ret = 0;
6823
6824         generation = btrfs_node_ptr_generation(path->nodes[level],
6825                                                path->slots[level]);
6826         /*
6827          * if the lower level block was created before the snapshot
6828          * was created, we know there is no need to update back refs
6829          * for the subtree
6830          */
6831         if (wc->stage == UPDATE_BACKREF &&
6832             generation <= root->root_key.offset) {
6833                 *lookup_info = 1;
6834                 return 1;
6835         }
6836
6837         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6838         blocksize = btrfs_level_size(root, level - 1);
6839
6840         next = btrfs_find_tree_block(root, bytenr, blocksize);
6841         if (!next) {
6842                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6843                 if (!next)
6844                         return -ENOMEM;
6845                 reada = 1;
6846         }
6847         btrfs_tree_lock(next);
6848         btrfs_set_lock_blocking(next);
6849
6850         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6851                                        &wc->refs[level - 1],
6852                                        &wc->flags[level - 1]);
6853         if (ret < 0) {
6854                 btrfs_tree_unlock(next);
6855                 return ret;
6856         }
6857
6858         BUG_ON(wc->refs[level - 1] == 0);
6859         *lookup_info = 0;
6860
6861         if (wc->stage == DROP_REFERENCE) {
6862                 if (wc->refs[level - 1] > 1) {
6863                         if (level == 1 &&
6864                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6865                                 goto skip;
6866
6867                         if (!wc->update_ref ||
6868                             generation <= root->root_key.offset)
6869                                 goto skip;
6870
6871                         btrfs_node_key_to_cpu(path->nodes[level], &key,
6872                                               path->slots[level]);
6873                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6874                         if (ret < 0)
6875                                 goto skip;
6876
6877                         wc->stage = UPDATE_BACKREF;
6878                         wc->shared_level = level - 1;
6879                 }
6880         } else {
6881                 if (level == 1 &&
6882                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6883                         goto skip;
6884         }
6885
6886         if (!btrfs_buffer_uptodate(next, generation, 0)) {
6887                 btrfs_tree_unlock(next);
6888                 free_extent_buffer(next);
6889                 next = NULL;
6890                 *lookup_info = 1;
6891         }
6892
6893         if (!next) {
6894                 if (reada && level == 1)
6895                         reada_walk_down(trans, root, wc, path);
6896                 next = read_tree_block(root, bytenr, blocksize, generation);
6897                 if (!next)
6898                         return -EIO;
6899                 btrfs_tree_lock(next);
6900                 btrfs_set_lock_blocking(next);
6901         }
6902
6903         level--;
6904         BUG_ON(level != btrfs_header_level(next));
6905         path->nodes[level] = next;
6906         path->slots[level] = 0;
6907         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6908         wc->level = level;
6909         if (wc->level == 1)
6910                 wc->reada_slot = 0;
6911         return 0;
6912 skip:
6913         wc->refs[level - 1] = 0;
6914         wc->flags[level - 1] = 0;
6915         if (wc->stage == DROP_REFERENCE) {
6916                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6917                         parent = path->nodes[level]->start;
6918                 } else {
6919                         BUG_ON(root->root_key.objectid !=
6920                                btrfs_header_owner(path->nodes[level]));
6921                         parent = 0;
6922                 }
6923
6924                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6925                                 root->root_key.objectid, level - 1, 0, 0);
6926                 BUG_ON(ret); /* -ENOMEM */
6927         }
6928         btrfs_tree_unlock(next);
6929         free_extent_buffer(next);
6930         *lookup_info = 1;
6931         return 1;
6932 }
6933
6934 /*
6935  * hepler to process tree block while walking up the tree.
6936  *
6937  * when wc->stage == DROP_REFERENCE, this function drops
6938  * reference count on the block.
6939  *
6940  * when wc->stage == UPDATE_BACKREF, this function changes
6941  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6942  * to UPDATE_BACKREF previously while processing the block.
6943  *
6944  * NOTE: return value 1 means we should stop walking up.
6945  */
6946 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6947                                  struct btrfs_root *root,
6948                                  struct btrfs_path *path,
6949                                  struct walk_control *wc)
6950 {
6951         int ret;
6952         int level = wc->level;
6953         struct extent_buffer *eb = path->nodes[level];
6954         u64 parent = 0;
6955
6956         if (wc->stage == UPDATE_BACKREF) {
6957                 BUG_ON(wc->shared_level < level);
6958                 if (level < wc->shared_level)
6959                         goto out;
6960
6961                 ret = find_next_key(path, level + 1, &wc->update_progress);
6962                 if (ret > 0)
6963                         wc->update_ref = 0;
6964
6965                 wc->stage = DROP_REFERENCE;
6966                 wc->shared_level = -1;
6967                 path->slots[level] = 0;
6968
6969                 /*
6970                  * check reference count again if the block isn't locked.
6971                  * we should start walking down the tree again if reference
6972                  * count is one.
6973                  */
6974                 if (!path->locks[level]) {
6975                         BUG_ON(level == 0);
6976                         btrfs_tree_lock(eb);
6977                         btrfs_set_lock_blocking(eb);
6978                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6979
6980                         ret = btrfs_lookup_extent_info(trans, root,
6981                                                        eb->start, eb->len,
6982                                                        &wc->refs[level],
6983                                                        &wc->flags[level]);
6984                         if (ret < 0) {
6985                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6986                                 path->locks[level] = 0;
6987                                 return ret;
6988                         }
6989                         BUG_ON(wc->refs[level] == 0);
6990                         if (wc->refs[level] == 1) {
6991                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6992                                 path->locks[level] = 0;
6993                                 return 1;
6994                         }
6995                 }
6996         }
6997
6998         /* wc->stage == DROP_REFERENCE */
6999         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
7000
7001         if (wc->refs[level] == 1) {
7002                 if (level == 0) {
7003                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7004                                 ret = btrfs_dec_ref(trans, root, eb, 1,
7005                                                     wc->for_reloc);
7006                         else
7007                                 ret = btrfs_dec_ref(trans, root, eb, 0,
7008                                                     wc->for_reloc);
7009                         BUG_ON(ret); /* -ENOMEM */
7010                 }
7011                 /* make block locked assertion in clean_tree_block happy */
7012                 if (!path->locks[level] &&
7013                     btrfs_header_generation(eb) == trans->transid) {
7014                         btrfs_tree_lock(eb);
7015                         btrfs_set_lock_blocking(eb);
7016                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7017                 }
7018                 clean_tree_block(trans, root, eb);
7019         }
7020
7021         if (eb == root->node) {
7022                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7023                         parent = eb->start;
7024                 else
7025                         BUG_ON(root->root_key.objectid !=
7026                                btrfs_header_owner(eb));
7027         } else {
7028                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7029                         parent = path->nodes[level + 1]->start;
7030                 else
7031                         BUG_ON(root->root_key.objectid !=
7032                                btrfs_header_owner(path->nodes[level + 1]));
7033         }
7034
7035         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
7036 out:
7037         wc->refs[level] = 0;
7038         wc->flags[level] = 0;
7039         return 0;
7040 }
7041
7042 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7043                                    struct btrfs_root *root,
7044                                    struct btrfs_path *path,
7045                                    struct walk_control *wc)
7046 {
7047         int level = wc->level;
7048         int lookup_info = 1;
7049         int ret;
7050
7051         while (level >= 0) {
7052                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
7053                 if (ret > 0)
7054                         break;
7055
7056                 if (level == 0)
7057                         break;
7058
7059                 if (path->slots[level] >=
7060                     btrfs_header_nritems(path->nodes[level]))
7061                         break;
7062
7063                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
7064                 if (ret > 0) {
7065                         path->slots[level]++;
7066                         continue;
7067                 } else if (ret < 0)
7068                         return ret;
7069                 level = wc->level;
7070         }
7071         return 0;
7072 }
7073
7074 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
7075                                  struct btrfs_root *root,
7076                                  struct btrfs_path *path,
7077                                  struct walk_control *wc, int max_level)
7078 {
7079         int level = wc->level;
7080         int ret;
7081
7082         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7083         while (level < max_level && path->nodes[level]) {
7084                 wc->level = level;
7085                 if (path->slots[level] + 1 <
7086                     btrfs_header_nritems(path->nodes[level])) {
7087                         path->slots[level]++;
7088                         return 0;
7089                 } else {
7090                         ret = walk_up_proc(trans, root, path, wc);
7091                         if (ret > 0)
7092                                 return 0;
7093
7094                         if (path->locks[level]) {
7095                                 btrfs_tree_unlock_rw(path->nodes[level],
7096                                                      path->locks[level]);
7097                                 path->locks[level] = 0;
7098                         }
7099                         free_extent_buffer(path->nodes[level]);
7100                         path->nodes[level] = NULL;
7101                         level++;
7102                 }
7103         }
7104         return 1;
7105 }
7106
7107 /*
7108  * drop a subvolume tree.
7109  *
7110  * this function traverses the tree freeing any blocks that only
7111  * referenced by the tree.
7112  *
7113  * when a shared tree block is found. this function decreases its
7114  * reference count by one. if update_ref is true, this function
7115  * also make sure backrefs for the shared block and all lower level
7116  * blocks are properly updated.
7117  */
7118 int btrfs_drop_snapshot(struct btrfs_root *root,
7119                          struct btrfs_block_rsv *block_rsv, int update_ref,
7120                          int for_reloc)
7121 {
7122         struct btrfs_path *path;
7123         struct btrfs_trans_handle *trans;
7124         struct btrfs_root *tree_root = root->fs_info->tree_root;
7125         struct btrfs_root_item *root_item = &root->root_item;
7126         struct walk_control *wc;
7127         struct btrfs_key key;
7128         int err = 0;
7129         int ret;
7130         int level;
7131
7132         path = btrfs_alloc_path();
7133         if (!path) {
7134                 err = -ENOMEM;
7135                 goto out;
7136         }
7137
7138         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7139         if (!wc) {
7140                 btrfs_free_path(path);
7141                 err = -ENOMEM;
7142                 goto out;
7143         }
7144
7145         trans = btrfs_start_transaction(tree_root, 0);
7146         if (IS_ERR(trans)) {
7147                 err = PTR_ERR(trans);
7148                 goto out_free;
7149         }
7150
7151         if (block_rsv)
7152                 trans->block_rsv = block_rsv;
7153
7154         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7155                 level = btrfs_header_level(root->node);
7156                 path->nodes[level] = btrfs_lock_root_node(root);
7157                 btrfs_set_lock_blocking(path->nodes[level]);
7158                 path->slots[level] = 0;
7159                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7160                 memset(&wc->update_progress, 0,
7161                        sizeof(wc->update_progress));
7162         } else {
7163                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7164                 memcpy(&wc->update_progress, &key,
7165                        sizeof(wc->update_progress));
7166
7167                 level = root_item->drop_level;
7168                 BUG_ON(level == 0);
7169                 path->lowest_level = level;
7170                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7171                 path->lowest_level = 0;
7172                 if (ret < 0) {
7173                         err = ret;
7174                         goto out_end_trans;
7175                 }
7176                 WARN_ON(ret > 0);
7177
7178                 /*
7179                  * unlock our path, this is safe because only this
7180                  * function is allowed to delete this snapshot
7181                  */
7182                 btrfs_unlock_up_safe(path, 0);
7183
7184                 level = btrfs_header_level(root->node);
7185                 while (1) {
7186                         btrfs_tree_lock(path->nodes[level]);
7187                         btrfs_set_lock_blocking(path->nodes[level]);
7188
7189                         ret = btrfs_lookup_extent_info(trans, root,
7190                                                 path->nodes[level]->start,
7191                                                 path->nodes[level]->len,
7192                                                 &wc->refs[level],
7193                                                 &wc->flags[level]);
7194                         if (ret < 0) {
7195                                 err = ret;
7196                                 goto out_end_trans;
7197                         }
7198                         BUG_ON(wc->refs[level] == 0);
7199
7200                         if (level == root_item->drop_level)
7201                                 break;
7202
7203                         btrfs_tree_unlock(path->nodes[level]);
7204                         WARN_ON(wc->refs[level] != 1);
7205                         level--;
7206                 }
7207         }
7208
7209         wc->level = level;
7210         wc->shared_level = -1;
7211         wc->stage = DROP_REFERENCE;
7212         wc->update_ref = update_ref;
7213         wc->keep_locks = 0;
7214         wc->for_reloc = for_reloc;
7215         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7216
7217         while (1) {
7218                 ret = walk_down_tree(trans, root, path, wc);
7219                 if (ret < 0) {
7220                         err = ret;
7221                         break;
7222                 }
7223
7224                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7225                 if (ret < 0) {
7226                         err = ret;
7227                         break;
7228                 }
7229
7230                 if (ret > 0) {
7231                         BUG_ON(wc->stage != DROP_REFERENCE);
7232                         break;
7233                 }
7234
7235                 if (wc->stage == DROP_REFERENCE) {
7236                         level = wc->level;
7237                         btrfs_node_key(path->nodes[level],
7238                                        &root_item->drop_progress,
7239                                        path->slots[level]);
7240                         root_item->drop_level = level;
7241                 }
7242
7243                 BUG_ON(wc->level == 0);
7244                 if (btrfs_should_end_transaction(trans, tree_root)) {
7245                         ret = btrfs_update_root(trans, tree_root,
7246                                                 &root->root_key,
7247                                                 root_item);
7248                         if (ret) {
7249                                 btrfs_abort_transaction(trans, tree_root, ret);
7250                                 err = ret;
7251                                 goto out_end_trans;
7252                         }
7253
7254                         btrfs_end_transaction_throttle(trans, tree_root);
7255                         trans = btrfs_start_transaction(tree_root, 0);
7256                         if (IS_ERR(trans)) {
7257                                 err = PTR_ERR(trans);
7258                                 goto out_free;
7259                         }
7260                         if (block_rsv)
7261                                 trans->block_rsv = block_rsv;
7262                 }
7263         }
7264         btrfs_release_path(path);
7265         if (err)
7266                 goto out_end_trans;
7267
7268         ret = btrfs_del_root(trans, tree_root, &root->root_key);
7269         if (ret) {
7270                 btrfs_abort_transaction(trans, tree_root, ret);
7271                 goto out_end_trans;
7272         }
7273
7274         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7275                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7276                                            NULL, NULL);
7277                 if (ret < 0) {
7278                         btrfs_abort_transaction(trans, tree_root, ret);
7279                         err = ret;
7280                         goto out_end_trans;
7281                 } else if (ret > 0) {
7282                         /* if we fail to delete the orphan item this time
7283                          * around, it'll get picked up the next time.
7284                          *
7285                          * The most common failure here is just -ENOENT.
7286                          */
7287                         btrfs_del_orphan_item(trans, tree_root,
7288                                               root->root_key.objectid);
7289                 }
7290         }
7291
7292         if (root->in_radix) {
7293                 btrfs_free_fs_root(tree_root->fs_info, root);
7294         } else {
7295                 free_extent_buffer(root->node);
7296                 free_extent_buffer(root->commit_root);
7297                 kfree(root);
7298         }
7299 out_end_trans:
7300         btrfs_end_transaction_throttle(trans, tree_root);
7301 out_free:
7302         kfree(wc);
7303         btrfs_free_path(path);
7304 out:
7305         if (err)
7306                 btrfs_std_error(root->fs_info, err);
7307         return err;
7308 }
7309
7310 /*
7311  * drop subtree rooted at tree block 'node'.
7312  *
7313  * NOTE: this function will unlock and release tree block 'node'
7314  * only used by relocation code
7315  */
7316 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7317                         struct btrfs_root *root,
7318                         struct extent_buffer *node,
7319                         struct extent_buffer *parent)
7320 {
7321         struct btrfs_path *path;
7322         struct walk_control *wc;
7323         int level;
7324         int parent_level;
7325         int ret = 0;
7326         int wret;
7327
7328         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7329
7330         path = btrfs_alloc_path();
7331         if (!path)
7332                 return -ENOMEM;
7333
7334         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7335         if (!wc) {
7336                 btrfs_free_path(path);
7337                 return -ENOMEM;
7338         }
7339
7340         btrfs_assert_tree_locked(parent);
7341         parent_level = btrfs_header_level(parent);
7342         extent_buffer_get(parent);
7343         path->nodes[parent_level] = parent;
7344         path->slots[parent_level] = btrfs_header_nritems(parent);
7345
7346         btrfs_assert_tree_locked(node);
7347         level = btrfs_header_level(node);
7348         path->nodes[level] = node;
7349         path->slots[level] = 0;
7350         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7351
7352         wc->refs[parent_level] = 1;
7353         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7354         wc->level = level;
7355         wc->shared_level = -1;
7356         wc->stage = DROP_REFERENCE;
7357         wc->update_ref = 0;
7358         wc->keep_locks = 1;
7359         wc->for_reloc = 1;
7360         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7361
7362         while (1) {
7363                 wret = walk_down_tree(trans, root, path, wc);
7364                 if (wret < 0) {
7365                         ret = wret;
7366                         break;
7367                 }
7368
7369                 wret = walk_up_tree(trans, root, path, wc, parent_level);
7370                 if (wret < 0)
7371                         ret = wret;
7372                 if (wret != 0)
7373                         break;
7374         }
7375
7376         kfree(wc);
7377         btrfs_free_path(path);
7378         return ret;
7379 }
7380
7381 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7382 {
7383         u64 num_devices;
7384         u64 stripped;
7385
7386         /*
7387          * if restripe for this chunk_type is on pick target profile and
7388          * return, otherwise do the usual balance
7389          */
7390         stripped = get_restripe_target(root->fs_info, flags);
7391         if (stripped)
7392                 return extended_to_chunk(stripped);
7393
7394         /*
7395          * we add in the count of missing devices because we want
7396          * to make sure that any RAID levels on a degraded FS
7397          * continue to be honored.
7398          */
7399         num_devices = root->fs_info->fs_devices->rw_devices +
7400                 root->fs_info->fs_devices->missing_devices;
7401
7402         stripped = BTRFS_BLOCK_GROUP_RAID0 |
7403                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7404                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7405
7406         if (num_devices == 1) {
7407                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7408                 stripped = flags & ~stripped;
7409
7410                 /* turn raid0 into single device chunks */
7411                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7412                         return stripped;
7413
7414                 /* turn mirroring into duplication */
7415                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7416                              BTRFS_BLOCK_GROUP_RAID10))
7417                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7418         } else {
7419                 /* they already had raid on here, just return */
7420                 if (flags & stripped)
7421                         return flags;
7422
7423                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7424                 stripped = flags & ~stripped;
7425
7426                 /* switch duplicated blocks with raid1 */
7427                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7428                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7429
7430                 /* this is drive concat, leave it alone */
7431         }
7432
7433         return flags;
7434 }
7435
7436 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7437 {
7438         struct btrfs_space_info *sinfo = cache->space_info;
7439         u64 num_bytes;
7440         u64 min_allocable_bytes;
7441         int ret = -ENOSPC;
7442
7443
7444         /*
7445          * We need some metadata space and system metadata space for
7446          * allocating chunks in some corner cases until we force to set
7447          * it to be readonly.
7448          */
7449         if ((sinfo->flags &
7450              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7451             !force)
7452                 min_allocable_bytes = 1 * 1024 * 1024;
7453         else
7454                 min_allocable_bytes = 0;
7455
7456         spin_lock(&sinfo->lock);
7457         spin_lock(&cache->lock);
7458
7459         if (cache->ro) {
7460                 ret = 0;
7461                 goto out;
7462         }
7463
7464         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7465                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7466
7467         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7468             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7469             min_allocable_bytes <= sinfo->total_bytes) {
7470                 sinfo->bytes_readonly += num_bytes;
7471                 cache->ro = 1;
7472                 ret = 0;
7473         }
7474 out:
7475         spin_unlock(&cache->lock);
7476         spin_unlock(&sinfo->lock);
7477         return ret;
7478 }
7479
7480 int btrfs_set_block_group_ro(struct btrfs_root *root,
7481                              struct btrfs_block_group_cache *cache)
7482
7483 {
7484         struct btrfs_trans_handle *trans;
7485         u64 alloc_flags;
7486         int ret;
7487
7488         BUG_ON(cache->ro);
7489
7490         trans = btrfs_join_transaction(root);
7491         if (IS_ERR(trans))
7492                 return PTR_ERR(trans);
7493
7494         alloc_flags = update_block_group_flags(root, cache->flags);
7495         if (alloc_flags != cache->flags) {
7496                 ret = do_chunk_alloc(trans, root, alloc_flags,
7497                                      CHUNK_ALLOC_FORCE);
7498                 if (ret < 0)
7499                         goto out;
7500         }
7501
7502         ret = set_block_group_ro(cache, 0);
7503         if (!ret)
7504                 goto out;
7505         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7506         ret = do_chunk_alloc(trans, root, alloc_flags,
7507                              CHUNK_ALLOC_FORCE);
7508         if (ret < 0)
7509                 goto out;
7510         ret = set_block_group_ro(cache, 0);
7511 out:
7512         btrfs_end_transaction(trans, root);
7513         return ret;
7514 }
7515
7516 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7517                             struct btrfs_root *root, u64 type)
7518 {
7519         u64 alloc_flags = get_alloc_profile(root, type);
7520         return do_chunk_alloc(trans, root, alloc_flags,
7521                               CHUNK_ALLOC_FORCE);
7522 }
7523
7524 /*
7525  * helper to account the unused space of all the readonly block group in the
7526  * list. takes mirrors into account.
7527  */
7528 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7529 {
7530         struct btrfs_block_group_cache *block_group;
7531         u64 free_bytes = 0;
7532         int factor;
7533
7534         list_for_each_entry(block_group, groups_list, list) {
7535                 spin_lock(&block_group->lock);
7536
7537                 if (!block_group->ro) {
7538                         spin_unlock(&block_group->lock);
7539                         continue;
7540                 }
7541
7542                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7543                                           BTRFS_BLOCK_GROUP_RAID10 |
7544                                           BTRFS_BLOCK_GROUP_DUP))
7545                         factor = 2;
7546                 else
7547                         factor = 1;
7548
7549                 free_bytes += (block_group->key.offset -
7550                                btrfs_block_group_used(&block_group->item)) *
7551                                factor;
7552
7553                 spin_unlock(&block_group->lock);
7554         }
7555
7556         return free_bytes;
7557 }
7558
7559 /*
7560  * helper to account the unused space of all the readonly block group in the
7561  * space_info. takes mirrors into account.
7562  */
7563 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7564 {
7565         int i;
7566         u64 free_bytes = 0;
7567
7568         spin_lock(&sinfo->lock);
7569
7570         for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7571                 if (!list_empty(&sinfo->block_groups[i]))
7572                         free_bytes += __btrfs_get_ro_block_group_free_space(
7573                                                 &sinfo->block_groups[i]);
7574
7575         spin_unlock(&sinfo->lock);
7576
7577         return free_bytes;
7578 }
7579
7580 void btrfs_set_block_group_rw(struct btrfs_root *root,
7581                               struct btrfs_block_group_cache *cache)
7582 {
7583         struct btrfs_space_info *sinfo = cache->space_info;
7584         u64 num_bytes;
7585
7586         BUG_ON(!cache->ro);
7587
7588         spin_lock(&sinfo->lock);
7589         spin_lock(&cache->lock);
7590         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7591                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7592         sinfo->bytes_readonly -= num_bytes;
7593         cache->ro = 0;
7594         spin_unlock(&cache->lock);
7595         spin_unlock(&sinfo->lock);
7596 }
7597
7598 /*
7599  * checks to see if its even possible to relocate this block group.
7600  *
7601  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
7602  * ok to go ahead and try.
7603  */
7604 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7605 {
7606         struct btrfs_block_group_cache *block_group;
7607         struct btrfs_space_info *space_info;
7608         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7609         struct btrfs_device *device;
7610         u64 min_free;
7611         u64 dev_min = 1;
7612         u64 dev_nr = 0;
7613         u64 target;
7614         int index;
7615         int full = 0;
7616         int ret = 0;
7617
7618         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7619
7620         /* odd, couldn't find the block group, leave it alone */
7621         if (!block_group)
7622                 return -1;
7623
7624         min_free = btrfs_block_group_used(&block_group->item);
7625
7626         /* no bytes used, we're good */
7627         if (!min_free)
7628                 goto out;
7629
7630         space_info = block_group->space_info;
7631         spin_lock(&space_info->lock);
7632
7633         full = space_info->full;
7634
7635         /*
7636          * if this is the last block group we have in this space, we can't
7637          * relocate it unless we're able to allocate a new chunk below.
7638          *
7639          * Otherwise, we need to make sure we have room in the space to handle
7640          * all of the extents from this block group.  If we can, we're good
7641          */
7642         if ((space_info->total_bytes != block_group->key.offset) &&
7643             (space_info->bytes_used + space_info->bytes_reserved +
7644              space_info->bytes_pinned + space_info->bytes_readonly +
7645              min_free < space_info->total_bytes)) {
7646                 spin_unlock(&space_info->lock);
7647                 goto out;
7648         }
7649         spin_unlock(&space_info->lock);
7650
7651         /*
7652          * ok we don't have enough space, but maybe we have free space on our
7653          * devices to allocate new chunks for relocation, so loop through our
7654          * alloc devices and guess if we have enough space.  if this block
7655          * group is going to be restriped, run checks against the target
7656          * profile instead of the current one.
7657          */
7658         ret = -1;
7659
7660         /*
7661          * index:
7662          *      0: raid10
7663          *      1: raid1
7664          *      2: dup
7665          *      3: raid0
7666          *      4: single
7667          */
7668         target = get_restripe_target(root->fs_info, block_group->flags);
7669         if (target) {
7670                 index = __get_raid_index(extended_to_chunk(target));
7671         } else {
7672                 /*
7673                  * this is just a balance, so if we were marked as full
7674                  * we know there is no space for a new chunk
7675                  */
7676                 if (full)
7677                         goto out;
7678
7679                 index = get_block_group_index(block_group);
7680         }
7681
7682         if (index == BTRFS_RAID_RAID10) {
7683                 dev_min = 4;
7684                 /* Divide by 2 */
7685                 min_free >>= 1;
7686         } else if (index == BTRFS_RAID_RAID1) {
7687                 dev_min = 2;
7688         } else if (index == BTRFS_RAID_DUP) {
7689                 /* Multiply by 2 */
7690                 min_free <<= 1;
7691         } else if (index == BTRFS_RAID_RAID0) {
7692                 dev_min = fs_devices->rw_devices;
7693                 do_div(min_free, dev_min);
7694         }
7695
7696         mutex_lock(&root->fs_info->chunk_mutex);
7697         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7698                 u64 dev_offset;
7699
7700                 /*
7701                  * check to make sure we can actually find a chunk with enough
7702                  * space to fit our block group in.
7703                  */
7704                 if (device->total_bytes > device->bytes_used + min_free &&
7705                     !device->is_tgtdev_for_dev_replace) {
7706                         ret = find_free_dev_extent(device, min_free,
7707                                                    &dev_offset, NULL);
7708                         if (!ret)
7709                                 dev_nr++;
7710
7711                         if (dev_nr >= dev_min)
7712                                 break;
7713
7714                         ret = -1;
7715                 }
7716         }
7717         mutex_unlock(&root->fs_info->chunk_mutex);
7718 out:
7719         btrfs_put_block_group(block_group);
7720         return ret;
7721 }
7722
7723 static int find_first_block_group(struct btrfs_root *root,
7724                 struct btrfs_path *path, struct btrfs_key *key)
7725 {
7726         int ret = 0;
7727         struct btrfs_key found_key;
7728         struct extent_buffer *leaf;
7729         int slot;
7730
7731         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7732         if (ret < 0)
7733                 goto out;
7734
7735         while (1) {
7736                 slot = path->slots[0];
7737                 leaf = path->nodes[0];
7738                 if (slot >= btrfs_header_nritems(leaf)) {
7739                         ret = btrfs_next_leaf(root, path);
7740                         if (ret == 0)
7741                                 continue;
7742                         if (ret < 0)
7743                                 goto out;
7744                         break;
7745                 }
7746                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7747
7748                 if (found_key.objectid >= key->objectid &&
7749                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7750                         ret = 0;
7751                         goto out;
7752                 }
7753                 path->slots[0]++;
7754         }
7755 out:
7756         return ret;
7757 }
7758
7759 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7760 {
7761         struct btrfs_block_group_cache *block_group;
7762         u64 last = 0;
7763
7764         while (1) {
7765                 struct inode *inode;
7766
7767                 block_group = btrfs_lookup_first_block_group(info, last);
7768                 while (block_group) {
7769                         spin_lock(&block_group->lock);
7770                         if (block_group->iref)
7771                                 break;
7772                         spin_unlock(&block_group->lock);
7773                         block_group = next_block_group(info->tree_root,
7774                                                        block_group);
7775                 }
7776                 if (!block_group) {
7777                         if (last == 0)
7778                                 break;
7779                         last = 0;
7780                         continue;
7781                 }
7782
7783                 inode = block_group->inode;
7784                 block_group->iref = 0;
7785                 block_group->inode = NULL;
7786                 spin_unlock(&block_group->lock);
7787                 iput(inode);
7788                 last = block_group->key.objectid + block_group->key.offset;
7789                 btrfs_put_block_group(block_group);
7790         }
7791 }
7792
7793 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7794 {
7795         struct btrfs_block_group_cache *block_group;
7796         struct btrfs_space_info *space_info;
7797         struct btrfs_caching_control *caching_ctl;
7798         struct rb_node *n;
7799
7800         down_write(&info->extent_commit_sem);
7801         while (!list_empty(&info->caching_block_groups)) {
7802                 caching_ctl = list_entry(info->caching_block_groups.next,
7803                                          struct btrfs_caching_control, list);
7804                 list_del(&caching_ctl->list);
7805                 put_caching_control(caching_ctl);
7806         }
7807         up_write(&info->extent_commit_sem);
7808
7809         spin_lock(&info->block_group_cache_lock);
7810         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7811                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7812                                        cache_node);
7813                 rb_erase(&block_group->cache_node,
7814                          &info->block_group_cache_tree);
7815                 spin_unlock(&info->block_group_cache_lock);
7816
7817                 down_write(&block_group->space_info->groups_sem);
7818                 list_del(&block_group->list);
7819                 up_write(&block_group->space_info->groups_sem);
7820
7821                 if (block_group->cached == BTRFS_CACHE_STARTED)
7822                         wait_block_group_cache_done(block_group);
7823
7824                 /*
7825                  * We haven't cached this block group, which means we could
7826                  * possibly have excluded extents on this block group.
7827                  */
7828                 if (block_group->cached == BTRFS_CACHE_NO)
7829                         free_excluded_extents(info->extent_root, block_group);
7830
7831                 btrfs_remove_free_space_cache(block_group);
7832                 btrfs_put_block_group(block_group);
7833
7834                 spin_lock(&info->block_group_cache_lock);
7835         }
7836         spin_unlock(&info->block_group_cache_lock);
7837
7838         /* now that all the block groups are freed, go through and
7839          * free all the space_info structs.  This is only called during
7840          * the final stages of unmount, and so we know nobody is
7841          * using them.  We call synchronize_rcu() once before we start,
7842          * just to be on the safe side.
7843          */
7844         synchronize_rcu();
7845
7846         release_global_block_rsv(info);
7847
7848         while(!list_empty(&info->space_info)) {
7849                 space_info = list_entry(info->space_info.next,
7850                                         struct btrfs_space_info,
7851                                         list);
7852                 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
7853                         if (space_info->bytes_pinned > 0 ||
7854                             space_info->bytes_reserved > 0 ||
7855                             space_info->bytes_may_use > 0) {
7856                                 WARN_ON(1);
7857                                 dump_space_info(space_info, 0, 0);
7858                         }
7859                 }
7860                 list_del(&space_info->list);
7861                 kfree(space_info);
7862         }
7863         return 0;
7864 }
7865
7866 static void __link_block_group(struct btrfs_space_info *space_info,
7867                                struct btrfs_block_group_cache *cache)
7868 {
7869         int index = get_block_group_index(cache);
7870
7871         down_write(&space_info->groups_sem);
7872         list_add_tail(&cache->list, &space_info->block_groups[index]);
7873         up_write(&space_info->groups_sem);
7874 }
7875
7876 int btrfs_read_block_groups(struct btrfs_root *root)
7877 {
7878         struct btrfs_path *path;
7879         int ret;
7880         struct btrfs_block_group_cache *cache;
7881         struct btrfs_fs_info *info = root->fs_info;
7882         struct btrfs_space_info *space_info;
7883         struct btrfs_key key;
7884         struct btrfs_key found_key;
7885         struct extent_buffer *leaf;
7886         int need_clear = 0;
7887         u64 cache_gen;
7888
7889         root = info->extent_root;
7890         key.objectid = 0;
7891         key.offset = 0;
7892         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7893         path = btrfs_alloc_path();
7894         if (!path)
7895                 return -ENOMEM;
7896         path->reada = 1;
7897
7898         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7899         if (btrfs_test_opt(root, SPACE_CACHE) &&
7900             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7901                 need_clear = 1;
7902         if (btrfs_test_opt(root, CLEAR_CACHE))
7903                 need_clear = 1;
7904
7905         while (1) {
7906                 ret = find_first_block_group(root, path, &key);
7907                 if (ret > 0)
7908                         break;
7909                 if (ret != 0)
7910                         goto error;
7911                 leaf = path->nodes[0];
7912                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7913                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7914                 if (!cache) {
7915                         ret = -ENOMEM;
7916                         goto error;
7917                 }
7918                 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7919                                                 GFP_NOFS);
7920                 if (!cache->free_space_ctl) {
7921                         kfree(cache);
7922                         ret = -ENOMEM;
7923                         goto error;
7924                 }
7925
7926                 atomic_set(&cache->count, 1);
7927                 spin_lock_init(&cache->lock);
7928                 cache->fs_info = info;
7929                 INIT_LIST_HEAD(&cache->list);
7930                 INIT_LIST_HEAD(&cache->cluster_list);
7931
7932                 if (need_clear) {
7933                         /*
7934                          * When we mount with old space cache, we need to
7935                          * set BTRFS_DC_CLEAR and set dirty flag.
7936                          *
7937                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7938                          *    truncate the old free space cache inode and
7939                          *    setup a new one.
7940                          * b) Setting 'dirty flag' makes sure that we flush
7941                          *    the new space cache info onto disk.
7942                          */
7943                         cache->disk_cache_state = BTRFS_DC_CLEAR;
7944                         if (btrfs_test_opt(root, SPACE_CACHE))
7945                                 cache->dirty = 1;
7946                 }
7947
7948                 read_extent_buffer(leaf, &cache->item,
7949                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7950                                    sizeof(cache->item));
7951                 memcpy(&cache->key, &found_key, sizeof(found_key));
7952
7953                 key.objectid = found_key.objectid + found_key.offset;
7954                 btrfs_release_path(path);
7955                 cache->flags = btrfs_block_group_flags(&cache->item);
7956                 cache->sectorsize = root->sectorsize;
7957                 cache->full_stripe_len = btrfs_full_stripe_len(root,
7958                                                &root->fs_info->mapping_tree,
7959                                                found_key.objectid);
7960                 btrfs_init_free_space_ctl(cache);
7961
7962                 /*
7963                  * We need to exclude the super stripes now so that the space
7964                  * info has super bytes accounted for, otherwise we'll think
7965                  * we have more space than we actually do.
7966                  */
7967                 exclude_super_stripes(root, cache);
7968
7969                 /*
7970                  * check for two cases, either we are full, and therefore
7971                  * don't need to bother with the caching work since we won't
7972                  * find any space, or we are empty, and we can just add all
7973                  * the space in and be done with it.  This saves us _alot_ of
7974                  * time, particularly in the full case.
7975                  */
7976                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7977                         cache->last_byte_to_unpin = (u64)-1;
7978                         cache->cached = BTRFS_CACHE_FINISHED;
7979                         free_excluded_extents(root, cache);
7980                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7981                         cache->last_byte_to_unpin = (u64)-1;
7982                         cache->cached = BTRFS_CACHE_FINISHED;
7983                         add_new_free_space(cache, root->fs_info,
7984                                            found_key.objectid,
7985                                            found_key.objectid +
7986                                            found_key.offset);
7987                         free_excluded_extents(root, cache);
7988                 }
7989
7990                 ret = update_space_info(info, cache->flags, found_key.offset,
7991                                         btrfs_block_group_used(&cache->item),
7992                                         &space_info);
7993                 BUG_ON(ret); /* -ENOMEM */
7994                 cache->space_info = space_info;
7995                 spin_lock(&cache->space_info->lock);
7996                 cache->space_info->bytes_readonly += cache->bytes_super;
7997                 spin_unlock(&cache->space_info->lock);
7998
7999                 __link_block_group(space_info, cache);
8000
8001                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
8002                 BUG_ON(ret); /* Logic error */
8003
8004                 set_avail_alloc_bits(root->fs_info, cache->flags);
8005                 if (btrfs_chunk_readonly(root, cache->key.objectid))
8006                         set_block_group_ro(cache, 1);
8007         }
8008
8009         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8010                 if (!(get_alloc_profile(root, space_info->flags) &
8011                       (BTRFS_BLOCK_GROUP_RAID10 |
8012                        BTRFS_BLOCK_GROUP_RAID1 |
8013                        BTRFS_BLOCK_GROUP_RAID5 |
8014                        BTRFS_BLOCK_GROUP_RAID6 |
8015                        BTRFS_BLOCK_GROUP_DUP)))
8016                         continue;
8017                 /*
8018                  * avoid allocating from un-mirrored block group if there are
8019                  * mirrored block groups.
8020                  */
8021                 list_for_each_entry(cache, &space_info->block_groups[3], list)
8022                         set_block_group_ro(cache, 1);
8023                 list_for_each_entry(cache, &space_info->block_groups[4], list)
8024                         set_block_group_ro(cache, 1);
8025         }
8026
8027         init_global_block_rsv(info);
8028         ret = 0;
8029 error:
8030         btrfs_free_path(path);
8031         return ret;
8032 }
8033
8034 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8035                                        struct btrfs_root *root)
8036 {
8037         struct btrfs_block_group_cache *block_group, *tmp;
8038         struct btrfs_root *extent_root = root->fs_info->extent_root;
8039         struct btrfs_block_group_item item;
8040         struct btrfs_key key;
8041         int ret = 0;
8042
8043         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
8044                                  new_bg_list) {
8045                 list_del_init(&block_group->new_bg_list);
8046
8047                 if (ret)
8048                         continue;
8049
8050                 spin_lock(&block_group->lock);
8051                 memcpy(&item, &block_group->item, sizeof(item));
8052                 memcpy(&key, &block_group->key, sizeof(key));
8053                 spin_unlock(&block_group->lock);
8054
8055                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
8056                                         sizeof(item));
8057                 if (ret)
8058                         btrfs_abort_transaction(trans, extent_root, ret);
8059         }
8060 }
8061
8062 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8063                            struct btrfs_root *root, u64 bytes_used,
8064                            u64 type, u64 chunk_objectid, u64 chunk_offset,
8065                            u64 size)
8066 {
8067         int ret;
8068         struct btrfs_root *extent_root;
8069         struct btrfs_block_group_cache *cache;
8070
8071         extent_root = root->fs_info->extent_root;
8072
8073         root->fs_info->last_trans_log_full_commit = trans->transid;
8074
8075         cache = kzalloc(sizeof(*cache), GFP_NOFS);
8076         if (!cache)
8077                 return -ENOMEM;
8078         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8079                                         GFP_NOFS);
8080         if (!cache->free_space_ctl) {
8081                 kfree(cache);
8082                 return -ENOMEM;
8083         }
8084
8085         cache->key.objectid = chunk_offset;
8086         cache->key.offset = size;
8087         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8088         cache->sectorsize = root->sectorsize;
8089         cache->fs_info = root->fs_info;
8090         cache->full_stripe_len = btrfs_full_stripe_len(root,
8091                                                &root->fs_info->mapping_tree,
8092                                                chunk_offset);
8093
8094         atomic_set(&cache->count, 1);
8095         spin_lock_init(&cache->lock);
8096         INIT_LIST_HEAD(&cache->list);
8097         INIT_LIST_HEAD(&cache->cluster_list);
8098         INIT_LIST_HEAD(&cache->new_bg_list);
8099
8100         btrfs_init_free_space_ctl(cache);
8101
8102         btrfs_set_block_group_used(&cache->item, bytes_used);
8103         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8104         cache->flags = type;
8105         btrfs_set_block_group_flags(&cache->item, type);
8106
8107         cache->last_byte_to_unpin = (u64)-1;
8108         cache->cached = BTRFS_CACHE_FINISHED;
8109         exclude_super_stripes(root, cache);
8110
8111         add_new_free_space(cache, root->fs_info, chunk_offset,
8112                            chunk_offset + size);
8113
8114         free_excluded_extents(root, cache);
8115
8116         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8117                                 &cache->space_info);
8118         BUG_ON(ret); /* -ENOMEM */
8119         update_global_block_rsv(root->fs_info);
8120
8121         spin_lock(&cache->space_info->lock);
8122         cache->space_info->bytes_readonly += cache->bytes_super;
8123         spin_unlock(&cache->space_info->lock);
8124
8125         __link_block_group(cache->space_info, cache);
8126
8127         ret = btrfs_add_block_group_cache(root->fs_info, cache);
8128         BUG_ON(ret); /* Logic error */
8129
8130         list_add_tail(&cache->new_bg_list, &trans->new_bgs);
8131
8132         set_avail_alloc_bits(extent_root->fs_info, type);
8133
8134         return 0;
8135 }
8136
8137 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
8138 {
8139         u64 extra_flags = chunk_to_extended(flags) &
8140                                 BTRFS_EXTENDED_PROFILE_MASK;
8141
8142         write_seqlock(&fs_info->profiles_lock);
8143         if (flags & BTRFS_BLOCK_GROUP_DATA)
8144                 fs_info->avail_data_alloc_bits &= ~extra_flags;
8145         if (flags & BTRFS_BLOCK_GROUP_METADATA)
8146                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8147         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8148                 fs_info->avail_system_alloc_bits &= ~extra_flags;
8149         write_sequnlock(&fs_info->profiles_lock);
8150 }
8151
8152 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8153                              struct btrfs_root *root, u64 group_start)
8154 {
8155         struct btrfs_path *path;
8156         struct btrfs_block_group_cache *block_group;
8157         struct btrfs_free_cluster *cluster;
8158         struct btrfs_root *tree_root = root->fs_info->tree_root;
8159         struct btrfs_key key;
8160         struct inode *inode;
8161         int ret;
8162         int index;
8163         int factor;
8164
8165         root = root->fs_info->extent_root;
8166
8167         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8168         BUG_ON(!block_group);
8169         BUG_ON(!block_group->ro);
8170
8171         /*
8172          * Free the reserved super bytes from this block group before
8173          * remove it.
8174          */
8175         free_excluded_extents(root, block_group);
8176
8177         memcpy(&key, &block_group->key, sizeof(key));
8178         index = get_block_group_index(block_group);
8179         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8180                                   BTRFS_BLOCK_GROUP_RAID1 |
8181                                   BTRFS_BLOCK_GROUP_RAID10))
8182                 factor = 2;
8183         else
8184                 factor = 1;
8185
8186         /* make sure this block group isn't part of an allocation cluster */
8187         cluster = &root->fs_info->data_alloc_cluster;
8188         spin_lock(&cluster->refill_lock);
8189         btrfs_return_cluster_to_free_space(block_group, cluster);
8190         spin_unlock(&cluster->refill_lock);
8191
8192         /*
8193          * make sure this block group isn't part of a metadata
8194          * allocation cluster
8195          */
8196         cluster = &root->fs_info->meta_alloc_cluster;
8197         spin_lock(&cluster->refill_lock);
8198         btrfs_return_cluster_to_free_space(block_group, cluster);
8199         spin_unlock(&cluster->refill_lock);
8200
8201         path = btrfs_alloc_path();
8202         if (!path) {
8203                 ret = -ENOMEM;
8204                 goto out;
8205         }
8206
8207         inode = lookup_free_space_inode(tree_root, block_group, path);
8208         if (!IS_ERR(inode)) {
8209                 ret = btrfs_orphan_add(trans, inode);
8210                 if (ret) {
8211                         btrfs_add_delayed_iput(inode);
8212                         goto out;
8213                 }
8214                 clear_nlink(inode);
8215                 /* One for the block groups ref */
8216                 spin_lock(&block_group->lock);
8217                 if (block_group->iref) {
8218                         block_group->iref = 0;
8219                         block_group->inode = NULL;
8220                         spin_unlock(&block_group->lock);
8221                         iput(inode);
8222                 } else {
8223                         spin_unlock(&block_group->lock);
8224                 }
8225                 /* One for our lookup ref */
8226                 btrfs_add_delayed_iput(inode);
8227         }
8228
8229         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8230         key.offset = block_group->key.objectid;
8231         key.type = 0;
8232
8233         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8234         if (ret < 0)
8235                 goto out;
8236         if (ret > 0)
8237                 btrfs_release_path(path);
8238         if (ret == 0) {
8239                 ret = btrfs_del_item(trans, tree_root, path);
8240                 if (ret)
8241                         goto out;
8242                 btrfs_release_path(path);
8243         }
8244
8245         spin_lock(&root->fs_info->block_group_cache_lock);
8246         rb_erase(&block_group->cache_node,
8247                  &root->fs_info->block_group_cache_tree);
8248
8249         if (root->fs_info->first_logical_byte == block_group->key.objectid)
8250                 root->fs_info->first_logical_byte = (u64)-1;
8251         spin_unlock(&root->fs_info->block_group_cache_lock);
8252
8253         down_write(&block_group->space_info->groups_sem);
8254         /*
8255          * we must use list_del_init so people can check to see if they
8256          * are still on the list after taking the semaphore
8257          */
8258         list_del_init(&block_group->list);
8259         if (list_empty(&block_group->space_info->block_groups[index]))
8260                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8261         up_write(&block_group->space_info->groups_sem);
8262
8263         if (block_group->cached == BTRFS_CACHE_STARTED)
8264                 wait_block_group_cache_done(block_group);
8265
8266         btrfs_remove_free_space_cache(block_group);
8267
8268         spin_lock(&block_group->space_info->lock);
8269         block_group->space_info->total_bytes -= block_group->key.offset;
8270         block_group->space_info->bytes_readonly -= block_group->key.offset;
8271         block_group->space_info->disk_total -= block_group->key.offset * factor;
8272         spin_unlock(&block_group->space_info->lock);
8273
8274         memcpy(&key, &block_group->key, sizeof(key));
8275
8276         btrfs_clear_space_info_full(root->fs_info);
8277
8278         btrfs_put_block_group(block_group);
8279         btrfs_put_block_group(block_group);
8280
8281         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8282         if (ret > 0)
8283                 ret = -EIO;
8284         if (ret < 0)
8285                 goto out;
8286
8287         ret = btrfs_del_item(trans, root, path);
8288 out:
8289         btrfs_free_path(path);
8290         return ret;
8291 }
8292
8293 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8294 {
8295         struct btrfs_space_info *space_info;
8296         struct btrfs_super_block *disk_super;
8297         u64 features;
8298         u64 flags;
8299         int mixed = 0;
8300         int ret;
8301
8302         disk_super = fs_info->super_copy;
8303         if (!btrfs_super_root(disk_super))
8304                 return 1;
8305
8306         features = btrfs_super_incompat_flags(disk_super);
8307         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8308                 mixed = 1;
8309
8310         flags = BTRFS_BLOCK_GROUP_SYSTEM;
8311         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8312         if (ret)
8313                 goto out;
8314
8315         if (mixed) {
8316                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8317                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8318         } else {
8319                 flags = BTRFS_BLOCK_GROUP_METADATA;
8320                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8321                 if (ret)
8322                         goto out;
8323
8324                 flags = BTRFS_BLOCK_GROUP_DATA;
8325                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8326         }
8327 out:
8328         return ret;
8329 }
8330
8331 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8332 {
8333         return unpin_extent_range(root, start, end);
8334 }
8335
8336 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8337                                u64 num_bytes, u64 *actual_bytes)
8338 {
8339         return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8340 }
8341
8342 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8343 {
8344         struct btrfs_fs_info *fs_info = root->fs_info;
8345         struct btrfs_block_group_cache *cache = NULL;
8346         u64 group_trimmed;
8347         u64 start;
8348         u64 end;
8349         u64 trimmed = 0;
8350         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8351         int ret = 0;
8352
8353         /*
8354          * try to trim all FS space, our block group may start from non-zero.
8355          */
8356         if (range->len == total_bytes)
8357                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
8358         else
8359                 cache = btrfs_lookup_block_group(fs_info, range->start);
8360
8361         while (cache) {
8362                 if (cache->key.objectid >= (range->start + range->len)) {
8363                         btrfs_put_block_group(cache);
8364                         break;
8365                 }
8366
8367                 start = max(range->start, cache->key.objectid);
8368                 end = min(range->start + range->len,
8369                                 cache->key.objectid + cache->key.offset);
8370
8371                 if (end - start >= range->minlen) {
8372                         if (!block_group_cache_done(cache)) {
8373                                 ret = cache_block_group(cache, 0);
8374                                 if (!ret)
8375                                         wait_block_group_cache_done(cache);
8376                         }
8377                         ret = btrfs_trim_block_group(cache,
8378                                                      &group_trimmed,
8379                                                      start,
8380                                                      end,
8381                                                      range->minlen);
8382
8383                         trimmed += group_trimmed;
8384                         if (ret) {
8385                                 btrfs_put_block_group(cache);
8386                                 break;
8387                         }
8388                 }
8389
8390                 cache = next_block_group(fs_info->tree_root, cache);
8391         }
8392
8393         range->len = trimmed;
8394         return ret;
8395 }