Btrfs: qgroup implementation and prototypes

author Arne Jansen <sensille@gmx.net>

Thu, 28 Jun 2012 16:03:02 +0000 (18:03 +0200)

committer Jan Schmidt <list.btrfs@jan-o-sch.net>

Thu, 12 Jul 2012 08:54:21 +0000 (10:54 +0200)
author Arne Jansen <sensille@gmx.net>
Thu, 28 Jun 2012 16:03:02 +0000 (18:03 +0200)
committer Jan Schmidt <list.btrfs@jan-o-sch.net>
Thu, 12 Jul 2012 08:54:21 +0000 (10:54 +0200)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile

index 0c4fa2befae793f1a6845322d7ba71aaa5da4374..0bc4d3a10a5fe631b6a85873a6b44b72f6bce4c6 100644 (file)
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
            extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
            export.o tree-log.o free-space-cache.o zlib.o lzo.o \
            compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-          reada.o backref.o ulist.o
+          reada.o backref.o ulist.o qgroup.o
  
  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
  btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index a5269d4a164f2bdface78af68c64e33ccab2221d..ccba9b684c965043883eb6d2c1561b9be8ecc930 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2830,6 +2830,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
  
  int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                      int level, int *slot);
@@ -3339,6 +3341,50 @@ void btrfs_reada_detach(void *handle);
  int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
                          u64 start, int err);
  
+/* qgroup.c */
+/*
+ * One queued qgroup accounting update. Entries are linked through ->list on
+ * trans->qgroup_ref_list; btrfs_delayed_refs_qgroup_accounting() hands ->node
+ * and ->extent_op to btrfs_qgroup_account_ref() and then frees the entry.
+ */
+struct qgroup_update {
+       /* link in trans->qgroup_ref_list */
+       struct list_head list;
+       /* delayed ref this update is accounted for */
+       struct btrfs_delayed_ref_node *node;
+       /* passed through unchanged to btrfs_qgroup_account_ref() */
+       struct btrfs_delayed_extent_op *extent_op;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info);
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid,
+                       char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info, u64 qgroupid,
+                      struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_delayed_ref_node *node,
+                           struct btrfs_delayed_extent_op *extent_op);
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info,
+                            struct btrfs_delayed_ref_node *node,
+                            struct btrfs_delayed_extent_op *extent_op);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+                     struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+                        struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
  static inline int is_fstree(u64 rootid)
  {
         if (rootid == BTRFS_FS_TREE_OBJECTID ||
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index b13f1fbc3733d52db2b282f45cbe734bc29d0edd..1a63b830846dc261ea1c59e78271966c325c35d2 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2409,6 +2409,40 @@ static u64 find_middle(struct rb_root *root)
  }
  #endif
  
+/*
+ * Run the qgroup accounting for every update queued on this transaction
+ * handle, then drop the handle's tree mod sequence element.
+ *
+ * A non-empty trans->qgroup_ref_list and a non-zero
+ * trans->delayed_ref_elem.seq must always come as a pair; seeing only one of
+ * them is reported and treated as a fatal inconsistency.
+ *
+ * Every queued entry is unlinked and freed. Accounting is no longer attempted
+ * once one entry has failed; that first error is what gets returned.
+ * Returns 0 when nothing was queued or everything was accounted.
+ */
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info)
+{
+       struct list_head *pending = &trans->qgroup_ref_list;
+       struct qgroup_update *update;
+       int err = 0;
+
+       if (list_empty(pending) != !trans->delayed_ref_elem.seq) {
+               /* list without seq or seq without list */
+               printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+                       list_empty(pending) ? "" : " not",
+                       trans->delayed_ref_elem.seq);
+               BUG();
+       }
+
+       if (trans->delayed_ref_elem.seq == 0)
+               return 0;
+
+       while (!list_empty(pending)) {
+               update = list_first_entry(pending, struct qgroup_update, list);
+               list_del(&update->list);
+
+               /* keep draining the list after a failure, but stop accounting */
+               if (err == 0)
+                       err = btrfs_qgroup_account_ref(trans, fs_info,
+                                                      update->node,
+                                                      update->extent_op);
+               kfree(update);
+       }
+
+       btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+       return err;
+}
+
  /*
   * this starts processing the delayed reference count updates and
   * extent insertions we have queued up so far.  count can be
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h

index e440aa653c30d6f6c8ad1e7437bfa6e2b4d50799..a8a2230f4c5c4180d91db74ff4ba4f9f2fbf1349 100644 (file)
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -35,6 +35,30 @@ struct btrfs_ioctl_vol_args {
  #define BTRFS_FSID_SIZE 16
  #define BTRFS_UUID_SIZE 16
  
+#define BTRFS_QGROUP_INHERIT_SET_LIMITS        (1ULL << 0)
+
+/*
+ * Limit settings of one qgroup. The field names match the limit fields kept
+ * per qgroup in qgroup.c (lim_flags, max_rfer, max_excl, rsv_rfer, rsv_excl),
+ * where "rfer" stands for referenced and "excl" for exclusive.
+ */
+struct btrfs_qgroup_limit {
+       __u64   flags;          /* which limits are set */
+       __u64   max_rfer;
+       __u64   max_excl;
+       __u64   rsv_rfer;
+       __u64   rsv_excl;
+};
+
+/*
+ * Argument block taken by btrfs_qgroup_inherit().
+ *
+ * NOTE(review): the meaning of the three counters is not visible in this
+ * header; presumably they describe how the trailing qgroups[] array is
+ * divided up -- confirm against btrfs_qgroup_inherit() in qgroup.c.
+ */
+struct btrfs_qgroup_inherit {
+       __u64   flags;          /* e.g. BTRFS_QGROUP_INHERIT_SET_LIMITS */
+       __u64   num_qgroups;
+       __u64   num_ref_copies;
+       __u64   num_excl_copies;
+       struct btrfs_qgroup_limit lim;
+       __u64   qgroups[0];     /* variable-length tail */
+};
+
+/*
+ * Pairs a qgroup id with a set of limits.
+ * NOTE(review): presumably the argument of the qgroup limit ioctl; the ioctl
+ * number definition is not part of this hunk -- confirm.
+ */
+struct btrfs_ioctl_qgroup_limit_args {
+       __u64   qgroupid;
+       struct btrfs_qgroup_limit lim;
+};
+
  #define BTRFS_SUBVOL_NAME_MAX 4039
  struct btrfs_ioctl_vol_args_v2 {
         __s64 fd;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c

new file mode 100644 (file)

index 0000000..bc424ae
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,1571 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "ioctl.h"
+#include "backref.h"
+
+/* TODO XXX FIXME
+ *  - subvol delete -> delete when ref goes to 0? delete limits also?
+ *  - reorganize keys
+ *  - compressed
+ *  - sync
+ *  - rescan
+ *  - copy also limits on subvol creation
+ *  - limit
+ *  - caches for ulists
+ *  - performance benchmarks
+ *  - check all ioctl parameters
+ */
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ *
+ * The tree is an rbtree keyed by qgroupid (see find_qgroup_rb() and
+ * add_qgroup_rb()). The state and limit fields are filled from the on-disk
+ * qgroup info and limit items by btrfs_read_qgroup_config().
+ */
+struct btrfs_qgroup {
+       u64 qgroupid;
+
+       /*
+        * state
+        */
+       u64 rfer;       /* referenced */
+       u64 rfer_cmpr;  /* referenced compressed */
+       u64 excl;       /* exclusive */
+       u64 excl_cmpr;  /* exclusive compressed */
+
+       /*
+        * limits
+        */
+       u64 lim_flags;  /* which limits are set */
+       u64 max_rfer;
+       u64 max_excl;
+       u64 rsv_rfer;
+       u64 rsv_excl;
+
+       /*
+        * reservation tracking
+        */
+       u64 reserved;
+
+       /*
+        * lists
+        *
+        * groups and members hold struct btrfs_qgroup_list entries, linked
+        * through their next_group and next_member fields respectively.
+        */
+       struct list_head groups;  /* groups this group is member of */
+       struct list_head members; /* groups that are members of this group */
+       struct list_head dirty;   /* dirty groups */
+       struct rb_node node;      /* tree of qgroups */
+
+       /*
+        * temp variables for accounting operations
+        */
+       u64 tag;
+       u64 refcnt;
+};
+
+/*
+ * glue structure to represent the relations between qgroups.
+ *
+ * One entry exists per (member, parent group) relation. It sits on two lists
+ * at once, so the relation can be reached from either side.
+ */
+struct btrfs_qgroup_list {
+       struct list_head next_group;    /* link in member->groups */
+       struct list_head next_member;   /* link in group->members */
+       struct btrfs_qgroup *group;     /* the parent group of the relation */
+       struct btrfs_qgroup *member;    /* the qgroup that is the member */
+};
+
+/*
+ * Look up the in-memory qgroup with the given id in fs_info->qgroup_tree.
+ * Returns the qgroup, or NULL if the tree holds no qgroup with that id.
+ *
+ * must be called with qgroup_lock held
+ */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+                                          u64 qgroupid)
+{
+       struct rb_node *cur = fs_info->qgroup_tree.rb_node;
+
+       while (cur) {
+               struct btrfs_qgroup *entry;
+
+               entry = rb_entry(cur, struct btrfs_qgroup, node);
+               if (entry->qgroupid == qgroupid)
+                       return entry;
+
+               /* same ordering as add_qgroup_rb(): larger ids are on the left */
+               if (entry->qgroupid < qgroupid)
+                       cur = cur->rb_left;
+               else
+                       cur = cur->rb_right;
+       }
+
+       return NULL;
+}
+
+/*
+ * Return the in-memory qgroup with the given id, creating and inserting a
+ * zero-initialized one into fs_info->qgroup_tree if it does not exist yet.
+ * Returns ERR_PTR(-ENOMEM) if the allocation fails.
+ *
+ * must be called with qgroup_lock held
+ */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+                                         u64 qgroupid)
+{
+       struct rb_node **link = &fs_info->qgroup_tree.rb_node;
+       struct rb_node *parent = NULL;
+       struct btrfs_qgroup *entry;
+
+       /* walk down to the id, or to the empty link where it belongs */
+       while (*link) {
+               parent = *link;
+               entry = rb_entry(parent, struct btrfs_qgroup, node);
+
+               if (entry->qgroupid == qgroupid)
+                       return entry;
+
+               if (entry->qgroupid < qgroupid)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+       if (!entry)
+               return ERR_PTR(-ENOMEM);
+
+       entry->qgroupid = qgroupid;
+       INIT_LIST_HEAD(&entry->groups);
+       INIT_LIST_HEAD(&entry->members);
+       INIT_LIST_HEAD(&entry->dirty);
+
+       rb_link_node(&entry->node, parent, link);
+       rb_insert_color(&entry->node, &fs_info->qgroup_tree);
+
+       return entry;
+}
+
+/*
+ * Remove the qgroup with the given id from the in-memory tree, take it off
+ * the dirty list, tear down every relation it takes part in (as member or as
+ * parent group) and free it.
+ * Returns 0 on success, -ENOENT if no qgroup with that id exists.
+ *
+ * must be called with qgroup_lock held
+ */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+       struct btrfs_qgroup *victim = find_qgroup_rb(fs_info, qgroupid);
+       struct btrfs_qgroup_list *rel;
+       struct btrfs_qgroup_list *next;
+
+       if (!victim)
+               return -ENOENT;
+
+       rb_erase(&victim->node, &fs_info->qgroup_tree);
+       list_del(&victim->dirty);
+
+       /* relations in which this qgroup is the member */
+       list_for_each_entry_safe(rel, next, &victim->groups, next_group) {
+               list_del(&rel->next_group);
+               list_del(&rel->next_member);
+               kfree(rel);
+       }
+
+       /* relations in which this qgroup is the parent group */
+       list_for_each_entry_safe(rel, next, &victim->members, next_member) {
+               list_del(&rel->next_group);
+               list_del(&rel->next_member);
+               kfree(rel);
+       }
+
+       kfree(victim);
+
+       return 0;
+}
+
+/*
+ * Record in memory that qgroup @memberid is a member of qgroup @parentid.
+ * Returns 0 on success, -ENOENT if either qgroup is unknown and -ENOMEM if
+ * the glue structure cannot be allocated.
+ *
+ * must be called with qgroup_lock held
+ */
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+                          u64 memberid, u64 parentid)
+{
+       struct btrfs_qgroup *member = find_qgroup_rb(fs_info, memberid);
+       struct btrfs_qgroup *parent = find_qgroup_rb(fs_info, parentid);
+       struct btrfs_qgroup_list *rel;
+
+       if (!(member && parent))
+               return -ENOENT;
+
+       rel = kzalloc(sizeof(*rel), GFP_ATOMIC);
+       if (!rel)
+               return -ENOMEM;
+
+       rel->group = parent;
+       rel->member = member;
+
+       /* make the relation reachable from both of its ends */
+       list_add_tail(&rel->next_group, &member->groups);
+       list_add_tail(&rel->next_member, &parent->members);
+
+       return 0;
+}
+
+/*
+ * Drop the in-memory relation "qgroup @memberid is a member of @parentid".
+ * Returns 0 on success, -ENOENT if either qgroup or the relation itself does
+ * not exist.
+ *
+ * must be called with qgroup_lock held
+ */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+                          u64 memberid, u64 parentid)
+{
+       struct btrfs_qgroup *member = find_qgroup_rb(fs_info, memberid);
+       struct btrfs_qgroup *parent = find_qgroup_rb(fs_info, parentid);
+       struct btrfs_qgroup_list *rel;
+       struct btrfs_qgroup_list *found = NULL;
+
+       if (!(member && parent))
+               return -ENOENT;
+
+       /* the first matching relation is the one that gets removed */
+       list_for_each_entry(rel, &member->groups, next_group) {
+               if (rel->group == parent) {
+                       found = rel;
+                       break;
+               }
+       }
+
+       if (!found)
+               return -ENOENT;
+
+       list_del(&found->next_group);
+       list_del(&found->next_member);
+       kfree(found);
+
+       return 0;
+}
+
+/*
+ * Build the in-memory qgroup configuration from the quota tree.
+ *
+ * The full config is read in one go, only called from open_ctree()
+ * It doesn't use any locking, as at this point we're still single-threaded
+ *
+ * Pass 1 walks all items and handles the status item plus every qgroup info
+ * and limit item; pass 2 walks the relation items and links the qgroups
+ * created in pass 1. If the status flags do not end up with
+ * BTRFS_QGROUP_STATUS_FLAG_ON set, quota is switched off in fs_info.
+ *
+ * Returns 0 on success (including "quota not enabled" and "quota turned off
+ * while reading") and a negative errno on failure. Qgroups already added to
+ * the tree are not freed here on failure.
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct btrfs_path *path = NULL;
+       struct extent_buffer *l;
+       int slot;
+       int ret = 0;
+       /* status bits detected while reading, merged into qgroup_flags at out: */
+       u64 flags = 0;
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /* default this to quota off, in case no status key is found */
+       fs_info->qgroup_flags = 0;
+
+       /*
+        * pass 1: read status, all qgroup infos and limits
+        */
+       key.objectid = 0;
+       key.type = 0;
+       key.offset = 0;
+       ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+       if (ret)
+               goto out;
+
+       while (1) {
+               struct btrfs_qgroup *qgroup;
+
+               slot = path->slots[0];
+               l = path->nodes[0];
+               btrfs_item_key_to_cpu(l, &found_key, slot);
+
+               if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+                       struct btrfs_qgroup_status_item *ptr;
+
+                       ptr = btrfs_item_ptr(l, slot,
+                                            struct btrfs_qgroup_status_item);
+
+                       /*
+                        * Unknown version: bail out with ret == 0. qgroup_flags
+                        * is still 0 here, so the code at out: disables quota.
+                        */
+                       if (btrfs_qgroup_status_version(l, ptr) !=
+                           BTRFS_QGROUP_STATUS_VERSION) {
+                               printk(KERN_ERR
+                                "btrfs: old qgroup version, quota disabled\n");
+                               goto out;
+                       }
+                       if (btrfs_qgroup_status_generation(l, ptr) !=
+                           fs_info->generation) {
+                               flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+                               printk(KERN_ERR
+                                       "btrfs: qgroup generation mismatch, "
+                                       "marked as inconsistent\n");
+                       }
+                       fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+                                                                         ptr);
+                       /* FIXME read scan element */
+                       goto next1;
+               }
+
+               /* everything but info and limit items is ignored in pass 1 */
+               if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+                   found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+                       goto next1;
+
+               /*
+                * An info item is expected to create the qgroup and a limit
+                * item is expected to find it already present; anything else
+                * is flagged as inconsistent, but reading continues.
+                */
+               qgroup = find_qgroup_rb(fs_info, found_key.offset);
+               if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+                   (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+                       printk(KERN_ERR "btrfs: inconsitent qgroup config\n");
+                       flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               }
+               if (!qgroup) {
+                       qgroup = add_qgroup_rb(fs_info, found_key.offset);
+                       if (IS_ERR(qgroup)) {
+                               ret = PTR_ERR(qgroup);
+                               goto out;
+                       }
+               }
+               switch (found_key.type) {
+               case BTRFS_QGROUP_INFO_KEY: {
+                       struct btrfs_qgroup_info_item *ptr;
+
+                       ptr = btrfs_item_ptr(l, slot,
+                                            struct btrfs_qgroup_info_item);
+                       qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+                       qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+                       qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+                       qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+                       /* generation currently unused */
+                       break;
+               }
+               case BTRFS_QGROUP_LIMIT_KEY: {
+                       struct btrfs_qgroup_limit_item *ptr;
+
+                       ptr = btrfs_item_ptr(l, slot,
+                                            struct btrfs_qgroup_limit_item);
+                       qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+                       qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+                       qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+                       qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+                       qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+                       break;
+               }
+               }
+next1:
+               /* negative: error, positive: no more items */
+               ret = btrfs_next_item(quota_root, path);
+               if (ret < 0)
+                       goto out;
+               if (ret)
+                       break;
+       }
+       btrfs_release_path(path);
+
+       /*
+        * pass 2: read all qgroup relations
+        */
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_RELATION_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+       if (ret)
+               goto out;
+       while (1) {
+               slot = path->slots[0];
+               l = path->nodes[0];
+               btrfs_item_key_to_cpu(l, &found_key, slot);
+
+               if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+                       goto next2;
+
+               if (found_key.objectid > found_key.offset) {
+                       /* parent <- member, not needed to build config */
+                       /* FIXME should we omit the key completely? */
+                       goto next2;
+               }
+
+               /* key is (member id, RELATION, parent id) */
+               ret = add_relation_rb(fs_info, found_key.objectid,
+                                     found_key.offset);
+               if (ret)
+                       goto out;
+next2:
+               ret = btrfs_next_item(quota_root, path);
+               if (ret < 0)
+                       goto out;
+               if (ret)
+                       break;
+       }
+out:
+       /* fold in what was detected above; without the ON bit quota is off */
+       fs_info->qgroup_flags |= flags;
+       if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
+               fs_info->quota_enabled = 0;
+               fs_info->pending_quota_state = 0;
+       }
+       btrfs_free_path(path);
+
+       /* a positive ret only means "no (more) items" and is not an error */
+       return ret < 0 ? ret : 0;
+}
+
+/*
+ * Free the complete in-memory qgroup configuration: every qgroup in
+ * fs_info->qgroup_tree together with all relation glue structures.
+ *
+ * This is only called from close_ctree() or open_ctree(), both in
+ * single-threaded paths. No locking needed.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+       struct rb_root *tree = &fs_info->qgroup_tree;
+       struct rb_node *node;
+       struct btrfs_qgroup *entry;
+       struct btrfs_qgroup_list *rel;
+       struct btrfs_qgroup_list *next;
+
+       /* keep taking the first node until the tree has been emptied */
+       for (node = rb_first(tree); node; node = rb_first(tree)) {
+               entry = rb_entry(node, struct btrfs_qgroup, node);
+               rb_erase(node, tree);
+
+               /* nothing should still be waiting to be written out */
+               WARN_ON(!list_empty(&entry->dirty));
+
+               list_for_each_entry_safe(rel, next, &entry->groups,
+                                        next_group) {
+                       list_del(&rel->next_group);
+                       list_del(&rel->next_member);
+                       kfree(rel);
+               }
+
+               list_for_each_entry_safe(rel, next, &entry->members,
+                                        next_member) {
+                       list_del(&rel->next_group);
+                       list_del(&rel->next_member);
+                       kfree(rel);
+               }
+
+               kfree(entry);
+       }
+}
+
+/*
+ * Insert the empty on-disk relation item (src, RELATION, dst) into the quota
+ * tree.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the error
+ * returned by btrfs_insert_empty_item().
+ */
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *quota_root,
+                                   u64 src, u64 dst)
+{
+       int ret;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = src;
+       key.type = BTRFS_QGROUP_RELATION_KEY;
+       key.offset = dst;
+
+       ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+       /*
+        * Only touch the leaf when the item was really inserted. After a
+        * failed insert path->nodes[0] may be unset or point at a leaf that
+        * was not modified, so it must not be dereferenced or dirtied.
+        */
+       if (!ret)
+               btrfs_mark_buffer_dirty(path->nodes[0]);
+
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Delete the on-disk relation item (src, RELATION, dst) from the quota tree.
+ *
+ * Returns 0 on success, -ENOENT if the item does not exist, -ENOMEM if no
+ * path can be allocated, or the error from the search or the deletion.
+ */
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *quota_root,
+                                   u64 src, u64 dst)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       int err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = src;
+       key.type = BTRFS_QGROUP_RELATION_KEY;
+       key.offset = dst;
+
+       err = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+       if (err == 0)
+               err = btrfs_del_item(trans, quota_root, path);
+       else if (err > 0)
+               err = -ENOENT;  /* no exact match for the key */
+
+       btrfs_free_path(path);
+       return err;
+}
+
+/*
+ * Create the on-disk items of a new qgroup: an info item with all counters
+ * set to 0 and the current transaction id as generation, followed by a limit
+ * item with all flags and limits set to 0.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the error
+ * from btrfs_insert_empty_item(). If the limit item cannot be inserted, the
+ * already inserted info item is left in place.
+ */
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *quota_root, u64 qgroupid)
+{
+       struct btrfs_qgroup_info_item *info;
+       struct btrfs_qgroup_limit_item *limit;
+       struct extent_buffer *eb;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /* first item: (0, INFO, qgroupid) */
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_INFO_KEY;
+       key.offset = qgroupid;
+
+       err = btrfs_insert_empty_item(trans, quota_root, path, &key,
+                                     sizeof(*info));
+       if (err)
+               goto out;
+
+       eb = path->nodes[0];
+       info = btrfs_item_ptr(eb, path->slots[0],
+                             struct btrfs_qgroup_info_item);
+       btrfs_set_qgroup_info_generation(eb, info, trans->transid);
+       btrfs_set_qgroup_info_rfer(eb, info, 0);
+       btrfs_set_qgroup_info_rfer_cmpr(eb, info, 0);
+       btrfs_set_qgroup_info_excl(eb, info, 0);
+       btrfs_set_qgroup_info_excl_cmpr(eb, info, 0);
+       btrfs_mark_buffer_dirty(eb);
+
+       btrfs_release_path(path);
+
+       /* second item: (0, LIMIT, qgroupid) */
+       key.type = BTRFS_QGROUP_LIMIT_KEY;
+       err = btrfs_insert_empty_item(trans, quota_root, path, &key,
+                                     sizeof(*limit));
+       if (err)
+               goto out;
+
+       eb = path->nodes[0];
+       limit = btrfs_item_ptr(eb, path->slots[0],
+                              struct btrfs_qgroup_limit_item);
+       btrfs_set_qgroup_limit_flags(eb, limit, 0);
+       btrfs_set_qgroup_limit_max_rfer(eb, limit, 0);
+       btrfs_set_qgroup_limit_max_excl(eb, limit, 0);
+       btrfs_set_qgroup_limit_rsv_rfer(eb, limit, 0);
+       btrfs_set_qgroup_limit_rsv_excl(eb, limit, 0);
+       btrfs_mark_buffer_dirty(eb);
+
+out:
+       btrfs_free_path(path);
+       return err;
+}
+
+/*
+ * Delete the on-disk info item and then the limit item of a qgroup.
+ *
+ * Returns 0 on success, -ENOENT if either item is missing, -ENOMEM if no
+ * path can be allocated, or the error from the search or the deletion.
+ * Nothing is rolled back: if the limit item is missing, the info item has
+ * already been deleted.
+ */
+static int del_qgroup_item(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *quota_root, u64 qgroupid)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       int err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /* (0, INFO, qgroupid) */
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_INFO_KEY;
+       key.offset = qgroupid;
+       err = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+       if (err) {
+               if (err > 0)
+                       err = -ENOENT;
+               goto out;
+       }
+
+       err = btrfs_del_item(trans, quota_root, path);
+       if (err)
+               goto out;
+
+       btrfs_release_path(path);
+
+       /* (0, LIMIT, qgroupid) */
+       key.type = BTRFS_QGROUP_LIMIT_KEY;
+       err = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+       if (err) {
+               if (err > 0)
+                       err = -ENOENT;
+               goto out;
+       }
+
+       err = btrfs_del_item(trans, quota_root, path);
+
+out:
+       btrfs_free_path(path);
+       return err;
+}
+
+/*
+ * Overwrite the on-disk limit item of qgroup @qgroupid with the given flags
+ * and limits.
+ *
+ * Returns 0 on success, -ENOENT if the limit item does not exist, -ENOMEM if
+ * no path can be allocated, or the error from btrfs_search_slot().
+ */
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root, u64 qgroupid,
+                                   u64 flags, u64 max_rfer, u64 max_excl,
+                                   u64 rsv_rfer, u64 rsv_excl)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *l;
+       struct btrfs_qgroup_limit_item *qgroup_limit;
+       int ret;
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_LIMIT_KEY;
+       key.offset = qgroupid;
+
+       /* report allocation failure to the caller instead of crashing */
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret > 0)
+               ret = -ENOENT;
+
+       if (ret)
+               goto out;
+
+       l = path->nodes[0];
+       qgroup_limit = btrfs_item_ptr(l, path->slots[0],
+                                     struct btrfs_qgroup_limit_item);
+       btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
+       btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
+       btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
+       btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
+       btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+
+       btrfs_mark_buffer_dirty(l);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Write the in-memory counters of @qgroup (rfer, rfer_cmpr, excl, excl_cmpr)
+ * to its on-disk info item and stamp it with the current transaction id.
+ *
+ * Returns 0 on success, -ENOENT if the info item does not exist, -ENOMEM if
+ * no path can be allocated, or the error from btrfs_search_slot().
+ */
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct btrfs_qgroup *qgroup)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *l;
+       struct btrfs_qgroup_info_item *qgroup_info;
+       int ret;
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_INFO_KEY;
+       key.offset = qgroup->qgroupid;
+
+       /* report allocation failure to the caller instead of crashing */
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret > 0)
+               ret = -ENOENT;
+
+       if (ret)
+               goto out;
+
+       l = path->nodes[0];
+       qgroup_info = btrfs_item_ptr(l, path->slots[0],
+                                struct btrfs_qgroup_info_item);
+       btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+       btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+       btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+       btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+       btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+       btrfs_mark_buffer_dirty(l);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Write fs_info->qgroup_flags and the current transaction id (as generation)
+ * to the on-disk qgroup status item. The other fields of the status item are
+ * left untouched.
+ *
+ * Returns 0 on success, -ENOENT if the status item does not exist, -ENOMEM
+ * if no path can be allocated, or the error from btrfs_search_slot().
+ */
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
+                                    struct btrfs_fs_info *fs_info,
+                                   struct btrfs_root *root)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *l;
+       struct btrfs_qgroup_status_item *ptr;
+       int ret;
+       int slot;
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_STATUS_KEY;
+       key.offset = 0;
+
+       /* report allocation failure to the caller instead of crashing */
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret > 0)
+               ret = -ENOENT;
+
+       if (ret)
+               goto out;
+
+       l = path->nodes[0];
+       slot = path->slots[0];
+       ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+       btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+       btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+       /* XXX scan */
+
+       btrfs_mark_buffer_dirty(l);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Delete every item from the quota tree @root, one item per iteration,
+ * always removing the last item of the tree.
+ *
+ * fs_info->pending_quota_state is reset to 0 on every exit path that comes
+ * after the path allocation, whether the cleanup succeeded or not.
+ *
+ * Returns 0 once the tree is empty, -EINVAL if @root is NULL, -ENOMEM if no
+ * path can be allocated, or the error from btrfs_search_slot() or
+ * btrfs_del_item().
+ *
+ * called with qgroup_lock held
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret;
+
+       if (!root)
+               return -EINVAL;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       path->leave_spinning = 1;
+
+       /*
+        * Search for the largest possible key, so that the search ends just
+        * behind the last item of the tree and stepping one slot back yields
+        * an item to delete. Searching for the all-zero key instead always
+        * ends at slot 0 with no exact match, which made the loop stop before
+        * anything was deleted.
+        */
+       key.objectid = (u64)-1;
+       key.type = (u8)-1;
+       key.offset = (u64)-1;
+
+       while (1) {
+               ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+               if (ret < 0)
+                       goto out;       /* report the error, don't hide it */
+
+               if (ret > 0) {
+                       /* nothing in front of the slot: the tree is empty */
+                       if (path->slots[0] == 0)
+                               break;
+                       path->slots[0]--;
+               }
+
+               ret = btrfs_del_item(trans, root, path);
+               if (ret)
+                       goto out;
+               btrfs_release_path(path);
+       }
+       ret = 0;
+out:
+       root->fs_info->pending_quota_state = 0;
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Turn quota support on.
+ *
+ * If a quota root already exists, only pending_quota_state is set to 1.
+ * Otherwise the quota tree is created, a status item is inserted with the
+ * ON and INCONSISTENT flags, and the new root is published in @fs_info
+ * together with pending_quota_state = 1.
+ *
+ * Returns 0 on success, the error from btrfs_create_tree() or
+ * btrfs_insert_empty_item(), or -ENOMEM if no path could be allocated.
+ */
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_path *path = NULL;
+       struct btrfs_qgroup_status_item *ptr;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret = 0;
+
+       spin_lock(&fs_info->qgroup_lock);
+       if (fs_info->quota_root) {
+               fs_info->pending_quota_state = 1;
+               spin_unlock(&fs_info->qgroup_lock);
+               goto out;
+       }
+       spin_unlock(&fs_info->qgroup_lock);
+
+       /*
+        * initially create the quota tree
+        */
+       quota_root = btrfs_create_tree(trans, fs_info,
+                                      BTRFS_QUOTA_TREE_OBJECTID);
+       if (IS_ERR(quota_root)) {
+               ret =  PTR_ERR(quota_root);
+               goto out;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out_free_root;
+       }
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_STATUS_KEY;
+       key.offset = 0;
+
+       ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+                                     sizeof(*ptr));
+       if (ret)
+               goto out_free_root;
+
+       leaf = path->nodes[0];
+       ptr = btrfs_item_ptr(leaf, path->slots[0],
+                                struct btrfs_qgroup_status_item);
+       btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+       btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+       fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+                               BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+       btrfs_set_qgroup_status_scan(leaf, ptr, 0);
+
+       btrfs_mark_buffer_dirty(leaf);
+
+       spin_lock(&fs_info->qgroup_lock);
+       fs_info->quota_root = quota_root;
+       fs_info->pending_quota_state = 1;
+       spin_unlock(&fs_info->qgroup_lock);
+out_free_root:
+       /*
+        * On failure the root was never published, so release the in-memory
+        * root here instead of leaking it.
+        * NOTE(review): this mirrors the release at the end of
+        * btrfs_quota_disable() minus its list_del(); confirm the new root
+        * cannot be on a dirty list yet. The on-disk tree is not rolled back.
+        */
+       if (ret) {
+               free_extent_buffer(quota_root->node);
+               free_extent_buffer(quota_root->commit_root);
+               kfree(quota_root);
+       }
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Turn quota support off and tear the quota tree down.
+ *
+ * The in-memory state (enabled/pending flags, quota root pointer, qgroup
+ * config) is reset under qgroup_lock first. Afterwards the tree is emptied,
+ * its root item is deleted from the tree root and the root is released.
+ *
+ * Returns 0 on success, -EINVAL if there was no quota root, or the error
+ * from btrfs_clean_quota_tree() / btrfs_del_root().
+ */
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *tree_root = fs_info->tree_root;
+       struct btrfs_root *old_root;
+       int err;
+
+       /* detach all in-memory quota state while holding the lock */
+       spin_lock(&fs_info->qgroup_lock);
+       fs_info->quota_enabled = 0;
+       fs_info->pending_quota_state = 0;
+       old_root = fs_info->quota_root;
+       fs_info->quota_root = NULL;
+       btrfs_free_qgroup_config(fs_info);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       if (old_root == NULL)
+               return -EINVAL;
+
+       err = btrfs_clean_quota_tree(trans, old_root);
+       if (err)
+               return err;
+
+       err = btrfs_del_root(trans, tree_root, &old_root->root_key);
+       if (err)
+               return err;
+
+       list_del(&old_root->dirty_list);
+
+       /* drop the root node of the now empty tree */
+       btrfs_tree_lock(old_root->node);
+       clean_tree_block(trans, tree_root, old_root->node);
+       btrfs_tree_unlock(old_root->node);
+       btrfs_free_tree_block(trans, old_root, old_root->node, 0, 1);
+
+       /* finally release the in-memory root itself */
+       free_extent_buffer(old_root->node);
+       free_extent_buffer(old_root->commit_root);
+       kfree(old_root);
+
+       return 0;
+}
+
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
+{
+       /*
+        * FIXME: placeholder, not implemented yet. @fs_info is not used and
+        * 0 is returned unconditionally.
+        */
+       return 0;
+}
+
+/*
+ * Record a relation between the qgroups @src and @dst.
+ *
+ * A relation item is inserted for each direction, (src, dst) and
+ * (dst, src). If the second insertion fails, the first item is deleted
+ * again. The in-memory relation is then added under qgroup_lock.
+ *
+ * Returns 0 on success, -EINVAL if there is no quota root, or the error
+ * from the item insertion or from add_relation_rb().
+ */
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       int err;
+
+       if (quota_root == NULL)
+               return -EINVAL;
+
+       err = add_qgroup_relation_item(trans, quota_root, src, dst);
+       if (!err) {
+               err = add_qgroup_relation_item(trans, quota_root, dst, src);
+               /* undo the first direction if the second one failed */
+               if (err)
+                       del_qgroup_relation_item(trans, quota_root, src, dst);
+       }
+       if (err)
+               return err;
+
+       spin_lock(&fs_info->qgroup_lock);
+       err = add_relation_rb(quota_root->fs_info, src, dst);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return err;
+}
+
+/*
+ * Remove the relation between the qgroups @src and @dst.
+ *
+ * Both relation items, (src, dst) and (dst, src), are deleted, and the
+ * in-memory relation is removed under qgroup_lock even if an item deletion
+ * failed.
+ *
+ * Returns -EINVAL if there is no quota root. Otherwise returns the error of
+ * the first deletion if it failed, else the result of the second deletion.
+ */
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       int fwd_err;
+       int rev_err;
+
+       if (quota_root == NULL)
+               return -EINVAL;
+
+       fwd_err = del_qgroup_relation_item(trans, quota_root, src, dst);
+       rev_err = del_qgroup_relation_item(trans, quota_root, dst, src);
+
+       spin_lock(&fs_info->qgroup_lock);
+       del_relation_rb(fs_info, src, dst);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return fwd_err ? fwd_err : rev_err;
+}
+
+/*
+ * Create the qgroup @qgroupid: insert its item into the quota tree and add
+ * the in-memory qgroup under qgroup_lock.
+ *
+ * @name is currently unused.
+ *
+ * Returns 0 on success, -EINVAL if there is no quota root, the error from
+ * add_qgroup_item(), or the error encoded in the pointer returned by
+ * add_qgroup_rb().
+ */
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_qgroup *qgroup;
+       int ret = 0;
+
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               return -EINVAL;
+
+       /*
+        * Stop here if the item could not be created, so that no in-memory
+        * qgroup exists without a matching item in the quota tree.
+        */
+       ret = add_qgroup_item(trans, quota_root, qgroupid);
+       if (ret)
+               return ret;
+
+       spin_lock(&fs_info->qgroup_lock);
+       qgroup = add_qgroup_rb(fs_info, qgroupid);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       if (IS_ERR(qgroup))
+               ret = PTR_ERR(qgroup);
+
+       return ret;
+}
+
+/*
+ * Remove the qgroup @qgroupid: delete its item from the quota tree and drop
+ * the in-memory qgroup under qgroup_lock. The in-memory qgroup is dropped
+ * even if the item deletion failed.
+ *
+ * Returns -EINVAL if there is no quota root, otherwise the result of
+ * del_qgroup_item().
+ */
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       int err;
+
+       if (quota_root == NULL)
+               return -EINVAL;
+
+       err = del_qgroup_item(trans, quota_root, qgroupid);
+
+       spin_lock(&fs_info->qgroup_lock);
+       del_qgroup_rb(quota_root->fs_info, qgroupid);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return err;
+}
+
+/*
+ * Set the limits of qgroup @qgroupid to the values in @limit.
+ *
+ * The limit item in the quota tree is updated first. If that fails, the
+ * INCONSISTENT status flag is set and a message is logged, but the
+ * in-memory qgroup is still updated under qgroup_lock.
+ *
+ * Returns -EINVAL if there is no quota root, -ENOENT if the qgroup is not
+ * known in memory, otherwise the result of update_qgroup_limit_item().
+ */
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info, u64 qgroupid,
+                      struct btrfs_qgroup_limit *limit)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct btrfs_qgroup *qg;
+       int err;
+
+       if (quota_root == NULL)
+               return -EINVAL;
+
+       err = update_qgroup_limit_item(trans, quota_root, qgroupid,
+                                      limit->flags, limit->max_rfer,
+                                      limit->max_excl, limit->rsv_rfer,
+                                      limit->rsv_excl);
+       if (err) {
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               printk(KERN_INFO "unable to update quota limit for %llu\n",
+                      (unsigned long long)qgroupid);
+       }
+
+       spin_lock(&fs_info->qgroup_lock);
+       qg = find_qgroup_rb(fs_info, qgroupid);
+       if (qg) {
+               qg->lim_flags = limit->flags;
+               qg->max_rfer = limit->max_rfer;
+               qg->max_excl = limit->max_excl;
+               qg->rsv_rfer = limit->rsv_rfer;
+               qg->rsv_excl = limit->rsv_excl;
+       } else {
+               err = -ENOENT;
+       }
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return err;
+}
+
+/*
+ * Put @qgroup on fs_info->dirty_qgroups unless its dirty list entry is
+ * already linked somewhere.
+ */
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+                        struct btrfs_qgroup *qgroup)
+{
+       if (!list_empty(&qgroup->dirty))
+               return;
+
+       list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+/*
+ * btrfs_qgroup_record_ref is called whenever a ref is added or deleted. The
+ * modification is only queued here, on the qgroup_ref_list of the transaction
+ * handle; btrfs_end_transaction later hands the queued entries on to
+ * btrfs_qgroup_account_ref.
+ *
+ * Returns 0 on success or -ENOMEM if the list entry cannot be allocated.
+ */
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_delayed_ref_node *node,
+                           struct btrfs_delayed_extent_op *extent_op)
+{
+       struct qgroup_update *upd;
+
+       BUG_ON(!trans->delayed_ref_elem.seq);
+
+       upd = kmalloc(sizeof(*upd), GFP_NOFS);
+       if (upd == NULL)
+               return -ENOMEM;
+
+       upd->extent_op = extent_op;
+       upd->node = node;
+       list_add_tail(&upd->list, &trans->qgroup_ref_list);
+
+       return 0;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ *
+ * Returns 0 without accounting anything when quota is not enabled, when the
+ * ref does not belong to an fs tree, when the action is
+ * BTRFS_UPDATE_DELAYED_HEAD, when the quota root has gone away or when no
+ * qgroup exists for the ref's root. Returns the negative error from
+ * btrfs_find_all_roots, or -ENOMEM if the temporary ulist cannot be
+ * allocated.
+ *
+ * NOTE(review): the key "ins" is filled in but never read in this function,
+ * and the return values of the ulist_add calls are not checked.
+ */
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info,
+                            struct btrfs_delayed_ref_node *node,
+                            struct btrfs_delayed_extent_op *extent_op)
+{
+       struct btrfs_key ins;
+       struct btrfs_root *quota_root;
+       u64 ref_root;
+       struct btrfs_qgroup *qgroup;
+       struct ulist_node *unode;
+       struct ulist *roots = NULL;
+       struct ulist *tmp = NULL;
+       struct ulist_iterator uiter;
+       u64 seq;
+       int ret = 0;
+       int sgn;
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       BUG_ON(!fs_info->quota_root);
+
+       ins.objectid = node->bytenr;
+       ins.offset = node->num_bytes;
+       ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+       if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+           node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+               struct btrfs_delayed_tree_ref *ref;
+               ref = btrfs_delayed_node_to_tree_ref(node);
+               ref_root = ref->root;
+       } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+                  node->type == BTRFS_SHARED_DATA_REF_KEY) {
+               struct btrfs_delayed_data_ref *ref;
+               ref = btrfs_delayed_node_to_data_ref(node);
+               ref_root = ref->root;
+       } else {
+               BUG();
+       }
+
+       if (!is_fstree(ref_root)) {
+               /*
+                * non-fs-trees are not being accounted
+                */
+               return 0;
+       }
+
+       switch (node->action) {
+       case BTRFS_ADD_DELAYED_REF:
+       case BTRFS_ADD_DELAYED_EXTENT:
+               sgn = 1;
+               break;
+       case BTRFS_DROP_DELAYED_REF:
+               sgn = -1;
+               break;
+       case BTRFS_UPDATE_DELAYED_HEAD:
+               return 0;
+       default:
+               BUG();
+       }
+
+       /*
+        * the delayed ref sequence number we pass depends on the direction of
+        * the operation. for add operations, we pass (node->seq - 1) to skip
+        * the delayed ref's current sequence number, because we need the state
+        * of the tree before the add operation. for delete operations, we pass
+        * (node->seq) to include the delayed ref's current sequence number,
+        * because we need the state of the tree after the delete operation.
+        *
+        * in both cases "roots" therefore holds the roots that reference the
+        * extent apart from the ref being processed; the steps below call
+        * them the "old" refs.
+        */
+       ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
+                                  sgn > 0 ? node->seq - 1 : node->seq, &roots);
+       if (ret < 0)
+               goto out;
+
+       spin_lock(&fs_info->qgroup_lock);
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               goto unlock;
+
+       qgroup = find_qgroup_rb(fs_info, ref_root);
+       if (!qgroup)
+               goto unlock;
+
+       /*
+        * step 1: for each old ref, visit all nodes once and inc refcnt
+        *
+        * "seq" is the base value for this run; qgroup_seq is advanced far
+        * enough that the next run starts above any refcnt set here. a refcnt
+        * below seq is left over from an earlier run and is restarted at
+        * seq + 1. the walk covers the root's qgroup and every group reachable
+        * through the "groups" lists, so after this step (refcnt - seq) is the
+        * number of old roots a qgroup was reached from.
+        * allocations use GFP_ATOMIC because qgroup_lock is held.
+        */
+       tmp = ulist_alloc(GFP_ATOMIC);
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+       seq = fs_info->qgroup_seq;
+       fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(roots, &uiter))) {
+               struct ulist_node *tmp_unode;
+               struct ulist_iterator tmp_uiter;
+               struct btrfs_qgroup *qg;
+
+               qg = find_qgroup_rb(fs_info, unode->val);
+               if (!qg)
+                       continue;
+
+               ulist_reinit(tmp);
+                                               /* XXX id not needed */
+               ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+               ULIST_ITER_INIT(&tmp_uiter);
+               while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+                       struct btrfs_qgroup_list *glist;
+
+                       qg = (struct btrfs_qgroup *)tmp_unode->aux;
+                       if (qg->refcnt < seq)
+                               qg->refcnt = seq + 1;
+                       else
+                               ++qg->refcnt;
+
+                       list_for_each_entry(glist, &qg->groups, next_group) {
+                               ulist_add(tmp, glist->group->qgroupid,
+                                         (unsigned long)glist->group,
+                                         GFP_ATOMIC);
+                       }
+               }
+       }
+
+       /*
+        * step 2: walk from the new root
+        *
+        * starting at the qgroup of ref_root, every qgroup that step 1 did
+        * not reach has its referenced counters adjusted by sgn * num_bytes;
+        * if there are no old roots at all, the exclusive counters are
+        * adjusted as well. every qgroup visited here is tagged with seq so
+        * that step 3 can recognize it.
+        */
+       ulist_reinit(tmp);
+       ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(tmp, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct btrfs_qgroup_list *glist;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+               if (qg->refcnt < seq) {
+                       /* not visited by step 1 */
+                       qg->rfer += sgn * node->num_bytes;
+                       qg->rfer_cmpr += sgn * node->num_bytes;
+                       if (roots->nnodes == 0) {
+                               qg->excl += sgn * node->num_bytes;
+                               qg->excl_cmpr += sgn * node->num_bytes;
+                       }
+                       qgroup_dirty(fs_info, qg);
+               }
+               WARN_ON(qg->tag >= seq);
+               qg->tag = seq;
+
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       ulist_add(tmp, glist->group->qgroupid,
+                                 (unsigned long)glist->group, GFP_ATOMIC);
+               }
+       }
+
+       /*
+        * step 3: walk again from old refs
+        *
+        * qgroups tagged in step 2 are skipped, and the walk does not continue
+        * to their parent groups from there. for the remaining ones, a refcnt
+        * of seq + nnodes means the qgroup was reached from every old root in
+        * step 1; its exclusive counters are adjusted by -sgn * num_bytes.
+        */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(roots, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct ulist_node *tmp_unode;
+               struct ulist_iterator tmp_uiter;
+
+               qg = find_qgroup_rb(fs_info, unode->val);
+               if (!qg)
+                       continue;
+
+               ulist_reinit(tmp);
+               ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+               ULIST_ITER_INIT(&tmp_uiter);
+               while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+                       struct btrfs_qgroup_list *glist;
+
+                       qg = (struct btrfs_qgroup *)tmp_unode->aux;
+                       if (qg->tag == seq)
+                               continue;
+
+                       if (qg->refcnt - seq == roots->nnodes) {
+                               qg->excl -= sgn * node->num_bytes;
+                               qg->excl_cmpr -= sgn * node->num_bytes;
+                               qgroup_dirty(fs_info, qg);
+                       }
+
+                       list_for_each_entry(glist, &qg->groups, next_group) {
+                               ulist_add(tmp, glist->group->qgroupid,
+                                         (unsigned long)glist->group,
+                                         GFP_ATOMIC);
+                       }
+               }
+       }
+       ret = 0;
+unlock:
+       spin_unlock(&fs_info->qgroup_lock);
+out:
+       ulist_free(roots);
+       ulist_free(tmp);
+
+       return ret;
+}
+
+/*
+ * Called from commit_transaction. Makes the pending quota state the current
+ * one, writes the info item of every dirty qgroup to disk and finally
+ * updates the status item.
+ *
+ * A failure to write a qgroup info item only sets the INCONSISTENT status
+ * flag. The return value is 0 if there is no quota root, otherwise the
+ * result of update_qgroup_status_item().
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+                     struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct list_head *dirty = &fs_info->dirty_qgroups;
+       int ret;
+
+       if (quota_root == NULL)
+               return 0;
+
+       fs_info->quota_enabled = fs_info->pending_quota_state;
+
+       spin_lock(&fs_info->qgroup_lock);
+       while (!list_empty(dirty)) {
+               struct btrfs_qgroup *qg;
+               int err;
+
+               qg = list_first_entry(dirty, struct btrfs_qgroup, dirty);
+               list_del_init(&qg->dirty);
+
+               /* the item update runs without the spinlock held */
+               spin_unlock(&fs_info->qgroup_lock);
+               err = update_qgroup_info_item(trans, quota_root, qg);
+               if (err)
+                       fs_info->qgroup_flags |=
+                                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               spin_lock(&fs_info->qgroup_lock);
+       }
+
+       /* mirror the enabled state into the status flags */
+       if (!fs_info->quota_enabled)
+               fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+       else
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+       spin_unlock(&fs_info->qgroup_lock);
+
+       ret = update_qgroup_status_item(trans, fs_info, quota_root);
+       if (ret)
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+
+       return ret;
+}
+
+/*
+ * copy the accounting information between qgroups. This is necessary when a
+ * snapshot or a subvolume is created
+ *
+ * @srcid:    id of the source subvolume, or 0 if there is none
+ * @objectid: id of the new subvolume; a qgroup with this id is created
+ * @inherit:  optional. The struct is followed in memory by an array of u64:
+ *            first num_qgroups qgroup ids the new qgroup is related to, then
+ *            num_ref_copies (src, dst) pairs and num_excl_copies (src, dst)
+ *            pairs whose counters are copied from src to dst.
+ *
+ * Returns 0 if quota is not enabled, -EINVAL if there is no quota root or a
+ * qgroup named in a copy pair does not exist, or the error from one of the
+ * helpers called.
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+                        struct btrfs_qgroup_inherit *inherit)
+{
+       int ret = 0;
+       int i;
+       u64 *i_qgroups;
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct btrfs_qgroup *srcgroup;
+       struct btrfs_qgroup *dstgroup;
+       u32 level_size = 0;
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       if (!quota_root)
+               return -EINVAL;
+
+       /*
+        * create a tracking group for the subvol itself
+        */
+       ret = add_qgroup_item(trans, quota_root, objectid);
+       if (ret)
+               goto out;
+
+       if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+               ret = update_qgroup_limit_item(trans, quota_root, objectid,
+                                              inherit->lim.flags,
+                                              inherit->lim.max_rfer,
+                                              inherit->lim.max_excl,
+                                              inherit->lim.rsv_rfer,
+                                              inherit->lim.rsv_excl);
+               if (ret)
+                       goto out;
+       }
+
+       if (srcid) {
+               struct btrfs_root *srcroot;
+               struct btrfs_key srckey;
+               int srcroot_level;
+
+               srckey.objectid = srcid;
+               srckey.type = BTRFS_ROOT_ITEM_KEY;
+               srckey.offset = (u64)-1;
+               srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
+               if (IS_ERR(srcroot)) {
+                       ret = PTR_ERR(srcroot);
+                       goto out;
+               }
+
+               /* size of the source root's node, used in the copies below */
+               rcu_read_lock();
+               srcroot_level = btrfs_header_level(srcroot->node);
+               level_size = btrfs_level_size(srcroot, srcroot_level);
+               rcu_read_unlock();
+       }
+
+       /*
+        * add qgroup to all inherited groups
+        */
+       if (inherit) {
+               i_qgroups = (u64 *)(inherit + 1);
+               for (i = 0; i < inherit->num_qgroups; ++i) {
+                       ret = add_qgroup_relation_item(trans, quota_root,
+                                                      objectid, *i_qgroups);
+                       if (ret)
+                               goto out;
+                       ret = add_qgroup_relation_item(trans, quota_root,
+                                                      *i_qgroups, objectid);
+                       if (ret)
+                               goto out;
+                       ++i_qgroups;
+               }
+       }
+
+
+       spin_lock(&fs_info->qgroup_lock);
+
+       /*
+        * add_qgroup_rb reports failure through an error pointer (see the
+        * IS_ERR check in btrfs_create_qgroup), so a NULL test would let the
+        * error pointer be dereferenced below.
+        */
+       dstgroup = add_qgroup_rb(fs_info, objectid);
+       if (IS_ERR(dstgroup)) {
+               ret = PTR_ERR(dstgroup);
+               goto unlock;
+       }
+
+       if (srcid) {
+               srcgroup = find_qgroup_rb(fs_info, srcid);
+               if (!srcgroup)
+                       goto unlock;
+               dstgroup->rfer = srcgroup->rfer - level_size;
+               dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+               srcgroup->excl = level_size;
+               srcgroup->excl_cmpr = level_size;
+               qgroup_dirty(fs_info, dstgroup);
+               qgroup_dirty(fs_info, srcgroup);
+       }
+
+       if (!inherit)
+               goto unlock;
+
+       /* in-memory counterpart of the relation items added above */
+       i_qgroups = (u64 *)(inherit + 1);
+       for (i = 0; i < inherit->num_qgroups; ++i) {
+               ret = add_relation_rb(quota_root->fs_info, objectid,
+                                     *i_qgroups);
+               if (ret)
+                       goto unlock;
+               ++i_qgroups;
+       }
+
+       /* i_qgroups now points at the (src, dst) pairs for rfer copies */
+       for (i = 0; i <  inherit->num_ref_copies; ++i) {
+               struct btrfs_qgroup *src;
+               struct btrfs_qgroup *dst;
+
+               src = find_qgroup_rb(fs_info, i_qgroups[0]);
+               dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+               if (!src || !dst) {
+                       ret = -EINVAL;
+                       goto unlock;
+               }
+
+               dst->rfer = src->rfer - level_size;
+               dst->rfer_cmpr = src->rfer_cmpr - level_size;
+               i_qgroups += 2;
+       }
+       /* followed by the (src, dst) pairs for excl copies */
+       for (i = 0; i <  inherit->num_excl_copies; ++i) {
+               struct btrfs_qgroup *src;
+               struct btrfs_qgroup *dst;
+
+               src = find_qgroup_rb(fs_info, i_qgroups[0]);
+               dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+               if (!src || !dst) {
+                       ret = -EINVAL;
+                       goto unlock;
+               }
+
+               dst->excl = src->excl + level_size;
+               dst->excl_cmpr = src->excl_cmpr + level_size;
+               i_qgroups += 2;
+       }
+
+unlock:
+       spin_unlock(&fs_info->qgroup_lock);
+out:
+       return ret;
+}
+
+/*
+ * reserve some space for a qgroup and all its parents. The reservation takes
+ * place with start_transaction or delalloc_reserve, similar to ENOSPC
+ * accounting. If not enough space is available, EDQUOT is returned.
+ * We assume that the requested space is new for all qgroups.
+ *
+ * Returns 0 if nothing has to be reserved (@root is not an fs tree,
+ * @num_bytes is 0, there is no quota root or no qgroup for @root) or if the
+ * reservation was recorded, -EDQUOT if a limit would be exceeded, and
+ * -ENOMEM if the temporary ulist cannot be allocated.
+ */
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_qgroup *qgroup;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u64 ref_root = root->root_key.objectid;
+       int ret = 0;
+       struct ulist *ulist = NULL;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+
+       if (!is_fstree(ref_root))
+               return 0;
+
+       if (num_bytes == 0)
+               return 0;
+
+       spin_lock(&fs_info->qgroup_lock);
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               goto out;
+
+       qgroup = find_qgroup_rb(fs_info, ref_root);
+       if (!qgroup)
+               goto out;
+
+       /*
+        * in a first step, we check all affected qgroups if any limits would
+        * be exceeded
+        */
+       ulist = ulist_alloc(GFP_ATOMIC);
+       if (!ulist) {
+               /* an atomic allocation may fail; do not walk a NULL list */
+               ret = -ENOMEM;
+               goto out;
+       }
+       ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(ulist, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct btrfs_qgroup_list *glist;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+
+               if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+                   qg->reserved + qg->rfer + num_bytes >
+                   qg->max_rfer)
+                       ret = -EDQUOT;
+
+               if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+                   qg->reserved + qg->excl + num_bytes >
+                   qg->max_excl)
+                       ret = -EDQUOT;
+
+               /* queue the groups this qgroup belongs to as well */
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       ulist_add(ulist, glist->group->qgroupid,
+                                 (unsigned long)glist->group, GFP_ATOMIC);
+               }
+       }
+       if (ret)
+               goto out;
+
+       /*
+        * no limits exceeded, now record the reservation into all qgroups
+        */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(ulist, &uiter))) {
+               struct btrfs_qgroup *qg;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+
+               qg->reserved += num_bytes;
+       }
+
+out:
+       spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(ulist);
+
+       return ret;
+}
+
+/*
+ * Give back @num_bytes of reserved space: the amount is subtracted from the
+ * "reserved" counter of the qgroup of @root and of every group reachable
+ * from it through the "groups" lists.
+ *
+ * Nothing is done if @root is not an fs tree, @num_bytes is 0, there is no
+ * quota root or no qgroup exists for @root. If the temporary ulist cannot be
+ * allocated, an error is logged and the counters are left unchanged.
+ */
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_qgroup *qgroup;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct ulist *ulist = NULL;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       u64 ref_root = root->root_key.objectid;
+
+       if (!is_fstree(ref_root))
+               return;
+
+       if (num_bytes == 0)
+               return;
+
+       spin_lock(&fs_info->qgroup_lock);
+
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               goto out;
+
+       qgroup = find_qgroup_rb(fs_info, ref_root);
+       if (!qgroup)
+               goto out;
+
+       ulist = ulist_alloc(GFP_ATOMIC);
+       if (!ulist) {
+               /* an atomic allocation may fail; do not walk a NULL list */
+               printk(KERN_ERR "btrfs: unable to release qgroup reservation, out of memory\n");
+               goto out;
+       }
+       ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(ulist, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct btrfs_qgroup_list *glist;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+
+               qg->reserved -= num_bytes;
+
+               /* queue the groups this qgroup belongs to as well */
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       ulist_add(ulist, glist->group->qgroupid,
+                                 (unsigned long)glist->group, GFP_ATOMIC);
+               }
+       }
+
+out:
+       spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(ulist);
+}
+
+/*
+ * Check that @trans has no qgroup work left: its qgroup_ref_list must be
+ * empty and its delayed_ref_elem.seq must be 0. Otherwise the state is
+ * logged and BUG() is raised.
+ */
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
+{
+       if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
+               return;
+       /* cast for %llu, as done for the qgroup id in btrfs_limit_qgroup */
+       printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
+               trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
+               (unsigned long long)trans->delayed_ref_elem.seq);
+       BUG();
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index 23cbda0685b800d14a2deb9fc1ea24da9d14d937..0d6c8816845a2bd7c19bffcbeedd2be6151e670a 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -351,6 +351,8 @@ again:
         h->block_rsv = NULL;
         h->orig_rsv = NULL;
         h->aborted = 0;
+       h->delayed_ref_elem.seq = 0;
+       INIT_LIST_HEAD(&h->qgroup_ref_list);
  
         smp_mb();
         if (cur_trans->blocked && may_wait_transaction(root, type)) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h

index 010729446e130437bc27a94fa6618a4529cb52d7..16ba00842c3852c7a1a9c3fb4f88270cb4d7c5fa 100644 (file)
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -20,6 +20,7 @@
  #define __BTRFS_TRANSACTION__
  #include "btrfs_inode.h"
  #include "delayed-ref.h"
+#include "ctree.h"
  
  struct btrfs_transaction {
         u64 transid;
@@ -63,6 +64,8 @@ struct btrfs_trans_handle {
          * Subvolume quota depends on this
          */
         struct btrfs_root *root;
+       struct seq_list delayed_ref_elem;
+       struct list_head qgroup_ref_list;
  };
  
  struct btrfs_pending_snapshot {
author	Arne Jansen <sensille@gmx.net>
	Thu, 28 Jun 2012 16:03:02 +0000 (18:03 +0200)
committer	Jan Schmidt <list.btrfs@jan-o-sch.net>
	Thu, 12 Jul 2012 08:54:21 +0000 (10:54 +0200)
fs/btrfs/Makefile		patch \| blob \| history
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/extent-tree.c		patch \| blob \| history
fs/btrfs/ioctl.h		patch \| blob \| history
fs/btrfs/qgroup.c	[new file with mode: 0644]	patch \| blob
fs/btrfs/transaction.c		patch \| blob \| history
fs/btrfs/transaction.h		patch \| blob \| history