};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
+static inline void *ptr_to_indirect(void *ptr)
+{
+ return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+}
+
+static inline void *indirect_to_ptr(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+}
+
static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
{
return root->gfp_mask & __GFP_BITS_MASK;
{
struct radix_tree_node *node =
container_of(head, struct radix_tree_node, rcu_head);
+ int i;
/*
* must only free zeroed nodes into the slab. radix_tree_shrink
* can leave us with a non-NULL entry in the first slot, so clear
* that here to make sure.
*/
- tag_clear(node, 0, 0);
- tag_clear(node, 1, 0);
+ for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
+ tag_clear(node, i, 0);
+
node->slots[0] = NULL;
node->count = 0;
return -ENOMEM;
/* Increase the height. */
- node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);
+ node->slots[0] = indirect_to_ptr(root->rnode);
/* Propagate the aggregated tag info into the new root */
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
newheight = root->height+1;
node->height = newheight;
node->count = 1;
- node = radix_tree_ptr_to_indirect(node);
+ node = ptr_to_indirect(node);
rcu_assign_pointer(root->rnode, node);
root->height = newheight;
} while (height > root->height);
return error;
}
- slot = radix_tree_indirect_to_ptr(root->rnode);
+ slot = indirect_to_ptr(root->rnode);
height = root->height;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
rcu_assign_pointer(node->slots[offset], slot);
node->count++;
} else
- rcu_assign_pointer(root->rnode,
- radix_tree_ptr_to_indirect(slot));
+ rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
}
/* Go a level down */
return NULL;
return is_slot ? (void *)&root->rnode : node;
}
- node = radix_tree_indirect_to_ptr(node);
+ node = indirect_to_ptr(node);
height = node->height;
if (index > radix_tree_maxindex(height))
height--;
} while (height > 0);
- return is_slot ? (void *)slot:node;
+ return is_slot ? (void *)slot : indirect_to_ptr(node);
}
/**
height = root->height;
BUG_ON(index > radix_tree_maxindex(height));
- slot = radix_tree_indirect_to_ptr(root->rnode);
+ slot = indirect_to_ptr(root->rnode);
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
while (height > 0) {
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
pathp->node = NULL;
- slot = radix_tree_indirect_to_ptr(root->rnode);
+ slot = indirect_to_ptr(root->rnode);
while (height > 0) {
int offset;
if (!radix_tree_is_indirect_ptr(node))
return (index == 0);
- node = radix_tree_indirect_to_ptr(node);
+ node = indirect_to_ptr(node);
height = node->height;
if (index > radix_tree_maxindex(height))
* also settag. The function stops either after tagging nr_to_tag items or
* after reaching last_index.
*
+ * The tags must be set from the leaf level only and propagated back up the
+ * path to the root. We must do this so that we resolve the full path before
+ * setting any tags on intermediate nodes. If we set tags as we descend, then
+ * we can get to the leaf node and find that the index that has the iftag
+ * set is outside the range we are scanning. This reults in dangling tags and
+ * can lead to problems with later tag operations (e.g. livelocks on lookups).
+ *
* The function returns number of leaves where the tag was set and sets
* *first_indexp to the first unscanned index.
* WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
unsigned long nr_to_tag,
unsigned int iftag, unsigned int settag)
{
- unsigned int height = root->height, shift;
- unsigned long tagged = 0, index = *first_indexp;
- struct radix_tree_node *open_slots[height], *slot;
+ unsigned int height = root->height;
+ struct radix_tree_path path[height];
+ struct radix_tree_path *pathp = path;
+ struct radix_tree_node *slot;
+ unsigned int shift;
+ unsigned long tagged = 0;
+ unsigned long index = *first_indexp;
last_index = min(last_index, radix_tree_maxindex(height));
if (index > last_index)
}
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
- slot = radix_tree_indirect_to_ptr(root->rnode);
+ slot = indirect_to_ptr(root->rnode);
+
+ /*
+ * we fill the path from (root->height - 2) to 0, leaving the index at
+ * (root->height - 1) as a terminator. Zero the node in the terminator
+ * so that we can use this to end walk loops back up the path.
+ */
+ path[height - 1].node = NULL;
for (;;) {
int offset;
goto next;
if (!tag_get(slot, iftag, offset))
goto next;
+ if (height > 1) {
+ /* Go down one level */
+ height--;
+ shift -= RADIX_TREE_MAP_SHIFT;
+ path[height - 1].node = slot;
+ path[height - 1].offset = offset;
+ slot = slot->slots[offset];
+ continue;
+ }
+
+ /* tag the leaf */
+ tagged++;
tag_set(slot, settag, offset);
- if (height == 1) {
- tagged++;
- goto next;
+
+ /* walk back up the path tagging interior nodes */
+ pathp = &path[0];
+ while (pathp->node) {
+ /* stop if we find a node with the tag already set */
+ if (tag_get(pathp->node, settag, pathp->offset))
+ break;
+ tag_set(pathp->node, settag, pathp->offset);
+ pathp++;
}
- /* Go down one level */
- height--;
- shift -= RADIX_TREE_MAP_SHIFT;
- open_slots[height] = slot;
- slot = slot->slots[offset];
- continue;
+
next:
/* Go to next item at level determined by 'shift' */
index = ((index >> shift) + 1) << shift;
* last_index is guaranteed to be in the tree, what
* we do below cannot wander astray.
*/
- slot = open_slots[height];
+ slot = path[height - 1].node;
height++;
shift += RADIX_TREE_MAP_SHIFT;
}
results[0] = node;
return 1;
}
- node = radix_tree_indirect_to_ptr(node);
+ node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height);
slot = *(((void ***)results)[ret + i]);
if (!slot)
continue;
- results[ret + nr_found] = rcu_dereference_raw(slot);
+ results[ret + nr_found] =
+ indirect_to_ptr(rcu_dereference_raw(slot));
nr_found++;
}
ret += nr_found;
results[0] = (void **)&root->rnode;
return 1;
}
- node = radix_tree_indirect_to_ptr(node);
+ node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height);
results[0] = node;
return 1;
}
- node = radix_tree_indirect_to_ptr(node);
+ node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height);
slot = *(((void ***)results)[ret + i]);
if (!slot)
continue;
- results[ret + nr_found] = rcu_dereference_raw(slot);
+ results[ret + nr_found] =
+ indirect_to_ptr(rcu_dereference_raw(slot));
nr_found++;
}
ret += nr_found;
results[0] = (void **)&root->rnode;
return 1;
}
- node = radix_tree_indirect_to_ptr(node);
+ node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height);
void *newptr;
BUG_ON(!radix_tree_is_indirect_ptr(to_free));
- to_free = radix_tree_indirect_to_ptr(to_free);
+ to_free = indirect_to_ptr(to_free);
/*
* The candidate node has more than one child, or its child
/*
* We don't need rcu_assign_pointer(), since we are simply
- * moving the node from one part of the tree to another. If
- * it was safe to dereference the old pointer to it
+ * moving the node from one part of the tree to another: if it
+ * was safe to dereference the old pointer to it
* (to_free->slots[0]), it will be safe to dereference the new
- * one (root->rnode).
+ * one (root->rnode) as far as dependent read barriers go.
*/
newptr = to_free->slots[0];
if (root->height > 1)
- newptr = radix_tree_ptr_to_indirect(newptr);
+ newptr = ptr_to_indirect(newptr);
root->rnode = newptr;
root->height--;
+
+ /*
+ * We have a dilemma here. The node's slot[0] must not be
+ * NULLed in case there are concurrent lookups expecting to
+ * find the item. However if this was a bottom-level node,
+ * then it may be subject to the slot pointer being visible
+ * to callers dereferencing it. If item corresponding to
+ * slot[0] is subsequently deleted, these callers would expect
+ * their slot to become empty sooner or later.
+ *
+ * For example, lockless pagecache will look up a slot, deref
+ * the page pointer, and if the page is 0 refcount it means it
+ * was concurrently deleted from pagecache so try the deref
+ * again. Fortunately there is already a requirement for logic
+ * to retry the entire slot lookup -- the indirect pointer
+ * problem (replacing direct root node with an indirect pointer
+ * also results in a stale slot). So tag the slot as indirect
+ * to force callers to retry.
+ */
+ if (root->height == 0)
+ *((unsigned long *)&to_free->slots[0]) |=
+ RADIX_TREE_INDIRECT_PTR;
+
radix_tree_node_free(to_free);
}
}
root->rnode = NULL;
goto out;
}
- slot = radix_tree_indirect_to_ptr(slot);
+ slot = indirect_to_ptr(slot);
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
pathp->node = NULL;
radix_tree_node_free(to_free);
if (pathp->node->count) {
- if (pathp->node ==
- radix_tree_indirect_to_ptr(root->rnode))
+ if (pathp->node == indirect_to_ptr(root->rnode))
radix_tree_shrink(root);
goto out;
}