Let's look at another way to implement ordered sets. Here is an ordered set signature that is designed to support implementation of both set and map abstractions.
signature ORDERED_FUNCTIONAL_SET =
sig
  (* Overview: a "set" is a set of distinct elements of type
   * "elem". Each element is identified by a unique key, which
   * may be the same as the element itself. Two elements are
   * considered distinct if they have different keys.
   * Keys are a totally ordered set.
   *
   * A set can be used to represent an ordinary set if key = elem.
   * It can be used to represent a mapping if elem = key * value.
   *
   * For example, if key and elem are int, then a set might be
   * {1,-11,0}, {}, or {1001}. If key is string and elem is int * string... 
   * more precisely, if key is string and elem is string * int,
   * a set could be {("elephant", 2), ("rhino", 25), ("zebra", 2)}
   *)
  type key
  type elem
  type set

  (* compare(k1,k2) reports the ordering of k1 and k2. *)
  val compare: key * key -> order

  (* keyOf(e) is the key of e. *)
  val keyOf: elem -> key

  (* empty() is the empty set. *)
  val empty : unit -> set

  (* add(s,e) is (s',b) where s' is s union {e} and b is true if
   * s already contained an element with the same key as e,
   * false otherwise. *)
  val add: set * elem -> set * bool

  (* remove(s,k) is (s',eo) where s' = s - {k} (set difference)
   * and eo is either SOME e if there is an e in s
   * where k is e's key, or NONE otherwise.
   * The result type uses "elem option" so that the NONE case
   * described here can actually be expressed. *)
  val remove: set * key -> set * elem option

  (* lookup(s,k) is SOME e where k = keyOf(e), or NONE if
   * the set contains no such e. *)
  val lookup: set * key -> elem option

  (* size(s) is the number of elements in s. *)
  val size: set -> int

  (* Ordered set operations *)

  (* first(s) is SOME of the element of s with the smallest key,
   * or NONE if s is empty. *)
  val first: set -> elem option

  (* last(s) is SOME of the element of s with the largest key,
   * or NONE if s is empty. Like first, it returns an option so
   * that the empty set has a well-defined result. *)
  val last: set -> elem option

  (* A fold operation on ordered sets takes a key argument
   * that defines the element where the fold starts. *)
  type 'b folder = ((elem*'b)->'b) -> 'b -> key -> set -> 'b

  (* fold over the elements in key order. *)
  val fold_forward: 'b folder

  (* fold over the elements in reverse key order. *)
  val fold_backward: 'b folder
end
We've added some operations to show the added power of ordered sets. The
first function gives the first element in the set, and fold_forward iterates
over the elements of the set in ascending order. We can similarly implement last
and fold_backward from the set signature.
We have already seen red-black trees, which are one good way to implement ordered sets. Here is an implementation of red-black trees that implements this set signature as a functor:
(* Parameters needed to build an ordered set: the key and element
 * types, how to extract an element's key, and how to order keys. *)
signature ORDERED_SET_PARAMS =
sig
  type key
  type elem
  val keyOf: elem -> key
  val compare: key * key -> order
end

functor RedBlackTree(structure Params : ORDERED_SET_PARAMS) =
struct
  type key = Params.key
  type elem = Params.elem
  val compare = Params.compare
  val keyOf = Params.keyOf

  datatype color = Red | Black
  datatype tree =
      Empty
    | Node of {color: color, value: elem, left: tree, right: tree}
  (* Representation invariant:
   * 0. All values in the left subtree are less than "value", and
   *    all values in the right subtree are greater than "value".
   * 1. No red node has a red parent.
   * 2. Every path from the root to an empty node has the
   *    same number of black nodes (the "black height").
   *)

  (* lookup(t,k) is SOME e for the element e of t whose key is k,
   * or NONE if t has no such element. *)
  fun lookup (Empty, _) = NONE
    | lookup (Node {value, left, right, ...}, k) =
        (case compare (k, keyOf value) of
           LESS => lookup (left, k)
         | GREATER => lookup (right, k)
         | EQUAL => SOME value)

  (* add(t,e) is (t',b) where t' contains the elements of t plus e
   * (e replaces any element with the same key) and b is true
   * if t already contained an element with e's key. *)
  fun add (t: tree, e: elem): tree * bool =
    let
      (* Definition: a tree t satisfies the "reconstruction invariant"
       * if it is black and satisfies the rep invariant, or if it is
       * red and its children satisfy the rep invariant. *)

      (* makeBlack(t) is a tree that satisfies the rep invariant.
       * Requires: t satisfies the reconstruction invariant
       * Algorithm: copy t, forcing the root color to Black. *)
      fun makeBlack Empty = Empty
        | makeBlack (Node {value, left, right, ...}) =
            Node {color = Black, value = value, left = left, right = right}

      (* rotate(x,y,z,a,b,c,d) is the result of a red-black rotation:
       * a red node y whose black children are x (over a, b) and
       * z (over c, d). *)
      fun rotate (x, y, z, a, b, c, d) =
        let
          val lo = Node {color = Black, value = x, left = a, right = b}
          val hi = Node {color = Black, value = z, left = c, right = d}
        in
          Node {color = Red, value = y, left = lo, right = hi}
        end

      (* balance(t) is a tree that satisfies the reconstruction
       * invariant and contains all the same values as t.
       * Requires: the children of t satisfy the reconstruction
       * invariant.
       * Each of the first four clauses matches a black node with a
       * red child that itself has a red child; anything else is
       * returned unchanged. *)
      fun balance (* 1: left-left *)
            (Node {color = Black, value = z,
                   left = Node {color = Red, value = y,
                                left = Node {color = Red, value = x,
                                             left = a, right = b},
                                right = c},
                   right = d}) =
            rotate (x, y, z, a, b, c, d)
        | balance (* 2: left-right *)
            (Node {color = Black, value = z,
                   left = Node {color = Red, value = x,
                                left = a,
                                right = Node {color = Red, value = y,
                                              left = b, right = c}},
                   right = d}) =
            rotate (x, y, z, a, b, c, d)
        | balance (* 3: right-left *)
            (Node {color = Black, value = x,
                   left = a,
                   right = Node {color = Red, value = z,
                                 left = Node {color = Red, value = y,
                                              left = b, right = c},
                                 right = d}}) =
            rotate (x, y, z, a, b, c, d)
        | balance (* 4: right-right *)
            (Node {color = Black, value = x,
                   left = a,
                   right = Node {color = Red, value = y,
                                 left = b,
                                 right = Node {color = Red, value = z,
                                               left = c, right = d}}}) =
            rotate (x, y, z, a, b, c, d)
        | balance other = other

      (* Insert e into t, returning (t',b) where t' is a tree that
       * contains all the elements of t, plus e, and satisfies the
       * reconstruction invariant. b is true if t contains e already. *)
      fun walk Empty =
            (Node {color = Red, value = e, left = Empty, right = Empty}, false)
        | walk (Node {color, value, left, right}) =
            (case compare (keyOf value, keyOf e) of
               EQUAL =>
                 (Node {color = color, value = e, left = left, right = right},
                  true)
             | GREATER =>
                 (* this node's key is larger than e's: e goes to the left *)
                 let
                   val (left', seen) = walk left
                   val rebuilt =
                     Node {color = color, value = value,
                           left = left', right = right}
                 in
                   (balance rebuilt, seen)
                 end
             | LESS =>
                 (* this node's key is smaller than e's: e goes to the right *)
                 let
                   val (right', seen) = walk right
                   val rebuilt =
                     Node {color = color, value = value,
                           left = left, right = right'}
                 in
                   (balance rebuilt, seen)
                 end)

      val (root, seen) = walk t
    in
      (makeBlack root, seen)
    end

  (* first(t) is SOME of the element with the smallest key
   * (the leftmost node), or NONE if t is empty. *)
  fun first Empty = NONE
    | first (Node {value, left, ...}) =
        (case first left of
           NONE => SOME value
         | found => found)

  (* fold_forward f b k t folds f over the elements of t whose keys
   * are at least k, in ascending key order, starting from b. *)
  fun fold_forward (f: elem * 'b -> 'b) (acc: 'b) (k: key) (t: tree) =
    case t of
      Empty => acc
    | Node {value, left, right, ...} =>
        (case compare (keyOf value, k) of
           (* value and its whole left subtree lie before k: skip them *)
           LESS => fold_forward f acc k right
         | EQUAL => fold_forward f (f (value, acc)) k right
         | GREATER =>
             fold_forward f (f (value, fold_forward f acc k left)) k right)
end
Here is how the red-black tree data structure can be packaged up as a set implementation. This implementation represents a set as a red-black tree plus an integer that keeps track of the total number of elements in the set. Otherwise there is no efficient way to implement the size operation.
(* RedBlackSet packages a red-black tree together with an element
 * count so that size can be answered without walking the tree. *)
functor RedBlackSet(structure Params: ORDERED_SET_PARAMS)
  :> ORDERED_FUNCTIONAL_SET where type key = Params.key
                              and type elem = Params.elem =
struct
  type key = Params.key
  type elem = Params.elem
  val compare = Params.compare
  val keyOf = Params.keyOf

  structure RBTree = RedBlackTree(structure Params = Params)

  (* A set is the tree of elements paired with the number of elements. *)
  type set = RBTree.tree * int

  fun empty () = (RBTree.Empty, 0)

  (* The count grows only when the tree reports that the key was new. *)
  fun add ((tree, count), e) =
    let
      val (tree', present) = RBTree.add (tree, e)
      val count' = if present then count else count + 1
    in
      ((tree', count'), present)
    end

  fun remove (_, _) = raise Fail "Not implemented: remove"

  fun lookup ((tree, _), k) = RBTree.lookup (tree, k)

  fun size (_, count) = count

  exception Empty

  fun first (tree, _) = RBTree.first tree

  fun last (_, _) = raise Fail "Not implemented: last"

  type 'b folder = ((elem*'b)->'b) -> 'b -> key -> set -> 'b

  (* Delegates to the tree's fold; the element count is not needed. *)
  fun fold_forward f b k (tree, _) = RBTree.fold_forward f b k tree

  fun fold_backward _ _ _ _ = raise Fail("Not implemented: fold")
end
Red-black trees are nice because they guarantee O(lg n) insert, lookup, and deletion time, with good constant factors. However, if we are willing to accept probabilistic assurances of performance, there are other, simpler options for implementing ordered sets.
There are two well-known data structures for implementing ordered sets that use randomness to achieve good expected performance: skip lists and treaps. Treaps are simpler and probably faster, so we'll choose them.
The idea behind treaps is to fix binary search trees. Binary search trees are great as long as they are balanced. But if the elements of the tree are inserted in an ordered way, the tree can turn into a linked list (or at least become extremely unbalanced), leading to O(n) performance. On the other hand, if a set of elements is inserted in a random order, the expected distance in the tree to a randomly chosen element is O(lg n). To see why, imagine walking down the tree from the root to a leaf. At any given point on the walk, there is a subtree of (say) n elements rooted at the current element. Suppose that we construct a sequence of all of the n elements in this subtree in key order. Because the elements were inserted in random order, the current element is randomly positioned at some position p within the ordered sequence, where p goes from 1 to n. If we are looking for a randomly chosen element, then there is a 1/n probability that the current element is the one of interest. The left subtree contains p-1 elements, so there is a (p-1)/n probability that the element of interest is in the left subtree. Correspondingly, the right subtree contains (n-p) elements, and there is an (n-p)/n probability that the element is there. The expected size of the subtree that is visited after one step of the walk, assuming position p, is therefore (p-1)·(p-1)/n + (n-p)·(n-p)/n. All values of p from 1 to n are equally likely, so the expected size of the next subtree is the sum of this expression for all p from 1 to n, divided by n:
$$\frac{1}{n}\sum_{p=1}^{n}\frac{(p-1)^2+(n-p)^2}{n} \;=\; \frac{2}{n^2}\sum_{j=0}^{n-1} j^2 \;=\; \frac{(n-1)(2n-1)}{3n} \;\approx\; \frac{2n}{3}$$
Thus, each branch taken shrinks the subtree below the current node to approximately 2/3 of its previous size. Therefore we expect to take O(lg n) steps to walk to a randomly chosen element; in fact, about log_{3/2} n steps (the logarithm of n to base 3/2) on average. Actually it will be a bit less because at any point along the walk we may encounter the node we are looking for and stop walking down.
Treaps simulate the construction of a randomly constructed binary search tree. Each node in a treap contains not only a value and pointers to the left and right children, but also a priority. The idea is that a treap always looks like the binary search tree you would get if you had inserted the elements in priority order. If the priorities are generated randomly, you have a random treap whose structure is the same as the corresponding random binary search tree. In an ordinary binary search tree, elements inserted later are always lower in the tree; therefore, the nodes in a treap must satisfy the heap ordering invariant on the node priorities. A treap is both a binary search tree with respect to the node elements, and a heap with respect to the node priorities. From this comes its name: "treap" = "tree heap".
Given a set of elements and associated priorities it is not completely obvious that we can construct a treap that satisfies both invariants simultaneously. Clearly the root of the treap must be the node with the highest priority (in the code below, "highest priority" means the numerically smallest priority value). To satisfy the BST invariant, all the nodes whose keys are less than this node's key must be in the left subtree of this node, and the nodes whose keys are greater must be in the right subtree. Therefore, we can apply this tree construction recursively to the left and right subtrees, resulting in a treap.
Given an existing treap, how do we insert a new element? The algorithm follows the same strategy as in red-black trees: it finds the unique leaf where the element can be inserted while preserving the BST invariant. However, we also assign this element a random priority. The final treap had better look like the binary search tree that one would get if the newly inserted element had been inserted according to its priority. This is achieved by performing a series of tree rotations to enforce the heap ordering invariant.
A simple tree rotation is also useful to know about for other tree algorithms such as splay trees and AVL trees. Notice that the following two trees both satisfy the binary search tree invariant, and that all of the elements remain in the same order with respect to an in-order traversal, regardless of the structure of the subtrees A, B, C:
    x                 y
   / \               / \
  A   y             x   C
     / \           / \
    B   C         A   B
A tree rotation converts a part of the tree that looks like one of these into the other. The advantage is that the relative position of x and y is swapped by the rotation. Thus, if y is higher priority than x but it is below x, thus breaking the heap-ordering invariant (as in the left-hand picture), a tree rotation to the right-hand configuration will restore the heap-ordering invariant because it puts x below y.
Here is the code for Treaps:
functor Treap(structure Params: ORDERED_SET_PARAMS) =
struct
  type key = Params.key
  type elem = Params.elem
  val compare = Params.compare
  val keyOf = Params.keyOf

  type prio = Rand.rand

  datatype tree =
      Empty
    | Node of {left: tree, right: tree, value: elem, priority: prio}

  (* The contents of a Node, before it is wrapped in the constructor. *)
  type node = {left: tree, right: tree, value: elem, priority: prio}

  (* Rep Invariant:
   * For Node{value,priority,left,right}:
   * 0. Binary Search Tree: all of the values in the tree "left" have
   *    keys less than the key of "value", and all
   *    of the values in "right" have keys greater than the key of
   *    "value".
   * 1. Heap ordering: all of the priorities in the left and right
   *    subtrees are at least as large as "priority".
   *)

  (* lookup(t,k) is SOME e for the element e of t whose key is k,
   * or NONE if t has no such element. *)
  fun lookup (Empty, _: key): elem option = NONE
    | lookup (Node {value, left, right, ...}, k) =
        (case compare (k, keyOf value) of
           LESS => lookup (left, k)
         | GREATER => lookup (right, k)
         | EQUAL => SOME value)

  (* add(t,e,p) is (t',dup) where t' contains the elements of t plus e.
   * If t already has an element with e's key, that element is replaced
   * by e (keeping its old priority) and dup is true; otherwise e is
   * inserted with priority p and dup is false. *)
  fun add (t: tree, e: elem, p: prio): tree * bool =
    let
      (* Given a < xv < b < yv < c, heap_rotate(xv,xp,yv,yp,a,b,c) is
       * a node for a tree that satisfies the rep invariant and contains
       * all of the elements in question.
       * Whichever of x and y has the strictly smaller priority value
       * becomes the root; on a tie, y does. *)
      fun heap_rotate (xv, xp, yv, yp, a: tree, b: tree, c: tree): node =
        case xp < yp of
          true =>
            let
              val below = Node {value = yv, priority = yp, left = b, right = c}
            in
              {value = xv, priority = xp, left = a, right = below}
            end
        | false =>
            let
              val below = Node {value = xv, priority = xp, left = a, right = b}
            in
              {value = yv, priority = yp, left = below, right = c}
            end

      (* insert(t) adds e (with priority p) to t and returns the
       * unwrapped root node of the result together with the
       * duplicate flag. The root is left unwrapped so that the
       * caller can still rotate it against its own parent. *)
      fun insert Empty =
            ({value = e, priority = p, left = Empty, right = Empty}, false)
        | insert (Node {value, priority, left, right}) =
            (case compare (keyOf e, keyOf value) of
               EQUAL =>
                 ({value = e, priority = priority, left = left, right = right},
                  true)
             | LESS =>
                 let
                   val (sub: node, dup) = insert left
                 in
                   (heap_rotate (#value sub, #priority sub, value, priority,
                                 #left sub, #right sub, right),
                    dup)
                 end
             | GREATER =>
                 let
                   val (sub: node, dup) = insert right
                 in
                   (heap_rotate (value, priority, #value sub, #priority sub,
                                 left, #left sub, #right sub),
                    dup)
                 end)

      val (root, dup) = insert t
    in
      (Node root, dup)
    end

  (* first(t) is SOME of the element with the smallest key
   * (the leftmost node), or NONE if t is empty. *)
  fun first Empty = NONE
    | first (Node {value, left, ...}) =
        (case first left of
           NONE => SOME value
         | found => found)

  (* fold_forward f b k t folds f over the elements of t whose keys
   * are at least k, in ascending key order, starting from b. *)
  fun fold_forward (f: elem * 'b -> 'b) (acc: 'b) (k: key) (t: tree) =
    case t of
      Empty => acc
    | Node {value, left, right, ...} =>
        (case compare (keyOf value, k) of
           (* value and its whole left subtree lie before k: skip them *)
           LESS => fold_forward f acc k right
         | EQUAL => fold_forward f (f (value, acc)) k right
         | GREATER =>
             fold_forward f (f (value, fold_forward f acc k left)) k right)
end
Here, heap_rotate is the function that figures out which of the two tree
configurations above is appropriate, given two elements x and y and their
associated priorities. This code doesn't actually build the tree nodes for the
result until it has to, resulting in some performance improvement. The function
add_node walks to the bottom of the tree, then uses heap_rotate as it
reconstructs the tree on the way back up so that the heap ordering invariant is
always maintained. Note that first and fold_forward work exactly the same way for
all binary trees.
This code assumes that a priority is provided when elements are added to the data structure. We want this priority to be randomly chosen from a large space so that the tree is likely to be approximately balanced. SML provides some library functions for generating pseudo-random numbers. For this use, it doesn't matter too much how good the pseudo-random number generator is. Here is how we can use a random number generator to produce random treaps, a good set implementation. We haven't implemented remove here, but it's done using rotations too.
(* TreapSet implements the ordered set signature with a treap whose
 * priorities come from a pseudo-random number generator. *)
functor TreapSet(structure Params: ORDERED_SET_PARAMS)
  :> ORDERED_FUNCTIONAL_SET where type key = Params.key
                              and type elem = Params.elem =
struct
  type key = Params.key
  type elem = Params.elem
  val compare = Params.compare
  val keyOf = Params.keyOf

  structure T = Treap(structure Params = Params)

  (* tree: the elements; seed: the input for the next call to
   * Rand.random; size: the number of elements in tree. *)
  type set = {tree: T.tree, seed: Rand.rand, size: int}

  fun empty () = {tree = T.Empty, seed = 0wx5a5a5, size = 0}

  fun lookup ({tree, seed = _, size = _}, k) = T.lookup (tree, k)

  (* The freshly generated number is used both as the new element's
   * priority and as the seed stored for the next insertion. *)
  fun add ({tree, seed, size = count}, e: elem) =
    let
      val prio = Rand.random seed
      val (tree', dup) = T.add (tree, e, prio)
    in
      ({tree = tree', seed = prio, size = if dup then count else count + 1},
       dup)
    end

  fun size {tree = _, seed = _, size = count} = count

  fun first {tree, seed = _, size = _} = T.first tree

  fun remove (_, _) = raise Fail "Not implemented: treap remove"

  fun last _ = raise Fail "Not implemented: last"

  type 'b folder = ((elem*'b)->'b) -> 'b -> key -> set -> 'b

  (* Delegates to the treap's fold; seed and size are not needed. *)
  fun fold_forward (f: elem * 'b -> 'b) (b: 'b) (k: key)
                   {tree, seed = _, size = _} =
    T.fold_forward f b k tree

  fun fold_backward _ _ _ _ = (raise Fail "Not implemented: fold backward")
end
The win of treaps is that the code is considerably simpler than red-black trees. Red-black trees are known for being fast, but this implementation of treaps is competitive in speed and a lot shorter and simpler.