datacrypt-project · dgrnbrg · Sep 2, 2016 · Sep 5, 2016 · Sep 6, 2016 · Sep 6, 2016
diff --git a/src/hitchhiker/tracing_gc.clj b/src/hitchhiker/tracing_gc.clj
@@ -0,0 +1,69 @@
+(ns hitchhiker.tracing-gc
+  (:require [hitchhiker.tree.core :as hh]))
+
+;; Note: this implementation is single-threaded, and could be made parallel without too much effort
+
+;; We might need to trace millions or billions of keys. That might not fit in memory, so this could be backed
+;; by leveldb or hsql so that we can spill to disk when necessary. We don't need a functional datastructure here.
+(defprotocol IGCScratch
+  (add-to-work-queue! [this addr] "Adds the given address to the work queue to be processed")
+  (pop-from-work-queue! [this] "Pops the next element off of the work queue, or returns nil if we're done")
+  (observe-addr! [this addr] "Marks the given addr as being currently active")
+  (observed? [this addr] "Returns true if the given addr was observed"))
+;
+;;; The workq is a ref containing a collection of addresses we still need to scan.
+;;; The observed-set is a ref containing the set of addresses we know are active
+;;; For simplicity, adding an addr to the workq automatically observes it as well
+;;;   ^^ this allows us to only add new addrs to the workq, without a separate set of "in workq"
+(defrecord InMemScratch [workq observed-set]
+  IGCScratch
+  (add-to-work-queue! [_ addr]
+    (dosync
+      (when-not (contains? @observed-set addr)
+        (alter workq conj addr)
+        (alter observed-set conj addr))))
+  (pop-from-work-queue! [_]
+    (dosync
+      (when (seq @workq)
+        (let [head (peek @workq)]
+          (alter workq pop)
+          head))))
+  (observe-addr! [_ addr]
+    (dosync
+      (alter conj observed-set addr)))
+  (observed? [_ addr]
+    (contains? @observed-set addr)))
+
+(defn in-mem-scratch
+  "Creates an instance of in memory GC scratch"
+  []
+  (->InMemScratch [] #{}))
+
+(defn trace-gc!
+  "Does a tracing GC and frees up all unused keys.
+   This is a simple mark-sweep algorithm.
+
+   gc-scratch should be an instance of IGCScratch
+   gc-roots should be a list of the roots, which should implement IResolve. These are generated by calls to anchor-root.
+   all-keys should be a lazy sequence that will contain every key in storage. This algorithm will not hold the whole sequence in memory
+   delete-fn will be called on every key that should be deleted during the sweep phase"
+  [gc-scratch gc-roots all-keys delete-fn]
+  ;; First, we'll initialize the work queue
+  (doseq [root gc-roots]
+    (add-to-work-queue! gc-scratch root))
+  ;; Now, we'll do the mark phase
+  (loop []
+    (when-let [addr (pop-from-work-queue! gc-scratch)]
+      (observe-addr! gc-scratch addr)
+      (let [node (hh/resolve addr)]
+        (when (hh/index-node? node)
+          (doseq [c (:children node)]
+            (add-to-work-queue! gc-scratch c))))
+      (recur)))
+  ;; Next, we do the sweep
+  (loop [ks all-keys]
+    (when (seq ks)
+      (let [head (first ks)]
+        (when-not (observed? gc-scratch head)
+          (delete-fn head)))
+      (recur (next ks)))))