Sunday, February 11, 2018

Lots of new stuff I learned while having my Bloom-filter code reviewed

TIL's

  • It is possible to use assert, e.g. for checking pre-conditions
  • vector-of performs better, but it is not usable with transient data structures
  • destructuring is a really convenient way to access values from a map; e.g. (defn bloom-add [{:keys [hash-functions bits] :as bloom-filter} value]
  • reduced can be used to jump out of a `reduce` call: simply return `(reduced )` if we need to jump out
  • ifn? can be used to check if a variable represents a function

The question I asked.

My original code.


; core.clj

(ns bloom-filter.core)


(defn bloom-create [numbits hash-functions]
      (if (or (nil? numbits) (nil? hash-functions)) nil
          {:bits (vec (repeat numbits 0)) :hash-functions hash-functions}))

(defn bloom-add [bloom-filter value]
      (when-not (nil? bloom-filter) 
      (let [hash-functions (:hash-functions bloom-filter)
            bits           (:bits bloom-filter)
            new-bits (reduce (fn [actual-bits hash-function] (assoc actual-bits (hash-function value) 1)) bits hash-functions)]
      (assoc-in bloom-filter [:bits] new-bits))))

(defn bloom-contains [bloom-filter value] 
      (let [hash-functions (:hash-functions bloom-filter)
            bits (:bits bloom-filter)]
      (reduce (fn [actual-value hash-function] (and actual-value (= 1 (bits (hash-function value))))) true hash-functions)))

; core-test.clj

(ns bloom-filter.core-test

(:require [clojure.test :refer :all]
          [bloom-filter.core :refer :all]))

(defn mod7-fun [num] (mod num 7))
(defn always-zero-fun [dontcare] 0)  

(deftest create-test
  (testing "create bloom filter"
    (is (= nil (bloom-create nil nil)))
    (is (= nil (bloom-create 0 nil)))
    (is (= nil (bloom-create nil [])))
    (is (= {:bits [] :hash-functions []} (bloom-create 0 [])))
    (is (= {:bits [0] :hash-functions []} (bloom-create 1 [])))
    (is (= {:bits [] :hash-functions [always-zero-fun]} (bloom-create 0 [always-zero-fun])))
))

(deftest add-test
  (let [empty-filter (bloom-create 7 [])
        single-fun-filter (bloom-create 7 [mod7-fun])
        two-fun-filter (bloom-create 7 [mod7-fun always-zero-fun])]
  (testing "add to bloom filter"
    (is (= nil (bloom-add nil 3)))
    (is (= empty-filter (bloom-add empty-filter nil)))
    (is (= empty-filter (bloom-add empty-filter 10)))
    (is (= {:bits [0 0 0 1 0 0 0] :hash-functions [mod7-fun]}
           (bloom-add single-fun-filter 3)))
    (is (= {:bits [1 0 0 1 0 0 0] :hash-functions [mod7-fun always-zero-fun]}
           (bloom-add two-fun-filter 3)))
)))

(deftest contains-test
  (let [empty-filter (bloom-create 7 [])
        simple-filter (bloom-create 7 [mod7-fun])
        filter-with-element (bloom-add simple-filter 3)]
  (testing "bloom filter contains"
    (is (true? (bloom-contains empty-filter 0)))
    (is (false? (bloom-contains simple-filter 0)))
    (is (true? (bloom-contains filter-with-element 3)))
    (is (true? (bloom-contains filter-with-element 10)))
)))

The improved code

; core.clj

(ns bloom-filter.core)


(defn bloom-create [numbits hash-functions]
      (assert (number? numbits) "numbits must be numeric")
      (assert (not (nil? hash-functions)) "hash-functions must not be nil")
      (assert (every? ifn? hash-functions) "hash-functions must be a collection of functions")
      {:bits (vec (repeat numbits false)) 
       :hash-functions hash-functions})

(def ^:private safe-hash-functions (memoize 
      (fn [numbits hash-functions]
      (map #(fn [x] (mod (% x) numbits)) hash-functions))))

(defn bloom-add [{:keys [hash-functions bits] :as bloom-filter} value]
      (when bloom-filter 
      (let [hash-functions (safe-hash-functions (count bits) hash-functions)]
      (->> hash-functions
           (reduce (fn [actual-bits hash-function] (assoc! actual-bits (hash-function value) true)) (transient bits))
           (persistent!)
           (assoc bloom-filter :bits)))))

(defn bloom-contains? [{:keys [hash-functions bits] :as bloom-filter} value] 
      (when bloom-filter
      (let [hash-functions (safe-hash-functions (count bits) hash-functions)]
(every? #(bits (% value)) hash-functions))))

; core-test.clj

(ns bloom-filter.core-test

(:require [clojure.test :refer :all]
          [bloom-filter.core :refer :all]))

(def ^:const F false)
(def ^:const T true)

(defn mod7-fun [num] (mod num 7))
(defn always-zero-fun [dontcare] 0)  
(defn always-ten-fun [dontcare] 10)  

(def empty-filter (bloom-create 7 []))
(def single-fun-filter (bloom-create 7 [mod7-fun]))
(def too-high-range-fun-filter (bloom-create 7 [always-ten-fun]))

(deftest create-test
  (testing "create bloom filter"
    (is (thrown-with-msg? AssertionError #"numbits must be numeric" (bloom-create nil nil)))
    (is (thrown-with-msg? AssertionError #"numbits must be numeric" (bloom-create "a" nil)))
    (is (thrown-with-msg? AssertionError #"hash-functions must not be nil" (bloom-create 0 nil)))
    (is (thrown-with-msg? AssertionError #"hash-functions must be a collection of functions" (bloom-create 0 "a")))
    (is (thrown-with-msg? AssertionError #"hash-functions must be a collection of functions" (bloom-create 0 ["a"])))
    (is (thrown-with-msg? AssertionError #"numbits must be numeric" (bloom-create nil [])))
    (is (= {:bits [] :hash-functions []} (bloom-create 0 [])))
    (is (= {:bits [F] :hash-functions []} (bloom-create 1 [])))
    (is (= {:bits [] :hash-functions [always-zero-fun]} (bloom-create 0 [always-zero-fun])))
))

(deftest add-test
  (let [two-fun-filter (bloom-create 7 [mod7-fun always-zero-fun])]
  (testing "add to bloom filter"
    (is (= nil (bloom-add nil 3)))
    (is (= empty-filter (bloom-add empty-filter nil)))
    (is (= empty-filter (bloom-add empty-filter 10)))
    (is (= [F F F T F F F] (:bits (bloom-add single-fun-filter 3))))
    (is (= [T F F T F F F] (:bits (bloom-add two-fun-filter 3))))
    (is (= [F F F T F F F] (:bits (bloom-add too-high-range-fun-filter 3))))
)))

(deftest contains-test
  (let [filter-with-element (bloom-add single-fun-filter 3)
        filter-too-high-range-with-element (bloom-add too-high-range-fun-filter 3)]
  (testing "bloom filter contains"
    (is (= nil (bloom-contains? nil 0)))
    (is (true? (bloom-contains? empty-filter 0)))
    (is (false? (bloom-contains? single-fun-filter 0)))
    (is (true? (bloom-contains? filter-with-element 3)))
    (is (true? (bloom-contains? filter-with-element 10)))
    (is (true? (bloom-contains? filter-too-high-range-with-element 3)))
)))

No comments:

Post a Comment