Sunday, February 11, 2018

Lots of new stuff I learned while having my Bloom-filter code reviewed

TIL's

  • It is possible to use assert, e.g. for checking pre-conditions
  • vector-of performs better, but it is not usable with transient data structures
  • destructuring is a really convenient way to access values from a map; e.g. (defn bloom-add [{:keys [hash-functions bits] :as bloom-filter} value]
  • reduced can be used to jump out of a `reduce` call: simply return `(reduced )` if we need to jump out
  • ifn? can be used to check if a variable represents a function

The question I asked.

My original code.


; core.clj

(ns bloom-filter.core)


(defn bloom-create [numbits hash-functions]
      (if (or (nil? numbits) (nil? hash-functions)) nil
          {:bits (vec (repeat numbits 0)) :hash-functions hash-functions}))

(defn bloom-add [bloom-filter value]
      (when-not (nil? bloom-filter) 
      (let [hash-functions (:hash-functions bloom-filter)
            bits           (:bits bloom-filter)
            new-bits (reduce (fn [actual-bits hash-function] (assoc actual-bits (hash-function value) 1)) bits hash-functions)]
      (assoc-in bloom-filter [:bits] new-bits))))

(defn bloom-contains [bloom-filter value] 
      (let [hash-functions (:hash-functions bloom-filter)
            bits (:bits bloom-filter)]
      (reduce (fn [actual-value hash-function] (and actual-value (= 1 (bits (hash-function value))))) true hash-functions)))

; core-test.clj

(ns bloom-filter.core-test

(:require [clojure.test :refer :all]
          [bloom-filter.core :refer :all]))

(defn mod7-fun [num] (mod num 7))
(defn always-zero-fun [dontcare] 0)  

(deftest create-test
  (testing "create bloom filter"
    (is (= nil (bloom-create nil nil)))
    (is (= nil (bloom-create 0 nil)))
    (is (= nil (bloom-create nil [])))
    (is (= {:bits [] :hash-functions []} (bloom-create 0 [])))
    (is (= {:bits [0] :hash-functions []} (bloom-create 1 [])))
    (is (= {:bits [] :hash-functions [always-zero-fun]} (bloom-create 0 [always-zero-fun])))
))

(deftest add-test
  (let [empty-filter (bloom-create 7 [])
        single-fun-filter (bloom-create 7 [mod7-fun])
        two-fun-filter (bloom-create 7 [mod7-fun always-zero-fun])]
  (testing "add to bloom filter"
    (is (= nil (bloom-add nil 3)))
    (is (= empty-filter (bloom-add empty-filter nil)))
    (is (= empty-filter (bloom-add empty-filter 10)))
    (is (= {:bits [0 0 0 1 0 0 0] :hash-functions [mod7-fun]}
           (bloom-add single-fun-filter 3)))
    (is (= {:bits [1 0 0 1 0 0 0] :hash-functions [mod7-fun always-zero-fun]}
           (bloom-add two-fun-filter 3)))
)))

(deftest contains-test
  (let [empty-filter (bloom-create 7 [])
        simple-filter (bloom-create 7 [mod7-fun])
        filter-with-element (bloom-add simple-filter 3)]
  (testing "bloom filter contains"
    (is (true? (bloom-contains empty-filter 0)))
    (is (false? (bloom-contains simple-filter 0)))
    (is (true? (bloom-contains filter-with-element 3)))
    (is (true? (bloom-contains filter-with-element 10)))
)))

The improved code

; core.clj

(ns bloom-filter.core)


(defn bloom-create [numbits hash-functions]
      (assert (number? numbits) "numbits must be numeric")
      (assert (not (nil? hash-functions)) "hash-functions must not be nil")
      (assert (every? ifn? hash-functions) "hash-functions must be a collection of functions")
      {:bits (vec (repeat numbits false)) 
       :hash-functions hash-functions})

(def ^:private safe-hash-functions (memoize 
      (fn [numbits hash-functions]
      (map #(fn [x] (mod (% x) numbits)) hash-functions))))

(defn bloom-add [{:keys [hash-functions bits] :as bloom-filter} value]
      (when bloom-filter 
      (let [hash-functions (safe-hash-functions (count bits) hash-functions)]
      (->> hash-functions
           (reduce (fn [actual-bits hash-function] (assoc! actual-bits (hash-function value) true)) (transient bits))
           (persistent!)
           (assoc bloom-filter :bits)))))

(defn bloom-contains? [{:keys [hash-functions bits] :as bloom-filter} value] 
      (when bloom-filter
      (let [hash-functions (safe-hash-functions (count bits) hash-functions)]
(every? #(bits (% value)) hash-functions))))

; core-test.clj

(ns bloom-filter.core-test

(:require [clojure.test :refer :all]
          [bloom-filter.core :refer :all]))

(def ^:const F false)
(def ^:const T true)

(defn mod7-fun [num] (mod num 7))
(defn always-zero-fun [dontcare] 0)  
(defn always-ten-fun [dontcare] 10)  

(def empty-filter (bloom-create 7 []))
(def single-fun-filter (bloom-create 7 [mod7-fun]))
(def too-high-range-fun-filter (bloom-create 7 [always-ten-fun]))

(deftest create-test
  (testing "create bloom filter"
    (is (thrown-with-msg? AssertionError #"numbits must be numeric" (bloom-create nil nil)))
    (is (thrown-with-msg? AssertionError #"numbits must be numeric" (bloom-create "a" nil)))
    (is (thrown-with-msg? AssertionError #"hash-functions must not be nil" (bloom-create 0 nil)))
    (is (thrown-with-msg? AssertionError #"hash-functions must be a collection of functions" (bloom-create 0 "a")))
    (is (thrown-with-msg? AssertionError #"hash-functions must be a collection of functions" (bloom-create 0 ["a"])))
    (is (thrown-with-msg? AssertionError #"numbits must be numeric" (bloom-create nil [])))
    (is (= {:bits [] :hash-functions []} (bloom-create 0 [])))
    (is (= {:bits [F] :hash-functions []} (bloom-create 1 [])))
    (is (= {:bits [] :hash-functions [always-zero-fun]} (bloom-create 0 [always-zero-fun])))
))

(deftest add-test
  (let [two-fun-filter (bloom-create 7 [mod7-fun always-zero-fun])]
  (testing "add to bloom filter"
    (is (= nil (bloom-add nil 3)))
    (is (= empty-filter (bloom-add empty-filter nil)))
    (is (= empty-filter (bloom-add empty-filter 10)))
    (is (= [F F F T F F F] (:bits (bloom-add single-fun-filter 3))))
    (is (= [T F F T F F F] (:bits (bloom-add two-fun-filter 3))))
    (is (= [F F F T F F F] (:bits (bloom-add too-high-range-fun-filter 3))))
)))

(deftest contains-test
  (let [filter-with-element (bloom-add single-fun-filter 3)
        filter-too-high-range-with-element (bloom-add too-high-range-fun-filter 3)]
  (testing "bloom filter contains"
    (is (= nil (bloom-contains? nil 0)))
    (is (true? (bloom-contains? empty-filter 0)))
    (is (false? (bloom-contains? single-fun-filter 0)))
    (is (true? (bloom-contains? filter-with-element 3)))
    (is (true? (bloom-contains? filter-with-element 10)))
    (is (true? (bloom-contains? filter-too-high-range-with-element 3)))
)))

Tuesday, October 10, 2017

Experimenting with lazy sequences

Fibonacci

source

(def fib-seq
     (lazy-cat [0 1] (map + (rest fib-seq) fib-seq)))

Powers of two

(def twopowers (lazy-seq (cons 1N (map #(* 2N %) twopowers))))

Factorial

(def fact (lazy-cat [1N] (map * (map #(+ % 2) (range)) fact)))

Saturday, August 12, 2017

TIL with-out-str

with-out-str

Retrieve the contents of println.

Example:

(with-out-str (println "Hello"))
; result "Hello\n"

Sunday, February 5, 2017

TIL: Two simple tricks

Conditionally add a new key-value to a map

(conj m (when value [key value]))

Usage example:

         (conj {
                 :type "if"
                 :cond cond_val
                 :then then_val
         } (when else_val [:else else_val]))

In the above example, the "else"-part is added to the token only if it is not empty.

Source.

Replace nth element in a vector

Clojure> (assoc [1 2 3] 1 5)
[1 5 3]

Usage example:

(defn dialogue[num]
    (let [dacode (map #(- (int %) 48) (Integer/toString num 2))
          phrases [(s/split"It would be extremely painful... for you!"#" ")
                   (s/split"If I pull that off will you die? You're a big guy."#" ")]
          actors ["BANE" "CIA"]]
    (loop [idxs [0 0] finaldial "" code dacode prevbit 2]
    (if (< (count code) 1) finaldial
        (let [bit (first code) idx (idxs bit)]
        (recur (assoc idxs bit (inc idx)) (str finaldial (if (= bit prevbit) " " (str (when (not= 2 prevbit) "\n")
        (actors bit) ": ")) ((phrases bit) idx)) (rest code) bit))))))

Important part:

(assoc idxs bit (inc idx))

In this example there are two counters, stored in idxs vector (initially 0, 0). In each step we update one of the two, indexed by bit

Source.

Saturday, October 8, 2016

TIL: multiline comments

It is possible to form a multiline comment, with the comment function:


(comment

; won't be executed 
;      V
(defn somefunc ; ...
;...

)

There is a caveat, though: code inside the comment must be valid Clojure code!


Update: there is a way to work around the above issue: put the commented out stuff in quotes

(comment "

; won't be executed 
;      V
(defn somefunc ; ...
;...

")

map returns a lazy sequence

Consider the following code:


(defn assertFuzzyEquals [act exp]
  (do (println "asserting equality over " act exp)
  (let [inrange (<= (Math/abs (- act exp)) 1e-2)]
    (if (= inrange false)
      (println "abs(actual - expected) must be <= 1e-2. Expected was " exp " but got: " act))
    (is (= inrange true))))
)


(defn assertVectorsFuzzyEqual [act exp]
  (do (println "asserting fuzzy equality over" act exp)
  (map assertFuzzyEquals act exp))
)

(deftest a-test5
  (testing "Solve quadratic equations"
          (assertVectorsFuzzyEqual [] [])
          (assertVectorsFuzzyEqual [1] [1])
          (assertVectorsFuzzyEqual [1] [2])
          (assertVectorsFuzzyEqual [1] [1 1])
))

The output will be as follows:

asserting fuzzy equality over [] []
asserting fuzzy equality over [1] [1]
asserting fuzzy equality over [1] [2]
asserting fuzzy equality over [1] [1 1]

Ran 1 tests containing 0 assertions.
0 failures, 0 errors.

Why? Because, as the documentation of 'map' states, "[It] returns a lazy sequence consisting of the result of applying f to the set of first items of each coll, followed by applying f to the set of second items in each coll, until any one of the colls is exhausted." (Emphasis mine.)

What does this mean? Well, it means that in our case, assertFuzzyEquals was never called.

How can the realization of a lazy sequence be forced? Using doall. Let's modify the code accordingly:


; ...

(defn assertVectorsFuzzyEqual [act exp]
  (do (println "asserting fuzzy equality over" act exp)
  (doall (map assertFuzzyEquals act exp)))                ;  (1)
)

; ...


By using doall at (1), we force the sequence to be realized, and thus really execute the comparison.

The output is now as follows:

asserting fuzzy equality over [] []
asserting fuzzy equality over [1] [1]
asserting equality over  1 1
asserting fuzzy equality over [1] [2]
asserting equality over  1 2
abs(actual - expected) must be <= 1e-2. Expected was  2  but got:  1

lein test :only breaking.core-test/a-test5

FAIL in (a-test5) (core_test.clj:10)
Solve quadratic equations
expected: (= inrange true)
  actual: (not (= false true))
asserting fuzzy equality over [1] [1 1]
asserting equality over  1 1

Ran 1 tests containing 3 assertions.
1 failures, 0 errors.
Tests failed.

As we expected. (N.B.: It is another story, that two vectors with different lengths are not detected... yet!)

Thursday, January 28, 2016

Understanding lazy sequences

The book The Joy of Clojure, Second Edition shows the following quick-sort example for lazy sequences:
(defn sort-parts [work]
     (lazy-seq
      (loop [[part & parts] work]            ;; Pull apart work - note: work will be a list of lists.
        (if-let [[pivot & xs] (seq part)]    ;; This blows up unless work was a list of lists.
          (let [smaller? #(< % pivot)]       ;; define pivot comparison function.
            (recur (list*                    
                    (filter smaller? xs)     ;; work all < pivot
                    pivot                    ;; work pivot itself
                    (remove smaller? xs)     ;; work all > pivot
                    parts)))                 ;; concat parts
          (when-let [[x & parts] parts]      ;; sort rest if more parts
            (cons x (sort-parts parts)))))))
;; #'joy.q/sort-parts

(defn qsort [xs]
   (sort-parts (list xs)))    ;; The (list) function ensures that we pass sort-parts a list of lists.
In order to better understand, how lazy sequence works, I implemented an equivalent for myself in Java:

public interface NextValueProducer {
 Object getNextValue();
}


import java.util.ArrayList;
import java.util.List;

public class LazySequence {
 private NextValueProducer nvp;
 private List cache;
 
 public LazySequence(NextValueProducer nvp) {
  this.nvp = nvp;
  cache = new ArrayList<>();
 }
 
 public Object getNthElement(int n) {
  if (n < 0) {
   throw new IllegalArgumentException("n has to be positive");
  }
  for (int i = cache.size(); i <= n; ++i) {
   Object produced = nvp.getNextValue();
   if (produced == null) {
    break;
   }
   else {
    cache.add(produced);
   }
  }
  if (cache.size() > n) {
   return cache.get(n);
  }
  else {
   return null;
  }
 }
}


import java.util.ArrayList;
import java.util.List;

public class QuickSortProducer implements NextValueProducer {

 private List<Object> work;
 
 public QuickSortProducer(List nums) {
  work = new ArrayList<>();
  List temp = new ArrayList<>(nums);
  work.add(temp);
 }

 @Override
 public Object getNextValue() {

  List workCopy = new ArrayList<>(work);
  
  while (true) {
  
   List part = new ArrayList<>((List)workCopy.get(0));
   List parts = new ArrayList<>(workCopy.subList(1, workCopy.size()));
   
   if (part.size() > 0) {
    int pivot = (int)part.get(0);
    List xs = part.subList(1, part.size());
    
    List smallers = new ArrayList<>();
    List biggers = new ArrayList<>();
    
    for (Object xObj : xs) {
     int x = (int)xObj;
     if (x < pivot) {
      smallers.add(x);
     }
     else if (x > pivot) {
      biggers.add(x);
     }
    }
    
    workCopy = new ArrayList<>();
    workCopy.add(smallers);
    workCopy.add(pivot);
    workCopy.add(biggers);
    workCopy.addAll(parts);
   }
   else {
    if (parts.size() > 0) {
     work = new ArrayList<>(parts.subList(1, parts.size()));
     return parts.get(0);
    }
    else {
     return null;
    }
   }
  }  
 }

}




Unit tests:
import static org.junit.Assert.*;

import java.util.ArrayList;
import java.util.List;

import org.junit.Test;

public class QuickSortProducerTest {

 @Test
 public void testQuickSortEmptyList() {
  List numsToSort = new ArrayList<>();
  LazySequence seq = new LazySequence(new QuickSortProducer(numsToSort));
  
  Object elem = seq.getNthElement(0);

  assertNull(elem);
 }

 @Test
 public void testQuickSortOneElementList() {
  List numsToSort = new ArrayList<>();
  numsToSort.add(2);
  
  LazySequence seq = new LazySequence(new QuickSortProducer(numsToSort));

  assertEquals(2, seq.getNthElement(0));
  assertEquals(null, seq.getNthElement(1));
 }
 
 @Test
 public void testQuickSortUnsortedList() {
  List numsToSort = new ArrayList<>();
  numsToSort.add(2);
  numsToSort.add(1);
  numsToSort.add(4);
  numsToSort.add(3);
  
  LazySequence seq = new LazySequence(new QuickSortProducer(numsToSort));

  assertEquals(1, seq.getNthElement(0));
  assertEquals(2, seq.getNthElement(1));
  assertEquals(3, seq.getNthElement(2));
  assertEquals(4, seq.getNthElement(3));
  assertEquals(null, seq.getNthElement(4));
 }

}