[Golang] BK-tree Data Structure Implementation


Go implementation of BK-tree data structure used for approximate string matching in a dictionary. The distance metric here is Levenshtein distance, the minimum number of single-character edits (insertions, deletions or substitutions) required to change one string into the other.

The Go implementation is based on the post [1]. If you have no idea what BK-tree is, read the post first.

Run Code on Go Playground

BK-tree: Implementation of Levenshtein distance is put in the Levenshtein distance section below.

bktree.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
package ld

type BKNode struct {
	Str      string
	Children map[int]*BKNode
}

type BKTree struct {
	root         *BKNode
	DistanceFunc func(string, string) int
}

func NewBKTree() *BKTree {
	return &BKTree{
		root:         nil,
		DistanceFunc: EditDistance,
	}
}

func (b *BKTree) Add(s string) {
	if b.root == nil {
		b.root = &BKNode{Str: s, Children: make(map[int]*BKNode)}
		return
	}

	current := b.root
	for {
		d := b.DistanceFunc(s, current.Str)
		if target, ok := current.Children[d]; ok {
			current = target
		} else {
			current.Children[d] = &BKNode{Str: s, Children: make(map[int]*BKNode)}
			break
		}
	}
}

func (b *BKTree) Search(s string, radius int) []*BKNode {
	var resultList []*BKNode
	if b.root == nil {
		return resultList
	}

	var candidates []*BKNode
	candidates = append(candidates, b.root)
	for len(candidates) > 0 {
		candidate := candidates[0]
		candidates = candidates[1:]

		d := b.DistanceFunc(s, candidate.Str)
		if d <= radius {
			resultList = append(resultList, candidate)
		}

		low := d - radius
		high := d + radius
		for distance, node := range candidate.Children {
			if distance >= low && distance <= high {
				candidates = append(candidates, node)
			}
		}
	}

	return resultList
}

Example of using Bk-tree:

bktree_test.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
package ld

import (
	"encoding/json"
	"os"
	"testing"
)

func ExampleBKTree(t *testing.T) {
	bkt := NewBKTree()
	bkt.Add("some")
	bkt.Add("soft")
	bkt.Add("same")
	bkt.Add("mole")
	bkt.Add("soda")
	bkt.Add("salmon")

	b, err := json.MarshalIndent(bkt.root, "", "  ")
	if err != nil {
		t.Error(err)
	}
	os.Stdout.Write(b)

	r := bkt.Search("sort", 2)
	for _, node := range r {
		println(node.Str)
	}
}

Levenshtein distance

The Wagner-Fischer algorithm: See [2] for more details.

util.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
package ld

func StringToRuneSlice(s string) []rune {
	var r []rune
	for _, runeValue := range s {
		r = append(r, runeValue)
	}
	return r
}

func minimum(a, b, c int) int {
	min := a
	if min > b {
		min = b
	}
	if min > c {
		min = c
	}
	return min
}
wf.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
package ld

// The edit distances here is actually Levenshtein distance, minimum number of
// single-character edits (insertions, deletions or substitutions) required to
// change one string into the other.
func EditDistance(s, t string) int {
	ss := StringToRuneSlice(s)
	m := len(ss) + 1
	tt := StringToRuneSlice(t)
	n := len(tt) + 1

	d := make([][]int, m)
	for i := range d {
		d[i] = make([]int, n)
	}

	for i := 0; i < m; i++ {
		d[i][0] = i
	}

	for j := 0; j < n; j++ {
		d[0][j] = j
	}

	for j := 1; j < n; j++ {
		for i := 1; i < m; i++ {
			var substitutionCost int
			if ss[i-1] == tt[j-1] {
				substitutionCost = 0
			} else {
				substitutionCost = 1
			}
			d[i][j] = minimum(
				d[i-1][j]+1,
				d[i][j-1]+1,
				d[i-1][j-1]+substitutionCost,
			)
		}
	}

	return d[m-1][n-1]
}

Tested on:


References:

[1]
[2][Golang] Wagner-Fischer algorithm for Edit Distance of Two Strings
[3]