[Golang] BK-tree Data Structure Implementation
Go implementation of BK-tree data structure used for approximate string matching in a dictionary. The distance metric here is Levenshtein distance, the minimum number of single-character edits (insertions, deletions or substitutions) required to change one string into the other.
The Go implementation is based on the post [1]. If you have no idea what BK-tree is, read the post first.
BK-tree: Implementation of Levenshtein distance is put in the Levenshtein distance section below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | package ld
type BKNode struct {
Str string
Children map[int]*BKNode
}
type BKTree struct {
root *BKNode
DistanceFunc func(string, string) int
}
func NewBKTree() *BKTree {
return &BKTree{
root: nil,
DistanceFunc: EditDistance,
}
}
func (b *BKTree) Add(s string) {
if b.root == nil {
b.root = &BKNode{Str: s, Children: make(map[int]*BKNode)}
return
}
current := b.root
for {
d := b.DistanceFunc(s, current.Str)
if target, ok := current.Children[d]; ok {
current = target
} else {
current.Children[d] = &BKNode{Str: s, Children: make(map[int]*BKNode)}
break
}
}
}
func (b *BKTree) Search(s string, radius int) []*BKNode {
var resultList []*BKNode
if b.root == nil {
return resultList
}
var candidates []*BKNode
candidates = append(candidates, b.root)
for len(candidates) > 0 {
candidate := candidates[0]
candidates = candidates[1:]
d := b.DistanceFunc(s, candidate.Str)
if d <= radius {
resultList = append(resultList, candidate)
}
low := d - radius
high := d + radius
for distance, node := range candidate.Children {
if distance >= low && distance <= high {
candidates = append(candidates, node)
}
}
}
return resultList
}
|
Example of using Bk-tree:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | package ld
import (
"encoding/json"
"os"
"testing"
)
func ExampleBKTree(t *testing.T) {
bkt := NewBKTree()
bkt.Add("some")
bkt.Add("soft")
bkt.Add("same")
bkt.Add("mole")
bkt.Add("soda")
bkt.Add("salmon")
b, err := json.MarshalIndent(bkt.root, "", " ")
if err != nil {
t.Error(err)
}
os.Stdout.Write(b)
r := bkt.Search("sort", 2)
for _, node := range r {
println(node.Str)
}
}
|
Levenshtein distance
The Wagner-Fischer algorithm: See [2] for more details.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | package ld
func StringToRuneSlice(s string) []rune {
var r []rune
for _, runeValue := range s {
r = append(r, runeValue)
}
return r
}
func minimum(a, b, c int) int {
min := a
if min > b {
min = b
}
if min > c {
min = c
}
return min
}
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | package ld
// The edit distances here is actually Levenshtein distance, minimum number of
// single-character edits (insertions, deletions or substitutions) required to
// change one string into the other.
func EditDistance(s, t string) int {
ss := StringToRuneSlice(s)
m := len(ss) + 1
tt := StringToRuneSlice(t)
n := len(tt) + 1
d := make([][]int, m)
for i := range d {
d[i] = make([]int, n)
}
for i := 0; i < m; i++ {
d[i][0] = i
}
for j := 0; j < n; j++ {
d[0][j] = j
}
for j := 1; j < n; j++ {
for i := 1; i < m; i++ {
var substitutionCost int
if ss[i-1] == tt[j-1] {
substitutionCost = 0
} else {
substitutionCost = 1
}
d[i][j] = minimum(
d[i-1][j]+1,
d[i][j-1]+1,
d[i-1][j-1]+substitutionCost,
)
}
}
return d[m-1][n-1]
}
|
Tested on:
- Ubuntu Linux 16.10, Go 1.8
- Go Playground.
References:
[1] |
[2] | [Golang] Wagner-Fischer algorithm for Edit Distance of Two Strings |
[3] |