[Golang] Replace Pāli Word in Velthuis Scheme With Unicode


Find Pāli words written in the Velthuis scheme and replace them with their Unicode equivalents, using Go (the Go programming language).

replace.go | repository | view raw
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package velthuis

import (
	"regexp"
	"strings"
)

// paliWordsInVelthuisScheme matches candidate words: runs of one or more
// ASCII letters (upper or lower case, excluding f, q, w, x and z) mixed with
// the Velthuis marker characters '"', '~' and '.'. ProcessBytes hands every
// match to replacePaliWordsInVelthuisSchemeWithUnicode.
var paliWordsInVelthuisScheme = regexp.MustCompile(`[abcdeghijklmnoprstuvyABCDEGHIJKLMNOPRSTUVY"~.]+`)

// velthuis2unicode converts a single word written in the Velthuis
// transliteration scheme into its Unicode spelling.
//
// Recognised two-byte sequences and their replacements:
//
//	.n .m .t .d .l  ->  ṇ ṃ ṭ ḍ ḷ
//	~n              ->  ñ
//	"n              ->  ṅ
//	aa ii uu        ->  ā ī ū
//
// Every other byte is copied through unchanged. Matching is case-sensitive;
// only lower-case sequences are recognised.
//
// It returns the converted word and true on success. It returns "" and false
// when str is empty, when a '.', '~' or '"' is followed by a byte it cannot
// combine with, when str ends in a dangling '.' or '"', or when no sequence
// was converted (the result would equal str). A trailing '~' is copied
// through as-is rather than rejected.
func velthuis2unicode(str string) (string, bool) {
	// Guard against empty input so callers never trigger an out-of-range slice.
	if str == "" {
		return "", false
	}

	out := make([]byte, 0, len(str))
	changed := false
	for i := 0; i < len(str); i++ {
		c := str[i]

		// Last byte: nothing to combine with.
		if i+1 == len(str) {
			if c == '.' || c == '"' {
				return "", false
			}
			out = append(out, c)
			break
		}

		next := str[i+1]
		repl := ""
		switch c {
		case '.':
			switch next {
			case 'n':
				repl = "ṇ"
			case 'm':
				repl = "ṃ"
			case 't':
				repl = "ṭ"
			case 'd':
				repl = "ḍ"
			case 'l':
				repl = "ḷ"
			default:
				return "", false
			}
		case '~':
			if next != 'n' {
				return "", false
			}
			repl = "ñ"
		case '"':
			if next != 'n' {
				return "", false
			}
			repl = "ṅ"
		case 'a':
			if next == 'a' {
				repl = "ā"
			}
		case 'i':
			if next == 'i' {
				repl = "ī"
			}
		case 'u':
			if next == 'u' {
				repl = "ū"
			}
		}

		if repl == "" {
			out = append(out, c)
			continue
		}

		// Consume both bytes of the pair at once. Because the second byte is
		// skipped, it can never be merged again with the byte after it, so a
		// run such as "aaa" yields "āa" instead of a truncated multi-byte
		// character.
		out = append(out, repl...)
		i++
		changed = true
	}

	// Nothing was converted: report failure so the caller keeps the original.
	if !changed {
		return "", false
	}
	return string(out), true
}

// replacePaliWordsInVelthuisSchemeWithUnicode is the per-match callback used
// with the word regexp. It lower-cases the match and tries to convert it with
// velthuis2unicode. The original bytes are returned untouched when the match
// is a single byte, when its lower-cased form ends in ".net", ".tm" or ".lk",
// or when the conversion reports failure. On success the mapping is printed
// with println and the converted (lower-cased) bytes are returned.
func replacePaliWordsInVelthuisSchemeWithUnicode(b []byte) []byte {
	if len(b) == 1 {
		return b
	}

	word := strings.ToLower(string(b))
	for _, suffix := range []string{".net", ".tm", ".lk"} {
		if strings.HasSuffix(word, suffix) {
			return b
		}
	}

	converted, ok := velthuis2unicode(word)
	if !ok {
		return b
	}
	println(word + " => " + converted)
	return []byte(converted)
}

func ProcessBytes(b []byte) []byte {
	return paliWordsInVelthuisScheme.ReplaceAllFunc(b,
		replacePaliWordsInVelthuisSchemeWithUnicode)
}
replace_test.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
package velthuis

import (
	"io/ioutil"
	"testing"
)

// TestProcessBytes is a smoke test: it reads a reStructuredText article from
// disk and runs ProcessBytes over its contents. The result is discarded, so
// the test only checks that processing completes; conversions are visible
// through the println output of the replacement callback.
func TestProcessBytes(t *testing.T) {
	// NOTE(review): absolute, machine-specific path; the test fails on any
	// machine where this file does not exist.
	path := "/home/siongui/dev/nanda/content/articles/tipitaka/tipitaka%zh.rst"
	b, err := ioutil.ReadFile(path)
	if err != nil {
		// Report through the testing framework instead of panicking, so the
		// failure is attributed to this test and other tests still run.
		t.Fatalf("reading %s: %v", path, err)
	}
	ProcessBytes(b)
	// Uncomment to write the converted text back to the file in place.
	//b2 := ProcessBytes(b)
	//ioutil.WriteFile(path, b2, 0644)
}

Tested on: Ubuntu Linux 16.04, Go 1.6.2.


References:

[1][Golang] Find Pāli Word in Velthuis Scheme
[2]regex - Regular Expression to match only alphabetic characters - Stack Overflow
[3][Golang] Iterate Over UTF-8 Strings (non-ASCII strings)
[4]GitHub - matko/emacs-pali-velthuis: emacs input method to write pali in the latin alphabet augmented with diacritical marks, using the Velthuis method
[5]