-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.go
77 lines (67 loc) · 1.28 KB
/
util.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package simhash
import (
"bytes"
"strings"
"unicode/utf8"
)
// encodeRune returns the UTF-8 encoding of r as a freshly allocated byte
// slice. The result is between 1 and utf8.UTFMax bytes long; runes that
// cannot be encoded are written as the encoding of utf8.RuneError.
func encodeRune(r rune) []byte {
	// A fixed-size scratch array is large enough for any single rune.
	var scratch [utf8.UTFMax]byte
	size := utf8.EncodeRune(scratch[:], r)
	return scratch[:size]
}
// encodeRunes returns the concatenated UTF-8 encoding of every rune in rs.
//
// The buffer is allocated up front for the worst case of utf8.UTFMax bytes
// per rune, and the returned slice is trimmed to the bytes actually written.
func encodeRunes(rs []rune) []byte {
	buf := make([]byte, len(rs)*utf8.UTFMax)
	written := 0
	for i := range rs {
		// EncodeRune reports how many bytes it used; advance past them.
		written += utf8.EncodeRune(buf[written:], rs[i])
	}
	return buf[:written]
}
// encodeStringsV1 returns the bytes of all strings in vs concatenated in
// order, with no separator between them.
//
// The total length is computed first so the result is allocated exactly once.
func encodeStringsV1(vs []string) []byte {
	total := 0
	for i := range vs {
		total += len(vs[i])
	}

	// Capacity equals the final length, so the appends below never reallocate.
	out := make([]byte, 0, total)
	for _, s := range vs {
		out = append(out, s...)
	}
	return out
}
// encodeStringsV2 returns the bytes of all strings in as joined by a single
// '-' byte. No separator is written before the first or after the last
// element.
func encodeStringsV2(as []string) []byte {
	var buf bytes.Buffer
	if len(as) == 0 {
		return buf.Bytes()
	}

	// Write the head on its own so every remaining element is simply
	// "separator, then value".
	buf.WriteString(as[0])
	for _, s := range as[1:] {
		buf.WriteByte('-') // string separator
		buf.WriteString(s)
	}
	return buf.Bytes()
}
// punctuation is the set of runes treated as punctuation by
// runeIsPunctuation and removePunctuation. It is built once when the
// package-level variables are initialized.
var punctuation = mapByRunes([]rune(".,;:-!?"))
// mapByRunes builds a set containing every rune in rs. Duplicate runes
// collapse into a single entry, and the returned map is never nil.
func mapByRunes(rs []rune) map[rune]struct{} {
	set := map[rune]struct{}{}
	for i := 0; i < len(rs); i++ {
		set[rs[i]] = struct{}{}
	}
	return set
}
// runeIsPunctuation reports whether r is a member of the package-level
// punctuation set.
func runeIsPunctuation(r rune) bool {
	_, isMember := punctuation[r]
	return isMember
}
// removePunctuation is a mapping function for strings.Map: it returns r
// unchanged unless r is in the punctuation set, in which case it returns -1,
// the negative value that strings.Map interprets as "drop this character".
func removePunctuation(r rune) rune {
	_, drop := punctuation[r]
	if !drop {
		return r
	}
	return -1
}
// Words splits text into lower-cased words.
//
// Punctuation runes are deleted first (not replaced by a space), so tokens
// joined only by punctuation, such as "well-known", end up as one word. The
// remaining text is lower-cased and split on whitespace.
func Words(text string) []string {
	stripped := strings.Map(removePunctuation, text)
	lowered := strings.ToLower(stripped)
	return strings.Fields(lowered)
}