-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
141 lines (121 loc) · 3.39 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
package main
import (
"bufio"
"encoding/gob"
"flag"
"fmt"
"github.com/pkg/errors"
"io"
"log"
_ "net/http/pprof"
"os"
)
// Command-line flags: input file location, number of slice files the input
// is partitioned into, and the initial capacity hint for each WordsMap.
var inputFlag = flag.String("input", "input.txt", "path to the input file")
var nSlice = flag.Int("count", 10, "number of the slice files")
var defaultMapLen = flag.Int("maplen", 10000, "default length of Map")
// main splits the input file into slice files, then scans each slice to
// locate the first non-repeating word (the unique word with the smallest
// sequence number across all slices).
func main() {
	// For debugging, set GODEBUG="gctrace=1" before running,
	// or uncomment the following line (see debug.go for more detail):
	// DEBUG = true
	flag.Parse()

	fmt.Println("------------------ step 1: split input file -------------------------")
	seqTotal, err := SplitInput(*inputFlag, *nSlice)
	if err != nil {
		log.Fatalf("Failed to split input file. %T:%v\n", err, err)
	}

	fmt.Println("------------ step 2: find the first non-repeating word --------------")
	worker := NewDecodeWorker(*nSlice)
	defer worker.CloseAllFiles()

	// Sentinel candidate with Seq = seqTotal; a slice's minimum beats it
	// only with a strictly smaller Seq.
	// NOTE(review): a unique word whose Seq equals seqTotal would not beat
	// this sentinel — confirm FindMinSeqWord's not-found convention.
	best := WordDict{"", CountIndex{0, seqTotal}}

	// Replay each slice file in turn, rebuilding its hashmap.
	for _, dec := range worker.decoders {
		uniqueWords := BuildUniqueWordsMap(dec)
		// minimum-sequence unique word within this slice file
		if candidate := uniqueWords.FindMinSeqWord(seqTotal); candidate.Seq < best.Seq {
			best = candidate
		}
		uniqueWords = nil // release the map so the GC can reclaim it before the next slice
		debug(printMemStats)
	}
	fmt.Println(best)
}
// SplitInput read each word in the file,
// store it in hashmap,
// and write it into different slice file.
func SplitInput(filename string, nSlice int) (int, error) {
encWorker := NewEncodeWorker(nSlice)
defer encWorker.CloseAllFile()
defer encWorker.FlushAll()
f, err := os.Open(filename)
if err != nil {
return -1, errors.New("Failed to read file")
}
defer closeFile(f)
r := bufio.NewReader(f)
fi, err := f.Stat()
totalSize := int(fi.Size())
sliceSize := totalSize / nSlice
currentSize := 0
wordsMap := make(WordsMap, *defaultMapLen)
byts := make([]byte, 0, 32)
seq := 0 // sequence number of each word
for {
b, err := r.ReadByte()
if err != nil {
if err == io.EOF {
// the input file has been read out
break
} else {
return -1, err
}
}
if isLetter(b) {
byts = append(byts, b)
} else {
if len(byts) != 0 {
// Save the word to WordMap
word := string(byts)
seq++
wordsMap.Add(word, seq)
byts = byts[:0]
}
}
if currentSize > sliceSize {
// for avoiding memory limit exceeded,
// save WordsMap to disk and free up memory as needed
must(encWorker.SaveWordsMap(&wordsMap))
debug(printMemStats)
wordsMap = make(WordsMap, *defaultMapLen)
currentSize = 0
}
currentSize++
}
must(encWorker.SaveWordsMap(&wordsMap))
debug(printMemStats)
return seq, nil
}
// BuildUniqueWordsMap read all data in a slice file,
// merge duplicates and rebuild the WordsMap with unique words
// BuildUniqueWordsMap reads every WordDict record from a slice file through
// dec and rebuilds a WordsMap of unique words: duplicate words have their
// counts summed, keeping the sequence number of the first record seen for
// that word. It returns a pointer to the rebuilt map.
//
// Any decode error other than io.EOF terminates the process via log.Fatalf.
func BuildUniqueWordsMap(dec *gob.Decoder) *WordsMap {
	uniqueWordsMap := make(WordsMap, *defaultMapLen)
	for {
		var wd WordDict
		if err := dec.Decode(&wd); err != nil {
			if err == io.EOF {
				break
			}
			// Bug fix: format string previously ended with "/n" instead of "\n".
			log.Fatalf("Failed to read tmp file.%T:%v\n", err, err)
		}
		if ci, ok := uniqueWordsMap[wd.Word]; ok {
			// Duplicate word: accumulate the count, keep the earlier Seq.
			uniqueWordsMap[wd.Word] = CountIndex{wd.Count + ci.Count, ci.Seq}
		} else {
			uniqueWordsMap[wd.Word] = CountIndex{wd.Count, wd.Seq}
		}
	}
	return &uniqueWordsMap
}