-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunicode.lua
146 lines (126 loc) · 4.25 KB
/
unicode.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
local bit = require("bit")
local bor, band, lshift, rshift, bnot = bit.bor, bit.band, bit.lshift, bit.rshift, bit.bnot
-- Byte-order selectors passed as the `order` argument of the converters.
-- The converters only ever compare `order` against DE: when it matches, the
-- high byte of each 16-bit unit is placed first; any other value (including
-- LE) keeps the low byte first.
-- NOTE(review): DE is presumably meant to be "BE" (big-endian) — confirm
-- before renaming, since every converter in this file references it.
local LE = 0x1
local DE = 0x2
-- UTF-8 lead-byte signatures. A byte belongs to a class when
-- band(byte, BYTE_n_MASK) == BYTE_n_HEAD.
-- BYTE_TAIL_HEAD is also OR-ed into every continuation byte when encoding.
local BYTE_1_HEAD = 0x00 -- 0000 0000
local BYTE_2_HEAD = 0xC0 -- 1100 0000
local BYTE_3_HEAD = 0xE0 -- 1110 0000
local BYTE_TAIL_HEAD = 0x80 -- 1000 0000
-- Masks that isolate the signature bits of a lead byte for the comparison
-- above. BYTE_TAIL_MASK is different: it selects the six payload bits of a
-- continuation byte (or the low six bits of a code point when encoding).
local BYTE_1_MASK = 0x80 -- 1000 0000
local BYTE_2_MASK = 0xE0 -- 1110 0000
local BYTE_3_MASK = 0xF0 -- 1111 0000
local BYTE_TAIL_MASK = 0x3F -- 0011 1111
--- Turn a numeric byte value into a one-character string.
-- The conversion is delegated to the "%c" directive of string.format.
-- NOTE(review): callers in this file pass values wider than 8 bits (for
-- example a lead byte shifted left) and appear to rely on "%c" keeping only
-- the low byte — confirm that holds for the target Lua runtime.
-- @param b number to convert
-- @return string produced by formatting `b` with "%c"
local function Format_char(b)
  return ("%c"):format(b)
end
--- Convert a UTF-8 encoded string to UTF-16.
--
-- Output for well-formed 1-, 2- and 3-byte sequences is unchanged from the
-- previous implementation. In addition:
--   * 4-byte sequences (U+10000..U+10FFFF) are emitted as a surrogate pair;
--   * a byte that cannot start a sequence, a sequence whose continuation
--     bytes are missing or malformed, or a value above U+10FFFF produces
--     U+FFFD (REPLACEMENT CHARACTER) instead of hanging or raising.
-- Overlong encodings are decoded leniently rather than rejected.
--
-- @param utf8  string holding UTF-8 bytes
-- @param order byte order of the result: DE puts the high byte of every
--              16-bit unit first; any other value puts the low byte first
-- @return string of UTF-16 code units (two bytes each, no BOM)
function UTF8To16(utf8, order)
  assert(type(utf8) == 'string')
  local char, byte = string.char, string.byte
  local high_first = (order == DE)
  local result, count = {}, 0

  -- Append one 16-bit code unit in the requested byte order.
  local function emit(unit)
    local low, high = char(band(unit, 0xFF)), char(rshift(unit, 8))
    count = count + 1
    if high_first then
      result[count] = high .. low
    else
      result[count] = low .. high
    end
  end

  -- True when `b` is present and has the 10## #### continuation signature.
  local function is_tail(b)
    return b ~= nil and band(b, 0xC0) == BYTE_TAIL_HEAD
  end

  local i, len = 1, #utf8
  while i <= len do
    -- Bytes past the end of the string come back as nil and fail is_tail.
    local b1, b2, b3, b4 = byte(utf8, i, i + 3)
    -- Default: treat the byte as malformed and resynchronise on the next one.
    -- This guarantees progress on every iteration.
    local cp, size = 0xFFFD, 1

    if band(b1, BYTE_1_MASK) == BYTE_1_HEAD then -- 0### ####
      cp = b1
    elseif band(b1, BYTE_2_MASK) == BYTE_2_HEAD and is_tail(b2) then -- 110# ####
      cp = bor(lshift(band(b1, 0x1F), 6), band(b2, BYTE_TAIL_MASK))
      size = 2
    elseif band(b1, BYTE_3_MASK) == BYTE_3_HEAD
        and is_tail(b2) and is_tail(b3) then -- 1110 ####
      cp = bor(lshift(band(b1, 0x0F), 12),
               bor(lshift(band(b2, BYTE_TAIL_MASK), 6), band(b3, BYTE_TAIL_MASK)))
      size = 3
    elseif band(b1, 0xF8) == 0xF0
        and is_tail(b2) and is_tail(b3) and is_tail(b4) then -- 1111 0###
      cp = bor(bor(lshift(band(b1, 0x07), 18), lshift(band(b2, BYTE_TAIL_MASK), 12)),
               bor(lshift(band(b3, BYTE_TAIL_MASK), 6), band(b4, BYTE_TAIL_MASK)))
      size = 4
      -- Values beyond the Unicode range cannot be represented in UTF-16.
      if cp > 0x10FFFF then cp = 0xFFFD end
    end

    if cp >= 0x10000 then
      -- Supplementary plane: split into a high and a low surrogate.
      local offset = cp - 0x10000
      emit(0xD800 + rshift(offset, 10))
      emit(0xDC00 + band(offset, 0x3FF))
    else
      emit(cp)
    end
    i = i + size
  end
  return table.concat(result)
end
--- Convert a UTF-16 encoded string to UTF-8.
--
-- Output for Basic Multilingual Plane code units is unchanged from the
-- previous implementation. In addition:
--   * a high surrogate followed by a low surrogate is combined and written
--     as a single 4-byte UTF-8 sequence (previously two 3-byte sequences,
--     which is not valid UTF-8);
--   * an unpaired surrogate, or a dangling final byte in odd-length input,
--     produces U+FFFD (REPLACEMENT CHARACTER) instead of raising.
--
-- @param utf16 string holding UTF-16 code units, two bytes each
-- @param order byte order of the input: DE means the high byte of every
--              16-bit unit comes first; any other value means low byte first
-- @return string of UTF-8 bytes
function UTF16To8(utf16, order)
  assert(type(utf16) == 'string')
  local char, byte = string.char, string.byte
  local high_first = (order == DE)
  local len = #utf16

  -- Read the 16-bit unit starting at byte `pos`; nil when fewer than two
  -- bytes remain.
  local function unit_at(pos)
    local first, second = byte(utf16, pos, pos + 1)
    if second == nil then return nil end
    if high_first then
      return bor(lshift(first, 8), second)
    end
    return bor(lshift(second, 8), first)
  end

  -- Encode one code point (0..0x10FFFF) as UTF-8.
  local function encode(cp)
    if cp <= 0x7F then
      return char(cp)
    elseif cp <= 0x7FF then
      return char(bor(BYTE_2_HEAD, rshift(cp, 6)),
                  bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, cp)))
    elseif cp <= 0xFFFF then
      return char(bor(BYTE_3_HEAD, rshift(cp, 12)),
                  bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, rshift(cp, 6))),
                  bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, cp)))
    end
    return char(bor(0xF0, rshift(cp, 18)),
                bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, rshift(cp, 12))),
                bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, rshift(cp, 6))),
                bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, cp)))
  end

  local result, count = {}, 0
  local i = 1
  while i <= len do
    local cp, size = unit_at(i), 2
    if cp == nil then
      -- Odd-length input: the last byte is not a complete code unit.
      cp = 0xFFFD
    elseif cp >= 0xD800 and cp <= 0xDBFF then
      -- High surrogate: only valid when a low surrogate follows.
      local trail = unit_at(i + 2)
      if trail and trail >= 0xDC00 and trail <= 0xDFFF then
        cp = 0x10000 + lshift(cp - 0xD800, 10) + (trail - 0xDC00)
        size = 4
      else
        cp = 0xFFFD
      end
    elseif cp >= 0xDC00 and cp <= 0xDFFF then
      -- Low surrogate with no preceding high surrogate.
      cp = 0xFFFD
    end
    count = count + 1
    result[count] = encode(cp)
    i = i + size
  end
  return table.concat(result)
end
-- Tail Call
--- Tail-recursive UTF-8 to UTF-16 conversion.
--
-- Each decoded character is handled by one invocation of the inner helper,
-- which finishes with `return tail(...)` so the stack does not grow.
--
-- Output for well-formed 1-, 2- and 3-byte sequences is unchanged from the
-- previous implementation. Fixes relative to it:
--   * the helper and its scratch variables are now locals (they used to leak
--     into the global table as `tail`, `b2` and `b3`);
--   * an unrecognised lead byte no longer recurses forever — it yields
--     U+FFFD and the scan moves on by one byte;
--   * truncated or malformed sequences yield U+FFFD instead of raising;
--   * 4-byte sequences are emitted as a surrogate pair.
-- Overlong encodings are decoded leniently rather than rejected.
--
-- @param utf8  string holding UTF-8 bytes
-- @param order byte order of the result: DE puts the high byte of every
--              16-bit unit first; any other value puts the low byte first
-- @return string of UTF-16 code units (two bytes each, no BOM)
function UTF8To16_TailCall(utf8, order)
  assert(type(utf8) == 'string')
  local char, byte = string.char, string.byte
  local high_first = (order == DE)
  local len = #utf8

  -- Append one 16-bit code unit to `result` in the requested byte order.
  local function push(result, unit)
    local low, high = char(band(unit, 0xFF)), char(rshift(unit, 8))
    result[#result + 1] = high_first and (high .. low) or (low .. high)
  end

  -- True when `b` is present and has the 10## #### continuation signature.
  local function is_tail(b)
    return b ~= nil and band(b, 0xC0) == BYTE_TAIL_HEAD
  end

  local function tail(start, result)
    if start > len then
      return result
    end
    local b1, b2, b3, b4 = byte(utf8, start, start + 3)
    -- Fallback for malformed input; `size` is never 0, so the recursion
    -- always makes progress.
    local cp, size = 0xFFFD, 1

    if band(b1, BYTE_1_MASK) == BYTE_1_HEAD then -- 0### ####
      cp = b1
    elseif band(b1, BYTE_2_MASK) == BYTE_2_HEAD and is_tail(b2) then -- 110# ####
      cp = bor(lshift(band(b1, 0x1F), 6), band(b2, BYTE_TAIL_MASK))
      size = 2
    elseif band(b1, BYTE_3_MASK) == BYTE_3_HEAD
        and is_tail(b2) and is_tail(b3) then -- 1110 ####
      cp = bor(lshift(band(b1, 0x0F), 12),
               bor(lshift(band(b2, BYTE_TAIL_MASK), 6), band(b3, BYTE_TAIL_MASK)))
      size = 3
    elseif band(b1, 0xF8) == 0xF0
        and is_tail(b2) and is_tail(b3) and is_tail(b4) then -- 1111 0###
      cp = bor(bor(lshift(band(b1, 0x07), 18), lshift(band(b2, BYTE_TAIL_MASK), 12)),
               bor(lshift(band(b3, BYTE_TAIL_MASK), 6), band(b4, BYTE_TAIL_MASK)))
      size = 4
      -- Values beyond the Unicode range cannot be represented in UTF-16.
      if cp > 0x10FFFF then cp = 0xFFFD end
    end

    if cp >= 0x10000 then
      -- Supplementary plane: split into a high and a low surrogate.
      local offset = cp - 0x10000
      push(result, 0xD800 + rshift(offset, 10))
      push(result, 0xDC00 + band(offset, 0x3FF))
    else
      push(result, cp)
    end
    return tail(start + size, result)
  end

  return table.concat(tail(1, {}))
end
--- Tail-recursive UTF-16 to UTF-8 conversion.
--
-- Each code point is handled by one invocation of the inner helper, which
-- finishes with `return tail(...)` so the stack does not grow.
--
-- Output for Basic Multilingual Plane code units is unchanged from the
-- previous implementation. Fixes relative to it:
--   * the helper is now a local (it used to be assigned to the global
--     `tail`);
--   * a high surrogate followed by a low surrogate becomes one 4-byte UTF-8
--     sequence instead of two 3-byte sequences;
--   * an unpaired surrogate, or a dangling final byte in odd-length input,
--     yields U+FFFD instead of raising.
--
-- @param utf16 string holding UTF-16 code units, two bytes each
-- @param order byte order of the input: DE means the high byte of every
--              16-bit unit comes first; any other value means low byte first
-- @return string of UTF-8 bytes
function UTF16To8_TailCall(utf16, order)
  assert(type(utf16) == 'string')
  local char, byte = string.char, string.byte
  local high_first = (order == DE)
  local len = #utf16

  -- Read the 16-bit unit starting at byte `pos`; nil when fewer than two
  -- bytes remain.
  local function unit_at(pos)
    local first, second = byte(utf16, pos, pos + 1)
    if second == nil then return nil end
    if high_first then
      first, second = second, first
    end
    return bor(lshift(second, 8), first)
  end

  -- Encode one code point (0..0x10FFFF) as UTF-8.
  local function encode(cp)
    if cp <= 0x7F then
      return char(cp)
    elseif cp <= 0x7FF then
      return char(bor(BYTE_2_HEAD, rshift(cp, 6)),
                  bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, cp)))
    elseif cp <= 0xFFFF then
      return char(bor(BYTE_3_HEAD, rshift(cp, 12)),
                  bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, rshift(cp, 6))),
                  bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, cp)))
    end
    return char(bor(0xF0, rshift(cp, 18)),
                bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, rshift(cp, 12))),
                bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, rshift(cp, 6))),
                bor(BYTE_TAIL_HEAD, band(BYTE_TAIL_MASK, cp)))
  end

  local function tail(start, result)
    if start > len then
      return result
    end
    local cp, size = unit_at(start), 2
    if cp == nil then
      -- Odd-length input: the last byte is not a complete code unit.
      cp = 0xFFFD
    elseif cp >= 0xD800 and cp <= 0xDBFF then
      -- High surrogate: only valid when a low surrogate follows.
      local trail = unit_at(start + 2)
      if trail and trail >= 0xDC00 and trail <= 0xDFFF then
        cp = 0x10000 + lshift(cp - 0xD800, 10) + (trail - 0xDC00)
        size = 4
      else
        cp = 0xFFFD
      end
    elseif cp >= 0xDC00 and cp <= 0xDFFF then
      -- Low surrogate with no preceding high surrogate.
      cp = 0xFFFD
    end
    result[#result + 1] = encode(cp)
    return tail(start + size, result)
  end

  return table.concat(tail(1, {}))
end
-- Test
-- Round-trip smoke test, run when the file is loaded: print the sample text,
-- then print it again after converting UTF-8 -> UTF-16 -> UTF-8, first with
-- the tail-call converters and then with the loop-based ones.
local test = "a朱a"
print(test)
do
  local round_trips = {
    { UTF8To16_TailCall, UTF16To8_TailCall },
    { UTF8To16, UTF16To8 },
  }
  for _, codec in ipairs(round_trips) do
    local to_utf16, to_utf8 = codec[1], codec[2]
    local encoded = to_utf16(test, LE)
    local decoded = to_utf8(encoded, LE)
    print(decoded)
  end
end