Skip to content

Commit

Permalink
Added some more functions.
Browse files Browse the repository at this point in the history
Added: grapheme_break, properties and property.
  • Loading branch information
Mehgugs committed Apr 18, 2022
1 parent 7041f1c commit 35c0067
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
*.exp
*.lib
*.def
.vscode/**
.vscode/**
test/**
87 changes: 85 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,93 @@ Performs a `map` using options suitable for unicode normalization.

Gets the Unicode category for the given codepoint.

- *integer (codepoint)* codepoint
- *integer (codepoint)* `codepoint`

#### *string (unicode category)* `category_string(codepoint)`

Gets the two character Unicode category signifier for the given codepoint.

- *integer (codepoint)* codepoint
- *integer (codepoint)* `codepoint`

#### *boolean, integer (state)* `grapheme_break(codepoint_1, codepoint_2, state)`

Given a pair of consecutive codepoints, return whether a grapheme break is permitted between them (as defined by the extended grapheme clusters in UAX#29).
See below for an example of how to use this function.

- *integer (codepoint)* `codepoint_1`
- *integer (codepoint)* `codepoint_2` The codepoint adjacent to `codepoint_1` in the string.

- *integer* `state` Since Unicode 9.0.0, the grapheme-breaking algorithm needs
state carried between calls. Pass this state in as an integer, initialized to 0
for the first pair, and pass the returned state to the next call.

#### *table (utf8proc_property_t)* `properties(codepoint)`

Returns a table containing the fields of the given codepoint's `utf8proc_property_t` struct.

- *integer (codepoint)* `codepoint`

#### *integer|boolean* `property(codepoint, field)`

Returns a field from the given codepoint's `utf8proc_property_t` struct.

- *integer (codepoint)* `codepoint`
- *string* `field?` The field you wish to select. If you do not provide a field,
the function returns the category field.


### Iterating over the graphemes in a string:

This example could probably be tuned into a much more efficient procedure, but
it is written this way to illustrate the principle.

```lua

-- Coroutine helpers: used to turn the grapheme walker into an iterator.
local wrap = coroutine.wrap
local yield = coroutine.yield

-- Table helpers for the codepoint buffer (table.unpack requires Lua 5.2+).
local insert = table.insert
local unpack = table.unpack

-- Standard utf8 library (Lua 5.3+): iterate codepoints / encode codepoints.
local codes = utf8.codes
local char = utf8.char

local lunicode = require"lunicode"

-- Cached so the loop below does not index the module table on every codepoint.
local grapheme_break = lunicode.grapheme_break


local graphemes do
    --- Coroutine body: walks the codepoints of `string` and yields each
    -- grapheme cluster as a UTF-8 encoded string.
    -- @tparam string string  UTF-8 text to split
    local function graphemes_iterator(string)
        local collection = {}   -- codepoints of the cluster being built
        local n, state = 0, 0   -- n: #collection; state: grapheme_break state
        local broken

        for _, code in codes(string) do
            -- The very first codepoint has no predecessor to compare with,
            -- so only ask for a break once the buffer is non-empty.
            if n > 0 then
                broken, state = grapheme_break(collection[n], code, state)

                if broken then
                    yield(char(unpack(collection, 1, n)))
                    collection = {}
                    n = 0
                end
            end

            n = n + 1
            collection[n] = code
        end

        -- Flush the trailing cluster: no break is ever reported after the
        -- last codepoint, so without this the final grapheme would be lost.
        if n > 0 then
            yield(char(unpack(collection, 1, n)))
        end
    end

    --- Returns an iterator over the grapheme clusters of `string`, for use as
    -- `for grapheme in graphemes(s) do ... end`.
    -- The string is handed to the coroutine as the generic-for state value.
    -- @tparam string string  UTF-8 text to split
    function graphemes(string) return wrap(graphemes_iterator), string end
end

-- The above function should illustrate how to use the information returned by grapheme_break, when given codepoints and the state.

-- For those curious, a 'cuter' way to get an iterator is the following:

--- Grapheme iterator built on lunicode.map.
-- Maps the string with the CHARBOUND option and then matches every run of
-- bytes that is not "\xFF" (presumably the boundary marker CHARBOUND inserts
-- between grapheme clusters -- confirm against the utf8proc documentation).
-- @tparam string str  UTF-8 text to split
-- @return a gmatch iterator over the marker-free runs
local function graphemes_cute(str)
    local marked = lunicode.map(str, {CHARBOUND = true})
    return marked:gmatch("[^\xFF]+")
end

```
129 changes: 127 additions & 2 deletions src/lunicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <stddef.h>
#include <string.h>

/* True when x lies in [0, 0x10FFFF] and utf8proc_codepoint_valid accepts it.
 * Note: x is expanded three times, so pass only side-effect-free expressions. */
#define all_good(x) ((0 <= (x) && (x) <= 0x10FFFF) && utf8proc_codepoint_valid((x)))

/* Raises a Lua error whose message embeds utf8proc's description of `code`.
 * Returns whatever luaL_error returns so callers can write
 * `return lunicode_error(L, code);`. */
static int lunicode_error(lua_State* L, utf8proc_ssize_t code) {
    const char* reason = utf8proc_errmsg(code);
    return luaL_error(L, "lunicode: utf8proc error: %s", reason);
}
Expand Down Expand Up @@ -159,7 +161,7 @@ static int lunicode_normalize(lua_State* L) {
static int lunicode_category(lua_State* L) {
lua_Integer i = luaL_checkinteger(L, 1);

if ((0 <= i && i <= 0x10FFFF) && utf8proc_codepoint_valid(i)) {
if (all_good(i)) {
lua_pushinteger(L, utf8proc_category(i));
} else {
return luaL_error(L, "lunicode.category: Invalid codepoint %d", i);
Expand All @@ -171,7 +173,7 @@ static int lunicode_category(lua_State* L) {
static int lunicode_category_string(lua_State* L) {
lua_Integer i = luaL_checkinteger(L, 1);

if ((0 <= i && i <= 0x10FFFF) && utf8proc_codepoint_valid(i)) {
if (all_good(i)) {
lua_pushlstring(L, utf8proc_category_string(i), 2);
} else {
return luaL_error(L, "lunicode.category: Invalid codepoint %d", i);
Expand All @@ -180,12 +182,135 @@ static int lunicode_category_string(lua_State* L) {
return 1;
}

/*
 * lunicode.grapheme_break(codepoint_1, codepoint_2 [, state])
 *
 * Lua arguments:
 *   1: integer  first codepoint
 *   2: integer  codepoint that follows it in the text
 *   3: integer  break state from the previous call (optional, defaults to 0)
 *
 * Lua returns: boolean (result of utf8proc_grapheme_break_stateful for the
 * pair) and the updated integer state to pass to the next call.
 *
 * Raises a Lua error when either codepoint fails the all_good check.
 */
static int lunicode_grapheme_break(lua_State* L) {
    lua_Integer i = luaL_checkinteger(L, 1);
    lua_Integer j = luaL_checkinteger(L, 2);
    /* utf8proc updates the state through a utf8proc_int32_t*, so keep it in a
     * variable of exactly that type rather than aliasing a lua_Integer. */
    utf8proc_int32_t state = (utf8proc_int32_t)luaL_optinteger(L, 3, 0);
    utf8proc_bool out;

    if (!(all_good(i) && all_good(j))) {
        /* %I is the lua_pushfstring specifier for lua_Integer; %d expects int. */
        return luaL_error(L, "lunicode.grapheme_break: Invalid codepoints %I %I", i, j);
    }

    out = utf8proc_grapheme_break_stateful((utf8proc_int32_t)i, (utf8proc_int32_t)j, &state);
    lua_pushboolean(L, out);
    lua_pushinteger(L, (lua_Integer)state);
    return 2;
}

/*
 * lunicode.properties(codepoint)
 *
 * Lua argument 1: integer codepoint.
 * Lua returns: a new table holding nine fields copied from the codepoint's
 * utf8proc_property_t record: category, combining_class, bidi_class,
 * charwidth, pad and boundclass as integers; mirrored, ignorable and
 * control_boundary as booleans.
 *
 * Raises a Lua error when the codepoint fails the all_good check.
 */
static int lunicode_properties(lua_State* L) {
    lua_Integer i = luaL_checkinteger(L, 1);
    /* utf8proc hands back a pointer into its own tables; keep it const. */
    const utf8proc_property_t* props;

    if (!all_good(i)) {
        /* lua_pushfstring has no %x conversion; %I formats a lua_Integer. */
        return luaL_error(L, "lunicode.properties: Invalid codepoint %I", i);
    }

    props = utf8proc_get_property((utf8proc_int32_t)i);

    /* No array part, nine named fields. */
    lua_createtable(L, 0, 9);

    lua_pushinteger(L, props->category);
    lua_setfield(L, -2, "category");

    lua_pushinteger(L, props->combining_class);
    lua_setfield(L, -2, "combining_class");

    lua_pushinteger(L, props->bidi_class);
    lua_setfield(L, -2, "bidi_class");

    lua_pushboolean(L, props->bidi_mirrored);
    lua_setfield(L, -2, "mirrored");

    lua_pushboolean(L, props->ignorable);
    lua_setfield(L, -2, "ignorable");

    lua_pushboolean(L, props->control_boundary);
    lua_setfield(L, -2, "control_boundary");

    lua_pushinteger(L, props->charwidth);
    lua_setfield(L, -2, "charwidth");

    lua_pushinteger(L, props->pad);
    lua_setfield(L, -2, "pad");

    lua_pushinteger(L, props->boundclass);
    lua_setfield(L, -2, "boundclass");

    return 1;
}

/*
 * lunicode.property(codepoint [, field])
 *
 * Lua arguments:
 *   1: integer codepoint
 *   2: string  field name (optional); one of the OPTIONS entries below.
 *              When omitted, the category field is returned.
 *
 * Lua returns: the selected field of the codepoint's utf8proc_property_t
 * record, as an integer or (for mirrored, ignorable, control_boundary) a
 * boolean.
 *
 * Raises a Lua error when the codepoint fails the all_good check, or (via
 * luaL_checkoption) when the field name is not in OPTIONS.
 */
static int lunicode_property(lua_State* L) {
    /* The position of each name is the case label used in the switch below.
     * "_missing" is the sentinel default chosen when no field is supplied. */
    static const char* const OPTIONS[] = {
        "category",
        "combining_class",
        "bidi_class",
        "mirrored",
        "ignorable",
        "control_boundary",
        "charwidth",
        "pad",
        "boundclass",
        "_missing", NULL};

    lua_Integer i = luaL_checkinteger(L, 1);
    const utf8proc_property_t* props;
    int idx;

    if (!all_good(i)) {
        /* lua_pushfstring has no %x conversion; %I formats a lua_Integer. */
        return luaL_error(L, "lunicode.property: Invalid codepoint %I", i);
    }

    /* Pass the array itself (it decays to const char *const *), not &OPTIONS,
     * which has pointer-to-array type. */
    idx = luaL_checkoption(L, 2, OPTIONS[9], OPTIONS);
    props = utf8proc_get_property((utf8proc_int32_t)i);

    switch (idx) {
        case 1:
            lua_pushinteger(L, props->combining_class);
            break;
        case 2:
            lua_pushinteger(L, props->bidi_class);
            break;
        case 3:
            lua_pushboolean(L, props->bidi_mirrored);
            break;
        case 4:
            lua_pushboolean(L, props->ignorable);
            break;
        case 5:
            lua_pushboolean(L, props->control_boundary);
            break;
        case 6:
            lua_pushinteger(L, props->charwidth);
            break;
        case 7:
            lua_pushinteger(L, props->pad);
            break;
        case 8:
            lua_pushinteger(L, props->boundclass);
            break;
        case 0:   /* "category" */
        case 9:   /* "_missing": no field given */
        default:
            lua_pushinteger(L, props->category);
            break;
    }

    return 1;
}

/* Name -> C function table for the functions this module exposes to Lua.
 * The {NULL, NULL} entry is the terminator luaL_Reg arrays require.
 * NOTE(review): presumably registered by the module's luaopen function,
 * which is outside this view -- confirm. */
static const luaL_Reg lunicode_methods[] = {
{"valid", lunicode_isvalid},
{"map", lunicode_map},
{"normalize", lunicode_normalize},
{"category", lunicode_category},
{"category_string", lunicode_category_string},
{"grapheme_break", lunicode_grapheme_break},
{"properties", lunicode_properties},
{"property", lunicode_property},
{NULL, NULL}
};

Expand Down

0 comments on commit 35c0067

Please sign in to comment.