-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcunidecode.c
95 lines (71 loc) · 2.24 KB
/
cunidecode.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include <stdio.h>
#include <string.h>
#include <Python.h>
#include "data.h"
// Return the number of characters in a NUL-terminated string, not counting
// the terminator itself.
static int get_string_size(char *string) {
    const char *cursor = string;
    int length = 0;

    // Walk the string one character at a time until the terminator is hit.
    for (; *cursor != '\0'; ++cursor) {
        ++length;
    }
    return length;
}
// Build a new heap string holding str_head followed by str_tail.
//
// Ownership: str_head must be a malloc-family allocation (or NULL) and is
// ALWAYS consumed by this call -- it is freed on success and on failure, so
// the caller must only use the returned pointer afterwards. str_tail is only
// read and is never freed; it may point into str_head.
//
// Returns the newly allocated, NUL-terminated string (release it with
// free()), or NULL if str_head is NULL or the allocation fails. Passing the
// NULL result back in as str_head yields NULL again, so a failure propagates
// through a chain of appends instead of crashing.
static char *append(char *str_head, const char *str_tail) {
    if (str_head == NULL) {
        return NULL;
    }
    if (str_tail == NULL) {
        // Treat a missing tail as the empty string.
        str_tail = "";
    }

    size_t head_size = strlen(str_head);
    size_t tail_size = strlen(str_tail);

    char *ret_string = malloc(head_size + tail_size + 1);
    if (ret_string != NULL) {
        memcpy(ret_string, str_head, head_size);
        // Copy tail_size + 1 bytes so the tail's terminator comes along.
        memcpy(ret_string + head_size, str_tail, tail_size + 1);
    }

    // Free the head only after both copies: str_tail is allowed to alias it.
    free(str_head);
    return ret_string;
}
// Python-callable implementation of unidecode(unicode_string).
//
// Parses a single unicode argument, looks every code unit up in the `data`
// transliteration table (indexed as data[high byte][low byte]) and
// concatenates the resulting ASCII fragments into one string.
//
// Returns a new Python string object, or NULL with a Python exception set
// when argument parsing fails or memory runs out.
static PyObject *cunidecode_unidecode( PyObject *self, PyObject *args ) {
    Py_UNICODE *string;
    // NOTE(review): "u#" stores the length in an int only while
    // PY_SSIZE_T_CLEAN is not defined before <Python.h>; none is visible in
    // this file -- confirm before changing the include block.
    int string_size;

    if (!PyArg_ParseTuple(args, "u#", &string, &string_size)) {
        return NULL;
    }

    // Start from an empty heap string; append() consumes its first argument
    // and hands back a fresh allocation on every call.
    char *ret_string = malloc(1);
    if (ret_string == NULL) {
        return PyErr_NoMemory();
    }
    ret_string[0] = '\0';

    for (int i = 0; i < string_size; i++) {
        int unichar = string[i];
        char *temp_string;

        // Only the Basic Multilingual Plane is covered by the table. The
        // lower bound keeps a negative value from indexing out of range.
        if (unichar >= 0 && unichar < 65536) {
            int section = unichar >> 8;
            int position = unichar % 256;
            temp_string = data[section][position];
            if (temp_string == NULL) {
                // NOTE(review): data.h is not visible here; a NULL entry is
                // treated as "no transliteration" rather than dereferenced.
                temp_string = "";
            }
        } else {
            temp_string = "";
        }

        ret_string = append(ret_string, temp_string);
        if (ret_string == NULL) {
            // Nothing left to free: append() already consumed the old buffer.
            return PyErr_NoMemory();
        }
    }

    PyObject *ret_val = Py_BuildValue("s", ret_string);
    free(ret_string);
    return ret_val;
}
// Method table handed to Py_InitModule3: one callable, "unidecode", taking
// positional arguments, followed by the all-NULL entry that ends the table.
static PyMethodDef cunidecode_methods[] = {
    {
        "unidecode",
        (PyCFunction)cunidecode_unidecode,
        METH_VARARGS,
        "Transliterate an Unicode object into an ASCII string"
    },
    { NULL, NULL, 0, NULL }  // sentinel
};
// Module docstring passed to Py_InitModule3.
//
// Written as adjacent string literals with explicit newline escapes: a
// backslash at the end of a source line only joins the two lines and puts no
// newline into the string, which would collapse the text onto a single line.
// The backslashes in the example are doubled so the docstring shows the
// escape sequences themselves instead of the characters they name.
char *module_doc =
    "Transliterate Unicode text into plain 7-bit ASCII.\n"
    "\n"
    "Example usage:\n"
    ">>> from cunidecode import unidecode\n"
    ">>> unidecode(u'\\u5317\\u4EB0')\n"
    "'Bei Jing'\n"
    "\n"
    "The transliteration uses a straightforward map, and doesn't have alternatives\n"
    "for the same character based on language, position, or anything else.\n";
// Module initialisation entry point: registers the "cunidecode" module with
// its method table and docstring.
//
// Declared with (void) so the definition is a real prototype.
PyMODINIT_FUNC initcunidecode(void) {
    // The module object is not needed afterwards. If registration fails
    // there is nothing to undo here, so the result is deliberately discarded.
    (void)Py_InitModule3("cunidecode", cunidecode_methods, module_doc);
}