Merge pull request #17 from vunb/v1

Release version 1.0
vunb · Nov 6, 2017 · 47f6fa8 · 47f6fa8
2 parents 372aed8 + f79c600
commit 47f6fa8
Show file tree

Hide file tree

Showing 37 changed files with 824 additions and 131,635 deletions.
diff --git a/.gitignore b/.gitignore
@@ -121,4 +121,5 @@ nbproject
 obj
 # build
 deps
-package-lock.json
+package-lock.json
+test.js
diff --git a/.travis.yml b/.travis.yml
@@ -1,3 +1,6 @@
 language: node_js
 node_js:
-  - "4.4"
+- "4.4"
+
+install:
+- npm install --update-binary
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # VNTK
 
-Vietnamese language toolkit
+Vietnamese NLP Toolkit for Node
 
 [![npm version](https://img.shields.io/npm/v/vntk.svg?style=flat)](https://www.npmjs.com/package/vntk)
 [![Travis](https://travis-ci.org/Vunb/vntk.svg?branch=master)](https://travis-ci.org/Vunb/vntk)
@@ -12,65 +12,155 @@ Vietnamese language toolkit
 1. Install [Node.js](http://nodejs.org/)
 2. Run: `$ npm install -g vntk`
 
+If you are interested in contributing to **vntk**, or just hacking on it, then fork it away!
+
 # API Usage
 
-Cách sử dụng các api / tiện ích dòng lệnh hỗ trợ xử lý tiếng Việt.
+* [1. Tokenizer](#1-tokenizer)
+* [2. Word Segmentation](#2-word-segmentation)
+* [3. POS Tagging](#3-pos-tagging)
+* [4. Chunking](#4-chunking)
+* [5. Named Entity Recognition](#5-named-entity-recognition)
+* [6. Utility](#6-utility)
 
-## Tiện ích tách từ
+## 1. Tokenizer
 
-Chạy một tiện ích trong `vntk`, ví dụ cho bài toán tách từ tiếng Việt (word segmentation) như sau:
+> Word Tokenizer using Regular Expression.  
+> Tokenizer is provided to break text into arrays of tokens!
 
-### Xử lý input là một chuỗi
-```bash
-$ vntk ws "Chào mừng bạn đến với đất nước Việt Nam"
-$ Chào mừng bạn đến với đất_nước Việt_Nam
-```
+Example:
 
-### Xử lý input là một tệp tin
-```bash
-$ vntk ws demo.txt another.txt -f
-$ Result: demo.txt.seg, another.txt.seg
+```js
+var vntk = require('vntk');
+var tokenizer = vntk.tokenizer;
+
+console.log(tokenizer.tokenize('Giá khuyến mãi: 140.000đ / kg  ==> giảm được 20%'))
+// [ 'Giá', 'khuyến', 'mãi', ':', '140.000', 'đ', '/', 'kg', '==>', 'giảm', 'được', '20', '%' ]
+
+console.log(tokenizer.stokenize('Giá khuyến mãi: 140.000đ / kg  ==> giảm được 20%'))
+// Giá khuyến mãi : 140.000 đ / kg ==> giảm được 20 %
 ```
 
-### Xử lý như một thư viện
-```javascript
-var vntk = require("vntk");
-var ws = vntk.ws();
+Command line: `vntk tok <file_name.txt>`
+
+## 2. Word Segmentation
 
-ws.segment("Chào mừng bạn đến với đất nước Việt Nam");
-// Output: Chào mừng bạn đến với đất_nước Việt_Nam
+> Vietnamese Word Segmentation using Conditional Random Fields, called: `word_sent`.  
+> Word_Sent helps break text into arrays of words!
 
-ws.segmentF("ws_demo.txt");
-// Output: ws_demo.txt.seg
+```js
+var vntk = require('vntk');
+var word_sent = vntk.word_sent;
 
-ws.tokenize('Xin chào Việt Nam')
-// Output: ['Xin', 'chào', 'Việt Nam']
+console.log(word_sent.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội'))
+// [ 'Chào mừng', 'các', 'bạn', 'trẻ', 'tới', 'thành phố', 'Hà Nội' ]
 
+console.log(word_sent.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội', 'text'))
+// Chào_mừng các bạn trẻ tới thành_phố Hà_Nội
 ```
 
-## Tiện ích làm sạch văn bản
+Command line: `vntk ws <file_name.txt>`
+
+## 3. POS Tagging
+
+> Vietnamese Part of Speech Tagging using Conditional Random Fields, called: `pos_tag`.  
+> Pos_Tag helps label the part of speech of each word in a sentence!
+
+```js
+var vntk = require('vntk');
+var pos_tag = vntk.pos_tag;
+
+console.log(pos_tag.tag('Chợ thịt chó nổi tiếng ở TP Hồ Chí Minh bị truy quét'))
+// [ [ 'Chợ', 'N' ],
+//   [ 'thịt', 'N' ],
+//   [ 'chó', 'N' ],
+//   [ 'nổi tiếng', 'A' ],
+//   [ 'ở', 'E' ],
+//   [ 'TP', 'N' ],
+//   [ 'Hồ', 'Np' ],
+//   [ 'Chí', 'Np' ],
+//   [ 'Minh', 'Np' ],
+//   [ 'bị', 'V' ],
+//   [ 'truy quét', 'V' ] ]
+```
 
-### Xử lý input là một chuỗi
-```bash
-$ vntk clean "<span style='color: #4b67a1;'>Xin chào!!!</span>"
-$ Xin chào!!!
+Command line: `vntk pos <file_name.txt>`
+
+## 4. Chunking
+
+> Vietnamese Chunking using Conditional Random Fields  
+> Chunking helps label the parts of speech and the short phrases (like noun phrases) of a sentence!
+
+```js
+var vntk = require('vntk');
+var chunking = vntk.chunking;
+
+console.log(chunking.tag('Nhật ký SEA Games ngày 21/8: Ánh Viên thắng giòn giã ở vòng loại.'))
+// [ [ 'Nhật ký', 'N', 'B-NP' ],
+//   [ 'SEA', 'N', 'B-NP' ],
+//   [ 'Games', 'Np', 'B-NP' ],
+//   [ 'ngày', 'N', 'B-NP' ],
+//   [ '21/8', 'M', 'B-NP' ],
+//   [ ':', 'CH', 'O' ],
+//   [ 'Ánh', 'Np', 'B-NP' ],
+//   [ 'Viên', 'Np', 'I-NP' ],
+//   [ 'thắng', 'V', 'B-VP' ],
+//   [ 'giòn giã', 'N', 'B-NP' ],
+//   [ 'ở', 'E', 'B-PP' ],
+//   [ 'vòng', 'N', 'B-NP' ],
+//   [ 'loại', 'N', 'B-NP' ],
+//   [ '.', 'CH', 'O' ] ]
 ```
 
-### Xử lý input là một tệp tin
-```bash
-$ vntk clean demo.html another.html -f
-$ Result: demo.html.cleaned, another.html.cleaned
+Command line: `vntk chunk <file_name.txt>`
+
+## 5. Named Entity Recognition
+
+> Vietnamese Named Entity Recognition (NER) using Conditional Random Fields  
+> In NER, your goal is to find named entities, which tend to be noun phrases (though aren't always)
+
+```js
+var vntk = require('vntk');
+var ner = vntk.ner;
+
+console.log(ner.tag('Chưa tiết lộ lịch trình tới Việt Nam của Tổng thống Mỹ Donald Trump'))
+// [ [ 'Chưa', 'R', 'O', 'O' ],
+//   [ 'tiết lộ', 'V', 'B-VP', 'O' ],
+//   [ 'lịch trình', 'V', 'B-VP', 'O' ],
+//   [ 'tới', 'E', 'B-PP', 'O' ],
+//   [ 'Việt Nam', 'Np', 'B-NP', 'B-LOC' ],
+//   [ 'của', 'E', 'B-PP', 'O' ],
+//   [ 'Tổng thống', 'N', 'B-NP', 'O' ],
+//   [ 'Mỹ', 'Np', 'B-NP', 'B-LOC' ],
+//   [ 'Donald', 'Np', 'B-NP', 'B-PER' ],
+//   [ 'Trump', 'Np', 'B-NP', 'I-PER' ] ]
 ```
 
-### Xử lý như một thư viện
+Command line: `vntk ner <file_name.txt>`
+
+## 6. Utility
+
+### Clean html
+
 ```javascript
 var vntk = require("vntk");
 var util = vntk.util;
 
 util.clean_html("<span style='color: #4b67a1;'>Xin chào!!!</span>");
-// Output: Xin chào!!!
+// Xin chào!!!
 ```
 
+```bash
+# command line
+vntk clean <file_name1.txt>
+```
+
+# Contributing
+
+Pull requests and stars are highly welcome.
+
+For bugs and feature requests, please [create an issue](https://github.com/vunb/vntk/issues/new).
+
 LICENSE
 ========
 

diff --git a/appveyor.yml b/appveyor.yml
@@ -7,7 +7,7 @@ install:
   # Get the latest stable version of Node.js or io.js
   - ps: Install-Product node $env:nodejs_version
   # install modules
-  - npm install
+  - npm install --update-binary
 
 # Post-install test scripts.
 test_script:

diff --git a/lib/chunking/index.js b/lib/chunking/index.js
@@ -0,0 +1,55 @@
+'use strict';
+const path = require('path');
+const crfsuite = require('crfsuite');
+const tokenizer = require('../tokenizer');
+const pos_tag = require('../pos_tag');
+const fe = require('../features');
+
+const logger = require('../logger')('Chucking');
+
+class Chucking { // Chunker: runs a crfsuite tagger on top of pos_tag output (the class name is spelled "Chucking" in this file).
+    // Creates the tagger and opens ./model.bin next to this file; runs when the module-level instance below is built.
+    constructor() {
+        this.tagger = crfsuite.Tagger();
+        this.logger = logger;
+        // The model path is resolved against __dirname, so it does not depend on the process working directory.
+        this.model_filename = path.resolve(__dirname, './model.bin');
+        if (this.tagger.open(this.model_filename)) {
+            logger.info(`open ${this.model_filename} success!`);
+        }
+        // NOTE(review): no branch handles open() returning a falsy value, so nothing is logged in that case.
+    }
+    // Feature template names handed to fe.word2features by transform(); a new array is built on every access.
+    get template() {
+        return [
+            "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower", // NOTE(review): presumably lowercased word forms in a +/-2 window; confirm in ../features
+            "T[0].istitle", "T[-1].istitle", "T[1].istitle", // NOTE(review): presumably title-case flags for current/previous/next token; confirm in ../features
+            // word unigrams and bigrams
+            "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
+            "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
+            // POS unigrams and bigrams (index 1 of each token triple built in tag())
+            "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]",
+            "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]",
+            // chunk slot (index 2) of the three preceding tokens; tag() fills that slot with 'X' for every token
+            "T[-3][2]", "T[-2][2]", "T[-1][2]",
+        ];
+    }
+    // Chunks `text`: returns one [word, pos, chunkTag] triple per entry produced by pos_tag.tag(text).
+    tag(text) {
+        let pos_tags = pos_tag.tag(text);
+        let tokens = pos_tags.map((tags) => {
+            return [tags[0], tags[1], 'X'] // 'X' is a placeholder in the third (chunk) slot
+        });
+        // Convert the triples to per-token features, then tag them; labels are read back by token index below.
+        let x = this.transform(tokens);
+        let tags = this.tagger.tag(x);
+        return pos_tags.map((pos_tags, index) => [pos_tags[0], pos_tags[1], tags[index]]); // callback parameter shadows the outer pos_tags array
+    }
+    // Builds the tagger input: one fe.word2features(tokens, i, template) result per token.
+    transform(tokens) {
+        let template = this.template; // read the getter once instead of once per token
+        return tokens.map((token, i) => fe.word2features(tokens, i, template));
+    }
+}
+// Export a single shared instance; requiring this module constructs it and therefore opens the model file.
+module.exports = new Chucking();
diff --git a/lib/chunking/model.bin b/lib/chunking/model.bin
diff --git a/lib/features/index.js b/lib/features/index.js
@@ -0,0 +1,2 @@
+// Features Engineering (default)
+module.exports = require('./word_features');
-Original file line number
+Diff line change
@@ Expand Up / @@ -121,4 +121,5 @@ nbproject @@
     obj
     # build
     deps
-    package-lock.json
+    package-lock.json
+    test.js
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		// Features Engineering (default)
		module.exports = require('./word_features');