-
Notifications
You must be signed in to change notification settings - Fork 197
/
Copy pathtest_remove_repeat_sentences_mapper.py
72 lines (58 loc) · 5.03 KB
/
test_remove_repeat_sentences_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# flake8: noqa: E501
import unittest
from data_juicer.core.data import NestedDataset as Dataset
from data_juicer.ops.mapper.remove_repeat_sentences_mapper import \
RemoveRepeatSentencesMapper
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
class RemoveRepeatSentencesMapperTest(DataJuicerTestCaseBase):
    """Checks RemoveRepeatSentencesMapper against hand-written fixtures.

    Every fixture is a dict with a 'text' entry (the operator input) and a
    'target' entry (the text expected once the operator has been applied).
    """

    def _run_helper(self, samples, op):
        """Apply ``op.process`` to ``samples`` and compare text with target.

        :param samples: list of dicts, each holding 'text' and 'target'.
        :param op: operator instance whose ``process`` method is mapped over
            the dataset in batches of two.
        """
        processed = Dataset.from_list(samples).map(op.process, batch_size=2)

        for row in processed:
            self.assertEqual(row['text'], row['target'])

    def test_text(self):
        """Run the operator with its default constructor arguments."""
        # Chinese prose: the repeated full sentences are expected to vanish
        # while the quoted speech that merely shares a prefix stays.
        chinese_prose = {
            'text':
            '今天天气真不错,阳光明媚,适合出去散步。小明说:“今天天气真不错,我们去海边吧。” 小红回答说:“好主意!” 但是,小李觉得:“今天天气真不错,我们去爬山吧。” 今天天气真不错,阳光明媚,适合出去散步。昨天下了一整天的雨,今天终于放晴了。昨天下了一整天的雨,今天终于放晴了。',
            'target':
            '今天天气真不错,阳光明媚,适合出去散步。小明说:“今天天气真不错,我们去海边吧。” 小红回答说:“好主意!” 但是,小李觉得:“今天天气真不错,我们去爬山吧。”昨天下了一整天的雨,今天终于放晴了。',
        }
        # English prose: each duplicated sentence is expected only once.
        english_prose = {
            'text':
            'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
            'target':
            'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
        }
        # Input spanning two lines: the expected text keeps the line break.
        two_line_text = {
            'text': '''我很开心 。但是你不开心 。我很开心 。\n你好呀!我很开心 。我好的。你好呀!''',
            'target': '''我很开心 。但是你不开心 。\n你好呀!我好的。'''
        }
        # Very short fragments: per the expected text, the one-character
        # pieces ('重', '3') all survive under the default settings.
        short_fragments = {
            'text':
            '默认配置下,长度低于2的句子不会被去重。去重?去重。去重!重。重...... 重! 1234?3215. 1234. 3. 3. 3',
            'target':
            '默认配置下,长度低于2的句子不会被去重。去重?重。重...... 重! 1234?3215. 3. 3. 3'
        }

        samples = [
            chinese_prose, english_prose, two_line_text, short_fragments
        ]
        default_op = RemoveRepeatSentencesMapper()
        self._run_helper(samples, default_op)

    def test_text2(self):
        """Run the operator with explicit, non-default-looking arguments.

        The operator is built with ``lowercase=True``,
        ``ignore_special_character=False`` and
        ``min_repeat_sentence_length=5``.
        """
        # Mixed English/Chinese: only the repeated English sentence is
        # expected to be dropped.
        mixed_language = {
            'text':
            'Life is what happens when you\'re busy making other plans. John Lennon once said. Life is what happens when you\'re busy making other plans. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说,这句话引起了共鸣。',
            'target':
            'Life is what happens when you\'re busy making other plans. John Lennon once said. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说,这句话引起了共鸣。',
        }
        # Same English fixture as in test_text; same expected result.
        english_prose = {
            'text':
            'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
            'target':
            'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
        }
        # Same two-line input as in test_text, but here the expected text
        # still ends with the second '你好呀!'.
        two_line_text = {
            'text': '''我很开心 。但是你不开心 。我很开心 。\n你好呀!我很开心 。我好的。你好呀!''',
            'target': '''我很开心 。但是你不开心 。\n你好呀!我好的。你好呀!'''
        }
        # Short fragments only: the expected text equals the input.
        short_fragments = {
            'text': '去重?去重。去重!重。重...... 重! 1234?3215. 1234. 3. 3. 3',
            'target': '去重?去重。去重!重。重...... 重! 1234?3215. 1234. 3. 3. 3'
        }

        samples = [
            mixed_language, english_prose, two_line_text, short_fragments
        ]
        configured_op = RemoveRepeatSentencesMapper(
            lowercase=True,
            ignore_special_character=False,
            min_repeat_sentence_length=5,
        )
        self._run_helper(samples, configured_op)
if __name__ == '__main__':
    # Allow running this test module directly as a script.
    unittest.main()