forked from gorden2/nazisearch
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathclean_vocab_data.py
60 lines (47 loc) · 1.35 KB
/
clean_vocab_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
import json
from pprint import pprint
# Input file used when no path is given on the command line.
DEFAULT_PATH = 'artist.json'
# Index window of the dataset that gets exported (half-open: start..stop-1).
DEFAULT_START = 5000
DEFAULT_STOP = 20000


def format_record(index, label, name):
    """Return the output line for one record.

    Commas inside *label* and *name* are replaced by spaces, then the values
    are dropped into the fixed ``{ "id" :N,"label" : ...,"name" : ... } ,``
    template.

    NOTE(review): the values are padded with one space on each side and are
    not JSON-escaped, so an embedded double quote or backslash produces
    invalid JSON.  The layout is reproduced byte-for-byte because whatever
    consumes this output may depend on it -- confirm before switching to
    ``json.dumps``.
    """
    label = label.replace(",", " ")
    name = name.replace(",", " ")
    return '{ "id" :%s,"label" :  " %s "  ,"name" :  " %s " } ,' % (
        index, label, name)


def iter_records(data, start=DEFAULT_START, stop=DEFAULT_STOP):
    """Yield the formatted line of every record in ``data[start:stop]``.

    Each record is expected to look like
    ``{'label': {'value': ...}, 'name': {'value': ...}}``; its position in
    *data* becomes the ``id``.  The window is clamped to the size of *data*,
    so a dataset shorter than *stop* simply yields fewer lines instead of
    raising ``IndexError``.
    """
    for index in range(max(start, 0), min(stop, len(data))):
        record = data[index]
        yield format_record(
            index, record['label']['value'], record['name']['value'])


def emit(line):
    """Print *line*, encoding it as UTF-8 first if it is not a native string.

    On Python 2 the JSON values are ``unicode``; printing them directly fails
    with ``UnicodeEncodeError`` whenever stdout is piped, so they are encoded
    explicitly.  On Python 3 the text is printed as is.
    """
    if not isinstance(line, (str, bytes)):
        line = line.encode('utf-8')
    print(line)


def main(path=DEFAULT_PATH, start=DEFAULT_START, stop=DEFAULT_STOP):
    """Load the JSON array stored at *path* and print its export lines.

    The first record is echoed unmodified so the dataset can be checked by
    eye, followed by one formatted line per record in the
    ``start``..``stop`` window.
    """
    # Read bytes and decode explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, 'rb') as data_file:
        data = json.loads(data_file.read().decode('utf-8'))

    if data:
        print(data[0])

    for line in iter_records(data, start, stop):
        emit(line)


if __name__ == '__main__':
    import sys

    # An optional first argument selects another dataset with the same record
    # shape (the script is used for the location vocabulary as well as the
    # artist one); without it the artist file is processed.
    main(*sys.argv[1:2])