-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathtransliterator.py
98 lines (85 loc) · 2.81 KB
/
transliterator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Copyright (c) Microsoft Corporation. Licensed under the MIT license.
import requests
import sys
import os
import glob
import operator
import argparse
try:
from indictrans import Transliterator
except ImportError as e:
print("Failed to import " + str(e))
# subscription_key = ''
def get_token(subscription_key):
    """Fetch an access token from the Azure token-issuing endpoint.

    Sends a POST request to the Southeast Asia ``issueToken`` endpoint with
    the key in the ``Ocp-Apim-Subscription-Key`` header.

    Args:
        subscription_key: Azure subscription key sent in the request header.

    Returns:
        The response body as a ``str``. The HTTP status is not inspected,
        so an error response body is returned unchanged.
    """
    token_endpoint = 'https://southeastasia.api.cognitive.microsoft.com/sts/v1.0/issueToken'
    reply = requests.post(
        token_endpoint,
        headers={'Ocp-Apim-Subscription-Key': subscription_key},
    )
    return str(reply.text)
def get_transliteration(vocab, headers, words_per_query=50, queries_per_request=10):
    """Transliterate Roman-script words into Devanagari.

    Two back ends are supported:

    * ``headers is None``: each word is transformed locally with the
      ``indictrans`` ``Transliterator`` (source ``eng``, target ``hin``).
      The result maps every word to its transliteration.
    * otherwise: the words are sent to the Microsoft Translator
      ``/transliterate`` endpoint. Words are joined with single spaces into
      query strings of ``words_per_query`` words, and ``queries_per_request``
      query strings are posted per HTTP request. The result maps each query
      string to the transliterated string returned for it.

    The defaults reproduce the original batching of 500 words per request
    (10 texts of 50 words each), but any vocabulary length is handled,
    including an empty one, for which no request is sent.

    Args:
        vocab: Sequence of words to transliterate.
        headers: HTTP headers for the Translator API, or ``None`` to use
            ``indictrans`` instead.
        words_per_query: Number of words joined into one ``text`` element.
        queries_per_request: Number of ``text`` elements per POST request.

    Returns:
        dict mapping the source text (a word, or a space-joined group of
        words) to its transliteration.

    Raises:
        ValueError: If a batch size is smaller than 1 (API path only).
        requests.HTTPError: If the Translator API answers with an error
            status.
    """
    if headers is None:
        trn = Transliterator(source='eng', target='hin', build_lookup=True)
        return {item: trn.transform(item) for item in vocab}

    if words_per_query < 1 or queries_per_request < 1:
        raise ValueError("words_per_query and queries_per_request must be >= 1")

    base_url = 'https://api.cognitive.microsofttranslator.com'
    path = '/transliterate?api-version=3.0&language=hi&fromScript=Latn&toScript=Deva'
    constructed_url = base_url + path

    # One query string per group of words; the final group may be shorter.
    queries = [
        ' '.join(vocab[start:start + words_per_query]).strip()
        for start in range(0, len(vocab), words_per_query)
    ]

    trans = {}
    # NOTE(review): the service limits the number and size of texts per
    # request; the defaults keep the original request size - confirm against
    # the Translator documentation before raising them.
    for start in range(0, len(queries), queries_per_request):
        body = [
            {'text': query}
            for query in queries[start:start + queries_per_request]
        ]
        response = requests.post(constructed_url, headers=headers, json=body)
        # Fail loudly on an error status instead of tripping over the error
        # payload while indexing the result below.
        response.raise_for_status()
        result = response.json()
        # Results are paired with the request elements by position.
        for sent, received in zip(body, result):
            trans[sent['text']] = received['text']
    return trans
def main():
    """Command-line entry point: transliterate a vocabulary file.

    Reads one word per line from ``--input_file``, transliterates the words
    with ``get_transliteration`` and writes tab-separated
    ``<word>\t<transliteration>`` lines to ``transliterations.txt`` in the
    current working directory.

    When ``--subscription_key`` is given, a token is fetched with
    ``get_token`` and the request headers for the API are built; otherwise
    ``headers`` stays ``None`` and ``get_transliteration`` takes its local
    ``indictrans`` path.

    Both files are opened as UTF-8 so that writing Devanagari does not depend
    on the platform's default encoding. Blank input lines are skipped.

    Raises:
        IndexError: If a transliteration has fewer space-separated tokens
            than the source text it belongs to.
    """
    parser = argparse.ArgumentParser()

    # --subscription_key is optional; --input_file is required.
    parser.add_argument("--subscription_key", default=None, type=str, required=False, help="Azure Subscription key for downloading transliterations")
    parser.add_argument("--input_file", default=None, type=str, required=True,
                        help="The roman hindi words vocabulary ")

    args = parser.parse_args()
    input_file = args.input_file
    subscription_key = args.subscription_key

    headers = None
    if subscription_key is not None:
        req_token = get_token(subscription_key)
        # NOTE(review): the token is sent as-is in Authorization; the Azure
        # docs describe a "Bearer <token>" form - confirm whether the
        # subscription-key header alone is what authenticates here.
        headers = {
            'Accept': 'application/json;text/xml',
            'Content-Type': 'application/json',
            'Ocp-Apim-Subscription-Key': subscription_key,
            'Authorization': req_token
        }

    # Read-only access is enough for the vocabulary file. Blank lines are
    # dropped because an empty word would break the one-to-one alignment
    # between source words and transliterated tokens below.
    with open(input_file, 'r', encoding='utf-8') as infile:
        vocab = [line.strip('\n') for line in infile if line.strip()]

    trans = get_transliteration(vocab, headers)

    with open('transliterations.txt', 'w', encoding='utf-8') as outfile:
        for source_text, deva_text in trans.items():
            words = source_text.split(' ')
            deva = deva_text.split(' ')
            # Source words and transliterated tokens are paired by position.
            for idx, word in enumerate(words):
                outfile.write(word + "\t" + deva[idx] + "\n")
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()