-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMergeTables.py
executable file
·156 lines (142 loc) · 6.23 KB
/
MergeTables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python2.7
# @Name
# @Copyright Bruno Costa ITQB 2016
# @Description This program merges two delimited files on a common key column, much like a join between SQL tables.
# @Description If a key is not found, a "0" placeholder is appended instead of a value. Keys are assumed to be unique; if a key is not unique, all of its values are collected.
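#input1 - path(str) "Main file whose rows receive the appended values" Required
#input2 - path(str) "File with the key/value pairs that are looked up" Required
#output - path(str) "File with the rows of input1 plus the appended values" Required
#sep1 - str "Separator character for input1 (default: tab)" Optional
#sep2 - str "Separator character for input2 (default: tab)" Optional
#column_key_input1 - int "0-based index of the key column in input1" Required
#column_key_input2 - int "0-based index of the key column in input2" Required
#column_value_input2 - int "0-based index of the value column in input2 (default: every non-key column)" Optional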
import argparse
import operator, functools
# NOTE: this script targets Python 2.7 (see the shebang); everything below is
# written so that it also runs unchanged on Python 3.

# Spellings of --append_result that switch the option off.
_FALSE_WORDS = ("", "0", "false", "f", "no", "n", "off")


def _parse_flag(text):
    """Interpret a command-line string as a boolean.

    argparse's ``type=bool`` treats every non-empty string (including
    "False") as true. Only the usual negative spellings are treated as false
    here; anything else stays true so existing invocations keep working.
    """
    return str(text).strip().lower() not in _FALSE_WORDS


def _build_parser():
    """Create the command-line parser (option names and dests are unchanged)."""
    parser = argparse.ArgumentParser(
        description='Append the values of a key/value table (input2) to the '
                    'rows of a main table (input1) that share the same key')
    parser.add_argument('--input1', type=str, metavar='input file', dest='input1', required=True,
                        help='Path to the main file to which the values should be appended')
    parser.add_argument('--input2', type=str, metavar='input file', dest='input2', required=True,
                        help='Path to the file with the keys and values')
    parser.add_argument('--output', type=str, metavar='output file', dest='output', required=True,
                        help='Path where the file with the values appended should be stored')
    parser.add_argument('--sep1', type=str, metavar='seperation string', dest='sep1', required=False,
                        help='Set the separation string for input 1. Default is \\t')
    parser.add_argument('--sep2', type=str, metavar='seperation string', dest='sep2', required=False,
                        help='Set the separation string for input 2. Default is \\t')
    parser.add_argument('--col_key_num_input1', type=int, metavar='column key number', dest='cKeyInput1',
                        required=True, help='0-based index of the key column in input 1')
    parser.add_argument('--col_key_num_input2', type=int, metavar='column key number', dest='cKeyInput2',
                        required=True, help='0-based index of the key column in input 2')
    parser.add_argument('--col_value_num_input2', type=int, metavar='column value number', dest='cValInput2',
                        required=False,
                        help='0-based index of the value column in input 2. '
                             'Default is every column except the key')
    # The log is a file path, so it must be parsed as a string (it used to be
    # declared as int, which made it impossible to pass a path).
    parser.add_argument('--log', type=str, metavar='log file', dest='log', required=False,
                        help='Path where the log file should be stored with the information '
                             'about which keys weren\'t found')
    # nargs='?' keeps "--append_result True" working and also allows the bare
    # flag; an explicit "False" now really disables the option.
    parser.add_argument('--append_result', type=_parse_flag, nargs='?', const=True, default=False,
                        metavar='Result type', dest='result', required=False,
                        help='Append all matching values to one output line instead of '
                             'writing one output line per match')
    return parser


def _read_table(path, sep):
    """Read a delimited text file and return ``(header, rows)``.

    Only the line terminator is removed before splitting, so empty leading or
    trailing columns keep their position. Whitespace-only lines are skipped.
    An empty file yields ``([], [])``.
    """
    with open(path, "r") as handle:
        rows = [line.rstrip("\r\n").split(sep) for line in handle if line.strip()]
    if not rows:
        return [], []
    return rows[0], rows[1:]


def _drop_column(row, index):
    """Return a copy of *row* without the column at *index*."""
    return row[:index] + row[index + 1:]


def _build_lookup(header, rows, key_col, val_col):
    """Index the key/value table by its key column.

    Returns ``(value_header, lookup)``. ``lookup`` maps each key to a list
    with one entry per row carrying that key; each entry is the list of value
    columns of that row (a single column when *val_col* is given, otherwise
    every column except the key).

    Raises IndexError when a row is too short to contain the requested columns.
    """
    if val_col is None:
        value_header = _drop_column(header, key_col)
    else:
        value_header = [header[val_col]] if header else []
    lookup = {}
    for row in rows:
        key = row[key_col]
        values = _drop_column(row, key_col) if val_col is None else [row[val_col]]
        # Keys are not required to be unique: every occurrence is kept.
        lookup.setdefault(key, []).append(values)
    return value_header, lookup


def _write_merged(header, rows, value_header, lookup, key_col, append_all, out, report):
    """Write the merged table to *out* (always tab separated).

    With *append_all* every match of a key is appended to a single output
    line; otherwise one output line is written per match. Rows whose key is
    missing (or that are too short to hold the key column) are reported via
    *report* and written with a "0" placeholder per appended column.
    """
    out.write("\t".join(header + value_header) + "\n")
    placeholder = ["0"] * len(value_header)
    for row in rows:
        try:
            matches = lookup[row[key_col]]
        except (IndexError, KeyError) as err:
            report("Error - " + str(err))
            out.write("\t".join(row + placeholder) + "\n")
            continue
        if append_all:
            merged = list(row)
            for values in matches:
                merged.extend(values)
            out.write("\t".join(merged) + "\n")
        else:
            for values in matches:
                out.write("\t".join(row + values) + "\n")


def main(argv=None):
    """Run the merge.

    *argv* is the list of command-line arguments; ``None`` means
    ``sys.argv[1:]`` (argparse's default).
    """
    args = _build_parser().parse_args(argv)

    if args.sep1 is None:
        print("No input give will you tab as a seperator")
        sep1 = "\t"
    else:
        sep1 = args.sep1
    sep2 = "\t" if args.sep2 is None else args.sep2

    # Both inputs are read completely before the output is opened, so a
    # missing input does not leave a truncated output file behind.
    lookup_header, lookup_rows = _read_table(args.input2, sep2)
    value_header, lookup = _build_lookup(lookup_header, lookup_rows, args.cKeyInput2, args.cValInput2)
    header, rows = _read_table(args.input1, sep1)

    logger = open(args.log, "w") if args.log is not None else None

    def report(message):
        # Missing keys go to the log file when one was requested, else to stdout.
        if logger is not None:
            logger.write(message + "\n")
        else:
            print(message)

    try:
        with open(args.output, "w") as writer:
            _write_merged(header, rows, value_header, lookup, args.cKeyInput1,
                          bool(args.result), writer, report)
    finally:
        if logger is not None:
            logger.close()


if __name__ == "__main__":
    main()