-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstreamlit_app.py
325 lines (239 loc) · 15 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import base64
import datetime
import glob
import os
import re

import numpy as np
import pandas as pd
import streamlit as st

from stlibs import SessionState
def download_link(object_to_download, download_filename, download_link_text):
    """Build an HTML anchor that downloads ``object_to_download`` when clicked.

    The payload is embedded in the link itself as a base64 ``data:`` URI, so
    nothing is written to disk.

    Args:
        object_to_download (str, pd.DataFrame): The object to be downloaded.
            A DataFrame is serialised to CSV text without its index.
        download_filename (str): Filename and extension suggested for the
            download, e.g. mydata.csv, some_txt_output.txt.
        download_link_text (str): Text to display for the download link.

    Returns:
        str: The ``<a href="data:file/txt;base64,...">`` element as text.

    Examples:
        download_link(YOUR_DF, 'YOUR_DF.csv', 'Click here to download data!')
        download_link(YOUR_STRING, 'YOUR_STRING.txt', 'Click here to download your text!')
    """
    if isinstance(object_to_download, pd.DataFrame):
        object_to_download = object_to_download.to_csv(index=False)

    # b64encode consumes and produces bytes, while the URI is built from text,
    # hence the encode/decode round trip.
    b64 = base64.b64encode(object_to_download.encode()).decode()

    return '<a href="data:file/txt;base64,{}" download="{}">{}</a>'.format(
        b64, download_filename, download_link_text
    )
# In[4]:
def display_button():
    """Render a select box that reveals the app overview on request.

    The select box offers 'Close' and 'Show Explanation'. Only when
    'Show Explanation' is chosen are the purpose of the app and the list of
    QC phases written to the page.
    """
    choice = st.selectbox(
        'Click here to see explanation',
        ('Close', 'Show Explanation'),
    )
    if choice != 'Show Explanation':
        return

    # Purpose of the app.
    st.markdown(
        "\n"
        "#### This app is meant to be used internally for running metabolomics quality control (QC)\n"
        "#### The QC pipeline is running in five phases:\n"
    )
    # The individual phases.
    st.write(
        "\n"
        "##### 1) Growth control\n"
        "##### 2) M/Z drift calculation\n"
        "##### 3) RT drift (not sure yet)\n"
        "##### 4) Peak intensity\n"
    )
# Overview selector, then a divider and the heading of the drift section.
display_button()
st.write(35 * ':heavy_minus_sign:')
st.markdown("# parameters drift control section")
def display_button2():
    """Render a select box that reveals the drift-section explanation.

    The explanation text is written only when 'Show Explanation' is chosen;
    otherwise nothing beyond the select box is rendered.
    """
    choice = st.selectbox(
        'Click here for explanation',
        ('Close', 'Show Explanation'),
    )
    if choice != 'Show Explanation':
        return

    explanation = (
        "in this section you will be comparing the new generated data from a historical one to compare the retention time differences.\n"
        "The analysis will be restricted to the standard samples of the same concentrations. "
        "The column 'peak_rt_of_max' of mint output will be used for such comparison. "
        "Standard samples will be considered as those containing the Std flag "
    )
    st.markdown(explanation)
display_button2()


def _read_uploaded_csv(uploaded_file):
    """Parse an uploaded CSV into a DataFrame.

    Args:
        uploaded_file: The value returned by ``st.file_uploader``; ``None``
            when nothing has been uploaded.

    Returns:
        pd.DataFrame or None: The parsed table, or ``None`` when there is no
        upload or it cannot be parsed (the reason is written to the page).
    """
    if uploaded_file is None:
        return None
    try:
        # pd.read_csv reads from the buffer's current position, so rewind in
        # case the buffer has been consumed before.
        if hasattr(uploaded_file, 'seek'):
            uploaded_file.seek(0)
        return pd.read_csv(uploaded_file)
    except (ValueError, OSError) as err:
        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
        # ValueError subclasses.
        st.write('could not read the uploaded file: ' + str(err))
        return None


st.write('### Upload the historical results file generated on Mint ....')
historical_file = st.file_uploader('historical results')
# Parse the upload exactly once; a second read of the same buffer would start
# at EOF and fail.
historical_df = _read_uploaded_csv(historical_file)

# NOTE(review): assumes SessionState.get applies its keyword defaults only
# when the session object is first created - confirm against stlibs.
s_st = SessionState.get(historical_results=historical_df)
if historical_df is not None:
    # Always publish the freshly parsed table so a replaced upload is used.
    s_st.historical_results = historical_df

st.write('#### Your historical file:')
if getattr(s_st, 'historical_results', None) is not None:
    st.write(s_st.historical_results.head())
else:
    st.write('some point in your settings failed')

st.write('### Upload the newly generated results file ....')
results_file = st.file_uploader('current results')
results_df = _read_uploaded_csv(results_file)
if results_df is not None:
    s_st.results = results_df
    # Independent copy of the full table, stored next to s_st.results.
    s_st.results0 = results_df.copy()

st.write('#### Your results file:')
if getattr(s_st, 'results', None) is not None:
    st.write(s_st.results.head())
else:
    st.write('some point in your settings failed')

# Row counts of both tables; both fall back to 0 when either table is missing.
try:
    lres = len(s_st.results)
    lhres = len(s_st.historical_results)
except (AttributeError, TypeError):
    # AttributeError: attribute never set; TypeError: attribute is None.
    lres = 0
    lhres = 0
def _mean_for(frame, value_column, compound, sample):
    """Return the mean of ``value_column`` for one compound in one standard type.

    Rows are selected where ``peak_label`` equals ``compound`` and ``STDType``
    equals ``sample``.
    """
    selected = (frame['peak_label'] == compound) & (frame['STDType'] == sample)
    return np.mean(frame.loc[selected, value_column])


def _relative_drift(reference, current):
    """Return ``|reference - current|`` as a fraction of the larger value.

    Returns NaN when the larger value is zero. A NaN input yields NaN as well,
    and NaN never compares greater than a threshold.
    """
    largest = max(reference, current)
    if largest == 0:
        return float('nan')
    return abs(reference - current) / largest


def _report_problem(message, err):
    """Write ``message`` followed by the exception that caused it."""
    st.write(message)
    st.exception(err)


if lres > 1 and lhres > 1:

    ### selecting the standard samples only ###
    try:
        st.write('#### indicate the Standard samples flag')
        s_st.std_flag = st.text_input("standard sample flag", 'Std')
        # .copy() makes the filtered table independent, so the STDType column
        # added further down is not written into a slice of another frame.
        s_st.historical_results = s_st.historical_results[
            s_st.historical_results.ms_file.str.contains(s_st.std_flag)
        ].copy()
        st.write('your historical data has ' + str(len(np.unique(s_st.historical_results.ms_file))) + ' ' + s_st.std_flag + ' samples')
    except Exception as err:
        _report_problem('there is a problem with the standard sample selection in the historical results', err)

    try:
        s_st.results = s_st.results[
            s_st.results.ms_file.str.contains(s_st.std_flag)
        ].copy()
        st.write('your results data has ' + str(len(np.unique(s_st.results.ms_file))) + ' ' + s_st.std_flag + ' samples')
        if len(np.unique(s_st.results.ms_file)) == len(np.unique(s_st.historical_results.ms_file)):
            st.write('which matches the size of the historical results')
    except Exception as err:
        _report_problem('there is a problem with the standard sample selection in the current results', err)

    ### numbering the standard samples by the file name ###
    try:
        # NOTE(review): the number is taken after the literal 'Std' in
        # ms_file_label even when a different flag was entered above - confirm
        # whether this should follow s_st.std_flag.
        # STDType must be a real column: assigning it as an attribute does not
        # add it to the table.
        s_st.historical_results['STDType'] = s_st.historical_results.ms_file_label.apply(lambda x: int(x.split('Std')[-1]))
        s_st.results['STDType'] = s_st.results.ms_file_label.apply(lambda x: int(x.split('Std')[-1]))
        st.write('there are ' + str(len(np.unique(s_st.historical_results.STDType))) + ' types of ' + s_st.std_flag + ' sample types in the historical results file')
        st.write('there are ' + str(len(np.unique(s_st.results.STDType))) + ' types of ' + s_st.std_flag + ' sample types in the results file')
        s_st.intersection_samples = np.intersect1d(np.unique(s_st.historical_results.STDType), np.unique(s_st.results.STDType))
        st.write('with ' + str(len(s_st.intersection_samples)) + ' samples in the intersection')
    except Exception as err:
        _report_problem('there is a problem with the standard sample numbering', err)

    ### getting compounds that exist in historical and current data ###
    try:
        st.write('there are ' + str(len(np.unique(s_st.historical_results.peak_label))) + ' compounds in the historical data')
        st.write('there are ' + str(len(np.unique(s_st.results.peak_label))) + ' compounds in the results data')
        s_st.intersection_compounds = np.intersect1d(np.unique(s_st.historical_results.peak_label), np.unique(s_st.results.peak_label))
        st.write('with ' + str(len(s_st.intersection_compounds)) + ' compounds in the intersection')
    except Exception as err:
        _report_problem('there is a problem with the compounds intersection between the historical data and the current results', err)

    ### assessing sample by sample & compound by compound ###

    #### TESTING FOR MZ DRIFT ########
    if 'peak_mass_diff_50pc' not in s_st.historical_results.columns:
        st.write('#### mz drift cannot be carried out, the columns for mz drift are missing from historical data')
    elif 'peak_mass_diff_50pc' not in s_st.results.columns:
        st.write('#### mz drift cannot be carried out, the columns for mz are missing from current results data')
    else:
        st.write('running the mz drift analysis')
        try:
            st.write('#### indicate a threshold for mz drift')
            s_st.mz_dt = float(st.text_input("maximum acceptable mz drift", '1'))
            for compound in s_st.intersection_compounds:
                for sample in s_st.intersection_samples:
                    n1 = _mean_for(s_st.historical_results, 'peak_mass_diff_50pc', compound, sample)
                    n2 = _mean_for(s_st.results, 'peak_mass_diff_50pc', compound, sample)
                    # Every offending compound/sample pair is reported.
                    if abs(n1 - n2) > s_st.mz_dt:
                        st.write('problematic compound: ' + compound + ' in sample: ' + s_st.std_flag + str(sample) + ' with ' +
                                 str(np.round(abs(n1 - n2), 2)) + ' ppm drift')
        except Exception as err:
            _report_problem('there was a problem while running the mz drift analysis', err)

    #### TESTING FOR RT DRIFT ########
    if 'peak_rt_of_max' not in s_st.historical_results.columns:
        st.write('#### rt drift cannot be carried out, the columns for rt drift are missing from historical data')
    elif 'peak_rt_of_max' not in s_st.results.columns:
        st.write('#### rt drift cannot be carried out, the columns for rt drift are missing from current results data')
    else:
        try:
            # Threshold parsing lives inside the try so that a non-numeric
            # entry is reported instead of aborting the script.
            st.write('#### indicate a threshold for RT drift in percent')
            s_st.rt_dt = float(st.text_input("maximum acceptable retention time drift", '1'))
            st.write('#### indicate a threshold for the number of problematic samples')
            s_st.ps_th1 = float(st.text_input("maximum aceptable number of samples for rt drift", '5'))
            st.write('running the rt drift analysis ...')
            for compound in s_st.intersection_compounds:
                k = 0  # standard types whose drift exceeds the threshold
                for sample in s_st.intersection_samples:
                    n1 = _mean_for(s_st.historical_results, 'peak_rt_of_max', compound, sample)
                    n2 = _mean_for(s_st.results, 'peak_rt_of_max', compound, sample)
                    # The threshold is entered in percent, so compare the
                    # relative drift, as the peak height check does.
                    if _relative_drift(n1, n2) > s_st.rt_dt / 100:
                        k += 1
                if k > s_st.ps_th1:
                    st.write(compound + ' showed problems in ' + str(k) + ' samples')
                else:
                    st.write(compound + ' OK')
        except Exception as err:
            _report_problem('there was a problem while running the rt drift analysis', err)

    #### TESTING FOR PEAK_MAX DRIFT ########
    if 'peak_max' not in s_st.historical_results.columns:
        st.write('#### peak intensity drift cannot be carried out, the columns for peak intensity drift are missing from historical data')
    elif 'peak_max' not in s_st.results.columns:
        st.write('#### peak intensity drift cannot be carried out, the columns for peak intensity drift are missing from current results data')
    else:
        try:
            st.write('#### indicate a threshold for peak height drift in percent')
            s_st.ph_dt = float(st.text_input("peak_max difference in percent", '5'))
            st.write('#### indicate a threshold for the number of problematic samples')
            s_st.ps_th2 = float(st.text_input("maximum aceptable number of samples for peak height drift", '5'))
            st.write('running the peak height drift analysis')
            for compound in s_st.intersection_compounds:
                k = 0  # standard types whose drift exceeds the threshold
                for sample in s_st.intersection_samples:
                    n1 = _mean_for(s_st.historical_results, 'peak_max', compound, sample)
                    n2 = _mean_for(s_st.results, 'peak_max', compound, sample)
                    if _relative_drift(n1, n2) > s_st.ph_dt / 100:
                        k += 1
                if k > s_st.ps_th2:
                    st.write(compound + ' showed problems in ' + str(k) + ' samples')
                else:
                    st.write(compound + ' OK')
        except Exception as err:
            _report_problem('there was a problem while running the peak height drift analysis', err)

    st.write(':heavy_minus_sign:' * 35)
st.write(':heavy_minus_sign:' * 35)

st.markdown('''# Growth control section''')

# Compares the chosen compound's mean intensity in the growth control samples
# with its mean in the media control samples and reports whether the ratio
# satisfies the entered threshold.
try:
    st.write('#### indicate the intensity measurement')
    s_st.value_column = st.selectbox('select the intensity measurement \n', ['peak_max', 'peak_area'])
    st.write(s_st.value_column)

    st.write('#### indicate the growth control sample')
    s_st.gr_flag = st.text_input("growth control sample flag", 'ATCC')

    st.write('#### indicate the media control sample')
    s_st.me_flag = st.text_input("media control sample flag", 'MHPool')

    st.write('#### indicate the compound used for growing measurement')
    s_st.cp = st.selectbox('select the growing measurement compound \n', list(np.unique(s_st.results.peak_label)))

    st.write('#### indicate if the compound used for growing measurement is a consuming or a secreting one')
    s_st.flux = st.selectbox('select the compound interchange type \n', ['influx', 'eflux'])

    st.write('#### set a threshold for the ratio between the growth control samples and the media control samples')
    s_st.threshold_0 = float(st.text_input("set a threshold for GControl/MSamples", '100'))

    # results0 is used here rather than results, so the growth and media
    # control rows are looked up in that table.
    is_compound = s_st.results0.peak_label == s_st.cp
    intensities = s_st.results0[s_st.value_column]
    growth_mean = np.mean(intensities[is_compound & s_st.results0.ms_file.str.contains(s_st.gr_flag)])
    media_mean = np.mean(intensities[is_compound & s_st.results0.ms_file.str.contains(s_st.me_flag)])
    div = growth_mean / media_mean

    st.write('the fraction of the compound between the growth media and the control control samples is:')
    st.write(1/div)

    limit = 1/s_st.threshold_0
    # NOTE(review): the secreting ('eflux') case is compared against the same
    # 1/threshold bound as the consuming case, only with the inequality
    # flipped - confirm this is the intended criterion.
    grew_poorly = (
        (s_st.flux == 'influx' and div > limit)
        or (s_st.flux == 'eflux' and div < limit)
    )
    if grew_poorly:
        # NOTE(review): the original emoji was lost to a mis-encoding; this
        # one is a stand-in - confirm the intended glyph.
        st.write('# Hey Sr, your controls didnt grow that well 😭😭😭😭😭')
    else:
        st.write('# Hey Sr, it looks like your controls grew well 🥳🥳🥳🥳🥳')
except (NameError, AttributeError):
    # Session state or the uploaded tables are not available.
    st.write('some point in your settings failed')
except Exception as err:
    # Anything else (bad number, zero threshold, missing column, ...) is a
    # real problem, so show it.
    st.write('some point in your settings failed')
    st.exception(err)

st.write(':heavy_minus_sign:' * 35)