Skip to content

Commit

Permalink
Merge pull request #5 from antigenomics/dev_alex
Browse files Browse the repository at this point in the history
Added TCRGP
  • Loading branch information
alexmorphine authored Oct 3, 2020
2 parents 25447dd + bf90abf commit af67ca9
Show file tree
Hide file tree
Showing 9 changed files with 1,710 additions and 12 deletions.
435 changes: 435 additions & 0 deletions TCRGP/tcrgp.ipynb

Large diffs are not rendered by default.

513 changes: 513 additions & 0 deletions TCRGP/tcrgp_db_creation.ipynb

Large diffs are not rendered by default.

85 changes: 85 additions & 0 deletions pMTnet/duplicate_Gee2018_VDJdb_TRA.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
CDR3,Antigen,HLA,label,Rank
CALSEAGMDSNYQLIW,SMGVTYEM,A*02,1.0,0.451
CALSEAGMDSNYQLIW,YMGVSYEM,A*02,1.0,0.645
CALSEAGMDSNYQLIW,YMGVVYEM,A*02,1.0,0.29700000000000004
CALSEAGMDSNYQLIW,KMGVTYEM,A*02,1.0,0.4310000000000001
CALSEAGMDSNYQLIW,FMGVTYEM,A*02,1.0,0.5880000000000001
CALSEARGGATNKLIF,LMDMHNGQL,A*02,1.0,0.8
CALSEARGGATNKLIF,RLDAMNGQL,A*02,1.0,0.56
CALSEARGGATNKLIF,RMDYNNMQM,A*02,1.0,0.877
CALSEARGGATNKLIF,GMDYHNGHL,A*02,1.0,0.458
CALSEARGGATNKLIF,KMDYFSGQL,A*02,1.0,0.764
CALSEARGGATNKLIF,SMDWFQGQM,A*02,1.0,0.7909999999999999
CALSEARGGATNKLIF,LMDYWQGQL,A*02,1.0,0.495
CALSEARGGATNKLIF,NMMWFQGQL,A*02,1.0,0.812
CALSEARGGATNKLIF,VLDLFQGQL,A*02,1.0,0.7050000000000001
CALSEARGGATNKLIF,MMDFFNAQM,A*02,1.0,0.865
CALSEARGGATNKLIF,TMDFYQGQL,A*02,1.0,0.7390000000000001
CALSSRGSTLGRLYF,RMEQVDWTV,A*02,1.0,0.15300000000000002
CALSSRGSTLGRLYF,KLEFMDWRL,A*02,1.0,0.206
CALSSRGSTLGRLYF,WLDNFELCL,A*02,1.0,0.138
CALSSRGSTLGRLYF,TLEYMDWLV,A*02,1.0,0.09499999999999996
CALSSRGSTLGRLYF,EMMLFDWKV,A*02,1.0,0.31000000000000005
CALSSRGSTLGRLYF,KLEQLDWTV,A*02,1.0,0.09599999999999996
CALSSRGSTLGRLYF,TMETIDWKV,A*02,1.0,0.338
CALSSRGSTLGRLYF,TLEELDWCL,A*02,1.0,0.474
CALSSRGSTLGRLYF,LLEDLDWDV,A*02,1.0,0.34600000000000003
CALSSRGSTLGRLYF,VLEEVDWLI,A*02,1.0,0.23
CALSSRGSTLGRLYF,NMEYMTWDV,A*02,1.0,0.005700000000000038
CALSSRGSTLGRLYF,NVEYYDIKL,A*02,1.0,0.218
CALSEAGMDSNYQLIW,WLDNFELCL,A*02,0.0,0.16500000000000006
CALSEAGMDSNYQLIW,KLEQLDWTV,A*02,0.0,0.4340000000000001
CALSEAGMDSNYQLIW,KMDYFSGQL,A*02,0.0,0.7190000000000001
CALSEAGMDSNYQLIW,RMEQVDWTV,A*02,0.0,0.342
CALSEAGMDSNYQLIW,MMDFFNAQM,A*02,0.0,0.012700000000000045
CALSEAGMDSNYQLIW,GMDYHNGHL,A*02,0.0,0.596
CALSEAGMDSNYQLIW,RLDAMNGQL,A*02,0.0,0.486
CALSEAGMDSNYQLIW,RMDYNNMQM,A*02,0.0,0.627
CALSEAGMDSNYQLIW,TLEYMDWLV,A*02,0.0,0.29500000000000004
CALSEAGMDSNYQLIW,LMDYWQGQL,A*02,0.0,0.81
CALSEAGMDSNYQLIW,VLDLFQGQL,A*02,0.0,0.5800000000000001
CALSEAGMDSNYQLIW,NMEYMTWDV,A*02,0.0,0.28800000000000003
CALSEAGMDSNYQLIW,EMMLFDWKV,A*02,0.0,0.4270000000000001
CALSEAGMDSNYQLIW,NMMWFQGQL,A*02,0.0,0.6990000000000001
CALSEAGMDSNYQLIW,LMDMHNGQL,A*02,0.0,0.885
CALSEAGMDSNYQLIW,TMDFYQGQL,A*02,0.0,0.917
CALSEAGMDSNYQLIW,TMETIDWKV,A*02,0.0,0.15100000000000002
CALSEAGMDSNYQLIW,NVEYYDIKL,A*02,0.0,0.18600000000000005
CALSEAGMDSNYQLIW,SMDWFQGQM,A*02,0.0,0.7240000000000001
CALSEAGMDSNYQLIW,LLEDLDWDV,A*02,0.0,0.28600000000000003
CALSEAGMDSNYQLIW,KLEFMDWRL,A*02,0.0,0.787
CALSEAGMDSNYQLIW,VLEEVDWLI,A*02,0.0,0.271
CALSEAGMDSNYQLIW,TLEELDWCL,A*02,0.0,0.442
CALSEARGGATNKLIF,WLDNFELCL,A*02,0.0,0.5880000000000001
CALSEARGGATNKLIF,KLEQLDWTV,A*02,0.0,0.475
CALSEARGGATNKLIF,RMEQVDWTV,A*02,0.0,0.264
CALSEARGGATNKLIF,SMGVTYEM,A*02,0.0,0.882
CALSEARGGATNKLIF,TLEYMDWLV,A*02,0.0,0.23800000000000002
CALSEARGGATNKLIF,YMGVSYEM,A*02,0.0,0.8909999999999999
CALSEARGGATNKLIF,NMEYMTWDV,A*02,0.0,0.234
CALSEARGGATNKLIF,EMMLFDWKV,A*02,0.0,0.211
CALSEARGGATNKLIF,FMGVTYEM,A*02,0.0,0.909
CALSEARGGATNKLIF,TMETIDWKV,A*02,0.0,0.047000000000000035
CALSEARGGATNKLIF,NVEYYDIKL,A*02,0.0,0.604
CALSEARGGATNKLIF,YMGVVYEM,A*02,0.0,0.511
CALSEARGGATNKLIF,LLEDLDWDV,A*02,0.0,0.5409999999999999
CALSEARGGATNKLIF,KLEFMDWRL,A*02,0.0,0.726
CALSEARGGATNKLIF,KMGVTYEM,A*02,0.0,0.892
CALSEARGGATNKLIF,VLEEVDWLI,A*02,0.0,0.618
CALSEARGGATNKLIF,TLEELDWCL,A*02,0.0,0.6509999999999999
CALSSRGSTLGRLYF,KMDYFSGQL,A*02,0.0,0.18400000000000005
CALSSRGSTLGRLYF,MMDFFNAQM,A*02,0.0,0.342
CALSSRGSTLGRLYF,SMGVTYEM,A*02,0.0,0.468
CALSSRGSTLGRLYF,GMDYHNGHL,A*02,0.0,0.515
CALSSRGSTLGRLYF,RLDAMNGQL,A*02,0.0,0.376
CALSSRGSTLGRLYF,RMDYNNMQM,A*02,0.0,0.04200000000000004
CALSSRGSTLGRLYF,LMDYWQGQL,A*02,0.0,0.249
CALSSRGSTLGRLYF,YMGVSYEM,A*02,0.0,0.16600000000000004
CALSSRGSTLGRLYF,VLDLFQGQL,A*02,0.0,0.782
CALSSRGSTLGRLYF,NMMWFQGQL,A*02,0.0,0.23800000000000002
CALSSRGSTLGRLYF,LMDMHNGQL,A*02,0.0,0.333
CALSSRGSTLGRLYF,TMDFYQGQL,A*02,0.0,0.452
CALSSRGSTLGRLYF,FMGVTYEM,A*02,0.0,0.239
CALSSRGSTLGRLYF,YMGVVYEM,A*02,0.0,0.026000000000000027
CALSSRGSTLGRLYF,SMDWFQGQM,A*02,0.0,0.05500000000000005
CALSSRGSTLGRLYF,KMGVTYEM,A*02,0.0,0.514
146 changes: 146 additions & 0 deletions pMTnet/duplicate_added_trb_nlv_TRB.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
CDR3,Antigen,HLA,label,Rank
CASSLVNGLGYTF,SMGVTYEM,A*02,1.0,0.89
CASSLVNGLGYTF,YMGVSYEM,A*02,1.0,0.884
CASSLVNGLGYTF,YMGVVYEM,A*02,1.0,0.26899999999999996
CASSLVNGLGYTF,KMGVTYEM,A*02,1.0,0.902
CASSLVNGLGYTF,FMGVTYEM,A*02,1.0,0.905
CASSRDTVNTEAFF,LMDMHNGQL,A*02,1.0,0.409
CASSRDTVNTEAFF,RLDAMNGQL,A*02,1.0,0.565
CASSRDTVNTEAFF,RMDYNNMQM,A*02,1.0,0.133
CASSRDTVNTEAFF,GMDYHNGHL,A*02,1.0,0.522
CASSRDTVNTEAFF,KMDYFSGQL,A*02,1.0,0.251
CASSRDTVNTEAFF,SMDWFQGQM,A*02,1.0,0.07399999999999995
CASSRDTVNTEAFF,LMDYWQGQL,A*02,1.0,0.371
CASSRDTVNTEAFF,NMMWFQGQL,A*02,1.0,0.565
CASSRDTVNTEAFF,VLDLFQGQL,A*02,1.0,0.569
CASSRDTVNTEAFF,MMDFFNAQM,A*02,1.0,0.392
CASSRDFVSNEQYF,TMDFYQGQL,A*02,1.0,0.07999999999999996
CASSRDFVSNEQYF,KMDYFSGQL,A*02,1.0,0.337
CASSRDFVSNEQYF,SMDWFQGQM,A*02,1.0,0.753
CASSRDFVSNEQYF,LMDYWQGQL,A*02,1.0,0.13699999999999998
CASSRDFVSNEQYF,NMMWFQGQL,A*02,1.0,0.07099999999999995
CASSRDFVSNEQYF,MMDFFNAQM,A*02,1.0,0.3120000000000001
CASSPSGLAGSNLGNEQFF,RMEQVDWTV,A*02,1.0,0.95
CASSPSGLAGSNLGNEQFF,KLEFMDWRL,A*02,1.0,0.645
CASSPSGLAGSNLGNEQFF,WLDNFELCL,A*02,1.0,0.9109999999999999
CASSPSGLAGSNLGNEQFF,TLEYMDWLV,A*02,1.0,0.948
CASSPSGLAGSNLGNEQFF,EMMLFDWKV,A*02,1.0,0.6779999999999999
CASSPSGLAGSNLGNEQFF,KLEQLDWTV,A*02,1.0,0.99
CASSPSGLAGSNLGNEQFF,TMETIDWKV,A*02,1.0,0.9620000000000001
CASSPSGLAGSNLGNEQFF,TLEELDWCL,A*02,1.0,0.211
CASSPSGLAGSNLGNEQFF,LLEDLDWDV,A*02,1.0,0.835
CASSPSGLAGSNLGNEQFF,VLEEVDWLI,A*02,1.0,0.836
CASSPSGLAGSNLGNEQFF,NMEYMTWDV,A*02,1.0,0.9740000000000001
CASSPSGLAGSNLGNEQFF,NVEYYDIKL,A*02,1.0,0.012299999999999978
CASSLAPGATNEKLFF,NLVPMVATV,A*02,1.0,0.0020999999999999908
CASSLAPGATNEKLFF,WLDNFELCL,A*02,0.0,0.202
CASSLAPGATNEKLFF,KLEQLDWTV,A*02,0.0,0.14
CASSLAPGATNEKLFF,KMDYFSGQL,A*02,0.0,0.604
CASSLAPGATNEKLFF,RMEQVDWTV,A*02,0.0,0.217
CASSLAPGATNEKLFF,MMDFFNAQM,A*02,0.0,0.5720000000000001
CASSLAPGATNEKLFF,SMGVTYEM,A*02,0.0,0.55
CASSLAPGATNEKLFF,GMDYHNGHL,A*02,0.0,0.336
CASSLAPGATNEKLFF,RLDAMNGQL,A*02,0.0,0.451
CASSLAPGATNEKLFF,RMDYNNMQM,A*02,0.0,0.758
CASSLAPGATNEKLFF,TLEYMDWLV,A*02,0.0,0.08299999999999996
CASSLAPGATNEKLFF,LMDYWQGQL,A*02,0.0,0.47600000000000003
CASSLAPGATNEKLFF,YMGVSYEM,A*02,0.0,0.457
CASSLAPGATNEKLFF,VLDLFQGQL,A*02,0.0,0.912
CASSLAPGATNEKLFF,NMEYMTWDV,A*02,0.0,0.015800000000000036
CASSLAPGATNEKLFF,EMMLFDWKV,A*02,0.0,0.752
CASSLAPGATNEKLFF,NMMWFQGQL,A*02,0.0,0.625
CASSLAPGATNEKLFF,LMDMHNGQL,A*02,0.0,0.517
CASSLAPGATNEKLFF,TMDFYQGQL,A*02,0.0,0.49700000000000005
CASSLAPGATNEKLFF,FMGVTYEM,A*02,0.0,0.52
CASSLAPGATNEKLFF,TMETIDWKV,A*02,0.0,0.027000000000000024
CASSLAPGATNEKLFF,NVEYYDIKL,A*02,0.0,0.787
CASSLAPGATNEKLFF,YMGVVYEM,A*02,0.0,0.648
CASSLAPGATNEKLFF,SMDWFQGQM,A*02,0.0,0.787
CASSLAPGATNEKLFF,LLEDLDWDV,A*02,0.0,0.025000000000000026
CASSLAPGATNEKLFF,KLEFMDWRL,A*02,0.0,0.30400000000000005
CASSLAPGATNEKLFF,KMGVTYEM,A*02,0.0,0.53
CASSLAPGATNEKLFF,VLEEVDWLI,A*02,0.0,0.276
CASSLAPGATNEKLFF,TLEELDWCL,A*02,0.0,0.619
CASSLVNGLGYTF,WLDNFELCL,A*02,0.0,0.40399999999999997
CASSLVNGLGYTF,KLEQLDWTV,A*02,0.0,0.29100000000000004
CASSLVNGLGYTF,KMDYFSGQL,A*02,0.0,0.418
CASSLVNGLGYTF,RMEQVDWTV,A*02,0.0,0.412
CASSLVNGLGYTF,MMDFFNAQM,A*02,0.0,0.07499999999999996
CASSLVNGLGYTF,GMDYHNGHL,A*02,0.0,0.417
CASSLVNGLGYTF,RLDAMNGQL,A*02,0.0,0.341
CASSLVNGLGYTF,RMDYNNMQM,A*02,0.0,0.585
CASSLVNGLGYTF,TLEYMDWLV,A*02,0.0,0.381
CASSLVNGLGYTF,NLVPMVATV,A*02,0.0,0.5329999999999999
CASSLVNGLGYTF,LMDYWQGQL,A*02,0.0,0.327
CASSLVNGLGYTF,VLDLFQGQL,A*02,0.0,0.759
CASSLVNGLGYTF,NMEYMTWDV,A*02,0.0,0.5509999999999999
CASSLVNGLGYTF,EMMLFDWKV,A*02,0.0,0.5680000000000001
CASSLVNGLGYTF,NMMWFQGQL,A*02,0.0,0.132
CASSLVNGLGYTF,LMDMHNGQL,A*02,0.0,0.765
CASSLVNGLGYTF,TMDFYQGQL,A*02,0.0,0.474
CASSLVNGLGYTF,TMETIDWKV,A*02,0.0,0.447
CASSLVNGLGYTF,NVEYYDIKL,A*02,0.0,0.732
CASSLVNGLGYTF,SMDWFQGQM,A*02,0.0,0.31699999999999995
CASSLVNGLGYTF,LLEDLDWDV,A*02,0.0,0.5680000000000001
CASSLVNGLGYTF,KLEFMDWRL,A*02,0.0,0.466
CASSLVNGLGYTF,VLEEVDWLI,A*02,0.0,0.652
CASSLVNGLGYTF,TLEELDWCL,A*02,0.0,0.618
CASSPSGLAGSNLGNEQFF,KMDYFSGQL,A*02,0.0,0.483
CASSPSGLAGSNLGNEQFF,MMDFFNAQM,A*02,0.0,0.748
CASSPSGLAGSNLGNEQFF,SMGVTYEM,A*02,0.0,0.18100000000000005
CASSPSGLAGSNLGNEQFF,GMDYHNGHL,A*02,0.0,0.263
CASSPSGLAGSNLGNEQFF,RLDAMNGQL,A*02,0.0,0.18400000000000005
CASSPSGLAGSNLGNEQFF,RMDYNNMQM,A*02,0.0,0.29300000000000004
CASSPSGLAGSNLGNEQFF,NLVPMVATV,A*02,0.0,0.9790000000000001
CASSPSGLAGSNLGNEQFF,LMDYWQGQL,A*02,0.0,0.9209999999999999
CASSPSGLAGSNLGNEQFF,YMGVSYEM,A*02,0.0,0.17400000000000004
CASSPSGLAGSNLGNEQFF,VLDLFQGQL,A*02,0.0,0.624
CASSPSGLAGSNLGNEQFF,NMMWFQGQL,A*02,0.0,0.9309999999999999
CASSPSGLAGSNLGNEQFF,LMDMHNGQL,A*02,0.0,0.49700000000000005
CASSPSGLAGSNLGNEQFF,TMDFYQGQL,A*02,0.0,0.767
CASSPSGLAGSNLGNEQFF,FMGVTYEM,A*02,0.0,0.16100000000000006
CASSPSGLAGSNLGNEQFF,YMGVVYEM,A*02,0.0,0.8029999999999999
CASSPSGLAGSNLGNEQFF,SMDWFQGQM,A*02,0.0,0.366
CASSPSGLAGSNLGNEQFF,KMGVTYEM,A*02,0.0,0.17400000000000004
CASSRDFVSNEQYF,WLDNFELCL,A*02,0.0,0.112
CASSRDFVSNEQYF,KLEQLDWTV,A*02,0.0,0.017000000000000015
CASSRDFVSNEQYF,RMEQVDWTV,A*02,0.0,0.06899999999999995
CASSRDFVSNEQYF,SMGVTYEM,A*02,0.0,0.5860000000000001
CASSRDFVSNEQYF,GMDYHNGHL,A*02,0.0,0.591
CASSRDFVSNEQYF,RLDAMNGQL,A*02,0.0,0.616
CASSRDFVSNEQYF,RMDYNNMQM,A*02,0.0,0.6459999999999999
CASSRDFVSNEQYF,TLEYMDWLV,A*02,0.0,0.06399999999999995
CASSRDFVSNEQYF,NLVPMVATV,A*02,0.0,0.03700000000000003
CASSRDFVSNEQYF,YMGVSYEM,A*02,0.0,0.6779999999999999
CASSRDFVSNEQYF,VLDLFQGQL,A*02,0.0,0.09699999999999998
CASSRDFVSNEQYF,NMEYMTWDV,A*02,0.0,0.13
CASSRDFVSNEQYF,EMMLFDWKV,A*02,0.0,0.263
CASSRDFVSNEQYF,LMDMHNGQL,A*02,0.0,0.09399999999999996
CASSRDFVSNEQYF,FMGVTYEM,A*02,0.0,0.611
CASSRDFVSNEQYF,TMETIDWKV,A*02,0.0,0.04900000000000005
CASSRDFVSNEQYF,NVEYYDIKL,A*02,0.0,0.08899999999999998
CASSRDFVSNEQYF,YMGVVYEM,A*02,0.0,0.17100000000000004
CASSRDFVSNEQYF,LLEDLDWDV,A*02,0.0,0.07399999999999995
CASSRDFVSNEQYF,KLEFMDWRL,A*02,0.0,0.11
CASSRDFVSNEQYF,KMGVTYEM,A*02,0.0,0.5660000000000001
CASSRDFVSNEQYF,VLEEVDWLI,A*02,0.0,0.10299999999999998
CASSRDFVSNEQYF,TLEELDWCL,A*02,0.0,0.09999999999999998
CASSRDTVNTEAFF,WLDNFELCL,A*02,0.0,0.782
CASSRDTVNTEAFF,KLEQLDWTV,A*02,0.0,0.937
CASSRDTVNTEAFF,RMEQVDWTV,A*02,0.0,0.96
CASSRDTVNTEAFF,SMGVTYEM,A*02,0.0,0.402
CASSRDTVNTEAFF,TLEYMDWLV,A*02,0.0,0.966
CASSRDTVNTEAFF,NLVPMVATV,A*02,0.0,0.95
CASSRDTVNTEAFF,YMGVSYEM,A*02,0.0,0.29200000000000004
CASSRDTVNTEAFF,NMEYMTWDV,A*02,0.0,0.997
CASSRDTVNTEAFF,EMMLFDWKV,A*02,0.0,0.914
CASSRDTVNTEAFF,TMDFYQGQL,A*02,0.0,0.36700000000000005
CASSRDTVNTEAFF,FMGVTYEM,A*02,0.0,0.276
CASSRDTVNTEAFF,TMETIDWKV,A*02,0.0,0.9059999999999999
CASSRDTVNTEAFF,NVEYYDIKL,A*02,0.0,0.0023999999999999573
CASSRDTVNTEAFF,YMGVVYEM,A*02,0.0,0.09199999999999996
CASSRDTVNTEAFF,LLEDLDWDV,A*02,0.0,0.7879999999999999
CASSRDTVNTEAFF,KLEFMDWRL,A*02,0.0,0.505
CASSRDTVNTEAFF,KMGVTYEM,A*02,0.0,0.411
CASSRDTVNTEAFF,VLEEVDWLI,A*02,0.0,0.8180000000000001
CASSRDTVNTEAFF,TLEELDWCL,A*02,0.0,0.454
57 changes: 45 additions & 12 deletions pMTnet/epitope_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@

class EpitopeDataSlim:

def __init__(self, epitopes, chain, folder, duplicate=True, predict=False, output='output',
prediction_path='prediction.csv'):
def __init__(self, chain, folder, file=None, epitopes=None, duplicate=True, predict=False, output='output',
duplicate_name=None, prediction_path='prediction.csv'):
"""
Args:
file (str): name of data file
chain (str): specifies which chain to use
epitopes (list): list of epitopes to plot ROC and predict
folder (str): path to folder of data file (now it has a default name)
Expand All @@ -21,7 +22,9 @@ def __init__(self, epitopes, chain, folder, duplicate=True, predict=False, outpu
"""
self.epitopes = epitopes
self.chain = chain
self.data = self.filter_data(self.read(f'{folder}/vdjdb.slim.txt'))
self.duplicate_name = duplicate_name
filename = f'{folder}/vdjdb.slim.txt' if not file else f'{folder}/{file}'
self.data = self.filter_data(self.read(filename))
self.duplicate = self.add_duplicates() if duplicate else None
self.make_prediction = predict
self.output = output
Expand Down Expand Up @@ -52,23 +55,27 @@ def filter_data(self, data, hla='HLA-A*02', species='HomoSapiens'):
Returns:
pd.DataFrame ready for pMTnet
"""

# this is a mapping to rename columns
mapper = {'cdr3': 'CDR3', 'antigen.epitope': 'Antigen', 'mhc.a': 'HLA'}

# filtering data by species, HLA and epitopes
data = data.loc[(data.species == species) & (data['antigen.epitope'].isin(self.epitopes)) &
(data['mhc.a'].str.startswith(hla))]
data = data.loc[(data.species == species) & (data['mhc.a'].str.startswith(hla))]

# if a set of epitopes was given, only they are left in the dataset
if self.epitopes:
data = data.loc[(data['antigen.epitope'].isin(self.epitopes))]
else:
self.epitopes = data['antigen.epitope'].unique()

# renaming columns
data = data[['cdr3', 'antigen.epitope', 'mhc.a']].rename(mapper=mapper, axis=1)

# removing 'HLA-' part as in testing_data.csv from pMTnet
# now keeping only primary part of HLA name
data.HLA = hla[-4:]
data.HLA = data.HLA.str[4:]

# removing duplicated data
data.drop_duplicates(subset=['CDR3', 'Antigen'], inplace=True)
data = data.drop_duplicates(subset=['CDR3', 'Antigen', 'HLA']).dropna(subset=['CDR3', 'Antigen'])
return data

def save_data(self, name, index=False):
Expand Down Expand Up @@ -99,17 +106,15 @@ def add_duplicates(self):

# else for each row with CDR - Antigen
for x in cdr_data.itertuples(index=False):

# a set of epitopes which are missing
# TODO: this step may add exhaustive duplicates which are removed in the end
rest_epitopes = set(self.epitopes) - {x.Antigen}

# for each epitope that is missing (see TODO)
for epitope in rest_epitopes:

# adding a dict with the current CDR3 and false epitope
duplicate_data.append({'CDR3': x.CDR3, 'Antigen': epitope,
'HLA': x.HLA})
'HLA': self.data.loc[self.data.Antigen == epitope].HLA.values[0]})
return pd.DataFrame(data=duplicate_data).drop_duplicates()

def predict(self, input_data):
Expand Down Expand Up @@ -140,6 +145,11 @@ def prepare_prediction(self):
# if there are duplicates prepared
if len(self.duplicate):

# removing duplicates from duplicates :) removing rows that are true (exist in self.data)
self.duplicate = self.duplicate.loc[
~self.duplicate.set_index(['CDR3', 'Antigen']).index.isin(
self.data.set_index(['CDR3', 'Antigen']).index)].dropna()

# setting label to 0 as these rows are false
self.duplicate['label'] = 0

Expand All @@ -156,7 +166,10 @@ def prepare_prediction(self):
self.data['label'] = 1

# creating a name to save the dataset
name = '_'.join(self.epitopes)
if not self.duplicate_name:
name = '_'.join(self.epitopes)
else:
name = self.duplicate_name
name = f'duplicate_{name}_{self.chain}.csv'
if not len(self.duplicate):
name = 'no_' + name
Expand Down Expand Up @@ -211,3 +224,23 @@ def roc(self):
ax.legend(loc='lower right')
i += 1
fig.tight_layout(rect=[0, 0.03, 1, 0.95])


class EpitopeDataVdjDB(EpitopeDataSlim):
    """EpitopeDataSlim variant for data files with one CDR3 column per chain.

    Only ``read`` is overridden: the column ``cdr3.alpha`` or ``cdr3.beta``
    (picked from ``self.chain``) is renamed to ``cdr3``.
    """

    def read(self, data):
        """
        Reading data for one chain
        Args:
            data (str): path to data file
        Returns:
            pd.DataFrame with read data for one chain; rows missing the chain's
            CDR3 or the epitope are dropped and the CDR3 column is named 'cdr3'
        """
        chain_key = self.chain.lower()

        # 'beta' itself ends with 'a', so it must be recognised before the
        # suffix test; otherwise chain='beta' would select the alpha column.
        # Every other value keeps the suffix rule: '...a' -> alpha, else beta.
        if chain_key == 'beta' or not chain_key.endswith('a'):
            chain = 'beta'
        else:
            chain = 'alpha'

        cdr3_column = f'cdr3.{chain}'
        data = pd.read_csv(data).dropna(subset=[cdr3_column, 'antigen.epitope'])

        # this is a mapping to rename columns
        mapper = {cdr3_column: 'cdr3'}

        return data.rename(mapper=mapper, axis=1)
Loading

0 comments on commit af67ca9

Please sign in to comment.