-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_hallmark_abstracts.py
76 lines (58 loc) · 2.27 KB
/
get_hallmark_abstracts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from __future__ import print_function
from Bio import Entrez
from Bio import Medline
from Bio.Entrez import efetch, read
from pandas import *
# Build a CSV training set of PubMed abstracts: run one PubMed search per
# entry in `hallmark_queries`, download the hits in MEDLINE format, keep the
# records whose abstract contains the substring 'cancer', and write their
# PMID, title and abstract to 'training_set.csv'.

# Contact address sent to NCBI with every Entrez request.
# NOTE(review): this value looks like a scrape/obfuscation placeholder rather
# than a real address -- confirm and set a real one before running.
Entrez.email = '[email protected]'

# Maximum number of PubMed IDs requested per search term (esearch `retmax`).
max_res = 500

# Search terms sent to PubMed; each entry triggers one esearch + one efetch.
hallmark_queries = [
    'proliferation receptor',
    'growth factor',
    'cell cycle',
    'contact inhibition',
    'apoptosis',
    'necrosis',
    'autophagy',
    'senescence',
    'immortalization',
    'angiogenesis',
    'angiogenic factor',
    'metastasis',
    'mutation',
    'DNA repair',
    'adducts',
    'DNA damage',
    'inflammation',
    'oxidative stress',
    'warburg effect',
    'growth',
    'activation',
    'immune system',
]

# Rows of [PMID, title, abstract] collected across all queries.
final_list = []

# Not read anywhere in the visible script; kept so the module-level name
# still exists for anything that may rely on it.
debug_index = len(hallmark_queries) - 1

# Column labels for the output table, in the same order as each row.
column_names = ["PMID", "Title", "Abstract"]


def _fetch_medline_records(term, limit):
    """Return the MEDLINE records for up to `limit` PubMed hits of `term`.

    Runs an esearch for `term`, then an efetch (rettype "medline", text mode)
    for the returned IDs, and parses the response with `Medline.parse`.
    Returns a list of parsed records; the list is empty when the search
    yields no IDs. Both network handles are closed even if reading or
    parsing raises.
    """
    search_handle = Entrez.esearch(db='pubmed', term=term, retmax=limit)
    try:
        search_result = Entrez.read(search_handle)
    finally:
        search_handle.close()

    pubmed_ids = search_result["IdList"]
    if not pubmed_ids:
        # Nothing matched: skip the efetch call instead of sending it an
        # empty ID list.
        return []

    fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_ids,
                                 rettype="medline", retmode="text")
    try:
        # Medline.parse is lazy, so materialise it before closing the handle.
        return list(Medline.parse(fetch_handle))
    finally:
        fetch_handle.close()


# Iterate over every query. The previous `range(0, len(...) - 1)` loop
# silently skipped the last term.
for query in hallmark_queries:
    for record in _fetch_medline_records(query, max_res):
        # "AB" is the abstract field; records without one fall back to "?".
        abstract = record.get("AB", "?")
        # Case-sensitive substring filter, as in the original script.
        if 'cancer' in abstract:
            final_list.append(
                [record.get("PMID", "?"), record.get("TI", "?"), abstract]
            )

# `DataFrame` comes from `from pandas import *`; the bare name `pandas` is
# never bound by that import. Passing `columns=` to the constructor also
# keeps this working when `final_list` is empty.
final_df = DataFrame(final_list, columns=column_names)
print(final_df)
final_df.to_csv('training_set.csv', sep=',', encoding='utf-8')