-
Notifications
You must be signed in to change notification settings - Fork 1
/
queryPubMed.py
128 lines (115 loc) · 4.31 KB
/
queryPubMed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
##
import urllib
import os
import random
import csv
#from processData import Process
#import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from Bio import Medline
from Bio import Entrez
import math
import numpy as np
import matplotlib.pyplot as plt
# Contact address for the NCBI E-utilities requests made through Bio.Entrez
# below (NCBI asks API users to identify themselves; see the Biopython docs).
Entrez.email = 'zhiguo.yu@uth.tmc.edu'
def queryPubMed(mesh, docpath, sample_size=3000, repeats=5, batch_size=100):
    """Download random samples of PubMed records indexed under MeSH terms.

    Runs an ESearch for articles tagged with any of the terms in ``mesh``,
    then, ``repeats`` times, draws a random sample of the returned PubMed ids,
    fetches those records in Medline format and writes two files into
    ``docpath``:

    * ``docs_<sample_size>_<j>.txt``   - title, book title and abstract,
      one record per line;
    * ``meshes_<sample_size>_<j>.txt`` - the record's MeSH headings, each
      followed by ``|``, one record per line.

    Records without MeSH headings are skipped in both files so the two files
    stay line-aligned.

    Args:
        mesh: non-empty sequence of MeSH term strings; they are OR-ed together.
        docpath: output directory, created if it does not exist.
        sample_size: number of ids drawn per repetition. When the search
            returns fewer ids, all of them are used (the file names still
            carry ``sample_size``).
        repeats: number of independent random samples to write.
        batch_size: number of ids sent to EFetch per request.

    Raises:
        ValueError: if ``mesh`` is empty.
    """
    # Imported locally so the block runs on Python 2 (which this script was
    # written for) as well as Python 3, where these helpers moved.
    try:
        from urllib.parse import quote
        from urllib.request import urlopen
    except ImportError:
        from urllib import quote, urlopen

    if not mesh:
        raise ValueError('mesh must contain at least one MeSH term')

    if not os.path.isdir(docpath):
        os.makedirs(docpath)

    # OR together every term, not just the first two.
    query = ' OR '.join('"%s"[MeSH Terms]' % term for term in mesh)
    print(query)

    # Percent-encode the term so spaces, quotes, brackets and characters such
    # as '&' or '+' inside a MeSH term cannot corrupt the query string.
    # NOTE(review): NCBI serves E-utilities over HTTPS; confirm this http://
    # endpoint is still reachable before relying on it.
    esearch = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&mindate=1945&maxdate=2016&retmode=xml&retmax=10000000&term=%s' % quote(query)
    handle = urlopen(esearch)
    try:
        data = handle.read()
    finally:
        handle.close()
    root = ET.fromstring(data)
    ids = [x.text for x in root.findall("IdList/Id")]
    print('Got %d articles' % len(ids))

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample at the number of ids actually returned.
    count = min(sample_size, len(ids))
    if count < sample_size:
        print('Only %d articles available; sampling %d instead of %d'
              % (len(ids), count, sample_size))

    print('Randomly Retrieving %d docs %d times' % (sample_size, repeats))
    for j in range(repeats):
        print(j)
        docs_path = os.path.join(docpath, 'docs_%d_%d.txt' % (sample_size, j))
        mesh_path = os.path.join(docpath, 'meshes_%d_%d.txt' % (sample_size, j))
        idlist = random.sample(ids, count)
        # 'with' guarantees both files are closed even if a fetch fails.
        with open(docs_path, 'w') as f1, open(mesh_path, 'w') as f2:
            # Fetch in batches rather than sending every id in one request.
            for start in range(0, count, batch_size):
                handle = Entrez.efetch(db="pubmed",
                                       id=idlist[start:start + batch_size],
                                       rettype="medline", retmode="text")
                try:
                    records = list(Medline.parse(handle))
                finally:
                    handle.close()
                for record in records:
                    _write_medline_record(record, f1, f2)


def _write_medline_record(record, docs_file, mesh_file):
    """Append one Medline record to the docs/MeSH files if it has MeSH headings."""
    headings = record.get("MH", "")
    if not headings:
        return
    # Every heading is followed by '|', including the last one.
    mesh_file.write(''.join(item + '|' for item in headings))
    mesh_file.write("\n")
    # writelines() accepts both a plain string and a list of strings, which
    # covers whichever form the Medline parser returns for a field.
    for key in ("TI", "BTI", "AB"):
        docs_file.writelines(record.get(key, ""))
    docs_file.write("\n")
def query_MeSH(pairs_path='mapped_mesh_pairs.txt'):
    """Run queryPubMed once for every distinct MeSH entry in the pairs file.

    Each line of ``pairs_path`` is split on ':' into entries. An entry that
    contains '/' is split again into alternative terms that are passed to
    queryPubMed together; any other entry is passed as a one-element list.
    Results for an entry are written to a directory named ``'./' + str(mesh)``.

    Args:
        pairs_path: path of the ':'-separated MeSH pairs file.
    """
    meshes = []
    # Track the raw entry strings already seen. Comparing the raw string with
    # the list of term lists can never match, so that check would not
    # filter duplicates.
    seen = set()
    with open(pairs_path, 'r') as pairs_file:
        for line in pairs_file:
            for item in line.strip().split(':'):
                # Blank lines and empty fields would become an empty-term query.
                if not item or item in seen:
                    continue
                seen.add(item)
                # split('/') yields [item] when there is no '/', so one call
                # covers both the single-term and the multi-term case.
                meshes.append(item.split('/'))

    for mesh in meshes:
        print(mesh)
        docpath = './' + str(mesh)
        queryPubMed(mesh, docpath)
# Script entry point: the whole retrieval starts as soon as this module is
# executed. NOTE(review): there is no `if __name__ == "__main__"` guard, so
# importing the module triggers it as well.
query_MeSH()