-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathcombineRelevantMeSH.py
More file actions
30 lines (21 loc) · 880 Bytes
/
combineRelevantMeSH.py
File metadata and controls
30 lines (21 loc) · 880 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import argparse
import json
from tqdm import tqdm
import os
import gzip
def combineMeshFiles(inDir, progress=None):
    """Merge every ``*.json.gz`` file in a directory into a single dict.

    Files are processed in sorted filename order and merged with
    ``dict.update``, so when the same key appears in several files the value
    from the last file (in that order) wins.

    Args:
        inDir: Directory to scan (non-recursively) for ``*.json.gz`` files.
            Each file is expected to contain one JSON object (presumably
            PMID -> relevant MeSH terms; verify against the producer).
        progress: Optional callable that wraps the list of filenames in a
            progress-reporting iterable (e.g. ``tqdm``). When ``None`` the
            filenames are iterated directly.

    Returns:
        The merged dict; empty when the directory has no matching files.
    """
    filenames = sorted(f for f in os.listdir(inDir) if f.endswith('.json.gz'))
    iterable = filenames if progress is None else progress(filenames)

    pmidToMesh = {}
    for filename in iterable:
        # JSON is UTF-8; be explicit so decoding does not depend on the locale.
        with gzip.open(os.path.join(inDir, filename), 'rt', encoding='utf-8') as f:
            pmidToMesh.update(json.load(f))
    return pmidToMesh


if __name__ == '__main__':
    # The first positional parameter of ArgumentParser is ``prog``, so the
    # help text must be passed as ``description`` to show up correctly.
    parser = argparse.ArgumentParser(description='Combine all the MeSH data into a single file')
    parser.add_argument('--inDir', required=True, type=str, help='Input directory of MeSH terms')
    parser.add_argument('--outJSONGZ', required=True, type=str, help='JSON GZ file with PMIDs mapped to relevant MeSH terms')
    args = parser.parse_args()

    pmidToMesh = combineMeshFiles(args.inDir, progress=tqdm)

    print("Combined %d PubMed ID(s) with relevant MeSH terms" % len(pmidToMesh))

    with gzip.open(args.outJSONGZ, 'wt', encoding='utf-8') as f:
        json.dump(pmidToMesh, f)