# Source: NLM_MeSH_To_CSV/XML_Parser.py (GitHub repository ClarkBXu/NLM_MeSH_To_CSV, master branch)
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 12:01:19 2020

@author: CBX033
"""

import csv
import urllib.request as request
import xml.etree.ElementTree as ET

def addElements(record,recordInfo,featureList):
    """
    Copy the stripped text of selected child elements into recordInfo.

    Input: record - node from XML File (searched with record.find),
           recordInfo - dict of info about record, updated in place,
           featureList - list of paths/tags to look up under record
    Output: the same recordInfo dict, with one entry per feature that was
            found and has text; the key is the tag of the matched element
            and the value is its own text with surrounding whitespace
            removed.

    Only the matched element's own .text is read; text held by nested
    child elements is not collected. A feature that is absent, or whose
    element has no text, is reported on stdout and left out of recordInfo.
    """
    for wanted in featureList:
        try:
            #Look the feature up and pull its cleaned text and tag
            node = record.find(wanted)
            cleaned = node.text.strip()
            key = node.tag
        except AttributeError:
            #Raised when find() returned None or the element's text is None
            print(wanted,'of type',type(wanted),'in record',record,'is missing text method')
        else:
            #Store the text under the element's tag
            recordInfo[key] = cleaned
    return recordInfo

def loadXML(url):
    """
    Download the raw contents of an XML file.

    Input: url - str for URL of XML File (any scheme urlopen can handle)
    Returns: data - bytes object of XML File
    Raises: whatever urlopen raises for an unreachable or invalid URL
            (for example urllib.error.URLError)
    """
    #This file imports the module as `request` (import urllib.request as
    #request), so the bare name `urllib` is not bound; call through the alias.
    #The with-block closes the connection even if read() fails.
    with request.urlopen(url) as response:
        data = response.read()
    return data

def parseXML(data,recordAttribute,recordFeatures):
    """
    Turn an XML document into one info dict per child of the root element.

    Input: data - bytes object of XML File,
           recordAttribute - key under which each record's attributes are
                             stored in its info dict
           recordFeatures - features (child tags) to extract from each record
    Returns: descriptorRecords - list of dict with record info

    Note: the value stored under recordAttribute is the record's entire
    attribute mapping (record.attrib), not a single attribute value.
    """
    tree_root = ET.fromstring(data)

    #Each direct child of the root is one record: seed its dict with the
    #attribute mapping, then let addElements fill in the requested features
    return [
        addElements(child,{recordAttribute: child.attrib},recordFeatures)
        for child in tree_root
    ]

def createCSV(data,headers,fileName):
    """
    Write a sequence of row dicts to a CSV file with a header line.

    Input: data - iterable of dict, one per CSV row,
           headers - list of csv headers (column order),
           fileName - str for csv file name
    Output: Writes csv to directory under fileName and returns True

    The file is opened with errors="ignore", so characters the default
    text encoding cannot represent are dropped instead of raising.
    """
    #newline='' lets the csv module control line endings itself
    with open(fileName,mode='w',newline='',errors="ignore") as out_file:
        dict_writer = csv.DictWriter(out_file,fieldnames=headers)
        #Header line first, then one line per record
        dict_writer.writeheader()
        for row in data:
            dict_writer.writerow(row)
    return True

def main():
    """
    Download the 2020 MeSH descriptor XML, parse it, and write it to
    MeSH_2020_Desc.csv, printing a progress message after each stage
    that produced a truthy result.
    """
    #Column definitions shared by the parsing and CSV stages
    attribute = 'DescriptorClass'
    features = ['DescriptorUI','NLMClassificationNumber','DescriptorName','DateCreated',
                'Annotation','AllowableQualifiersList','PharmacologicalActionList',
                'OnlineNote','HistoryNote','PublicMeSHNote']

    meshData = loadXML(url='ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/xmlmesh/desc2020.xml')
    if meshData:
        print('XML loading complete')

    xmlData = parseXML(data=meshData,
                       recordAttribute=attribute,
                       recordFeatures=features)
    if xmlData:
        print('XML parsing complete')

    #CSV columns are the record attribute followed by every parsed feature
    if createCSV(data=xmlData,
                 headers=[attribute] + features,
                 fileName='MeSH_2020_Desc.csv'):
        print("Script Done")

#Run the conversion only when this file is executed as a script, so that
#importing the module does not trigger the download and CSV write.
if __name__ == '__main__':
    main()