# Scrape/GetISCED.py at master · mobcdi/Scrape · GitHub
'''
Created on 7 Jan 2016

@author: michael.obrien
'''

import os
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import unicodecsv as csv
import re
from _csv import QUOTE_NONE


def main(OutputFileName="DITISCEDCodes.csv", FileDelimiter=";"):
    """Collect the ISCED code from every HTML file in the current directory.

    Each ``*.html`` file found in ``os.curdir`` is parsed with BeautifulSoup,
    restricted to ``<div class="progmod_content">`` so the rest of the page
    is not parsed.  The markup of interest looks like::

        <span class="inline_label">ISCED:</span>
        <span class="inline_value">0410: Business &amp; Admin not defined</span>

    There is no unique selector for the value, so the label text is located
    first and the value is read from the next sibling element of the label's
    parent.  One ``Course;ISCEDCode`` row is written (and echoed to stdout)
    per match, where ``Course`` is the file name without its extension.

    Args:
        OutputFileName: Path of the CSV file to create; opened with mode
            ``"wb"``, so an existing file is truncated.
        FileDelimiter: Field delimiter used in the CSV file.

    Note:
        Fields are written with ``QUOTE_NONE`` and no escape character, so
        per the csv module documentation a value that contains the delimiter
        makes the writer raise an error instead of being quoted.
    """
    # Strain on the containing div so only that section is parsed.
    isced_strainer = SoupStrainer("div", class_="progmod_content")
    # Compiled once; reused for every file.
    isced_pattern = re.compile(r'ISCED')

    # Binary mode: the unicodecsv writer is handed a bytes file, as before.
    # The ``with`` block guarantees the file is closed (the original
    # referenced ``close`` without calling it).
    with open(OutputFileName, "wb") as csv_file:
        # quotechar=None is the supported way to say "no quote character"
        # together with QUOTE_NONE; an empty string is rejected by newer
        # versions of the csv module.
        writer = csv.writer(csv_file, delimiter=FileDelimiter,
                            quoting=csv.QUOTE_NONE, quotechar=None)

        # Header row
        writer.writerow(['Course', 'ISCEDCode'])

        for fileToProcess in os.listdir(os.curdir):
            if not fileToProcess.endswith(".html"):
                continue

            # Close each input file as soon as it has been parsed.
            with open(fileToProcess) as html_file:
                soup = BeautifulSoup(html_file, "html.parser",
                                     parse_only=isced_strainer)

            # File name without the extension; same for every match in a file.
            course_name = str(os.path.splitext(fileToProcess)[0])

            for label_text in soup(text=isced_pattern):
                # Go up from the label text to its enclosing tag, then across
                # to the next sibling element, which holds the value.
                value_tag = label_text.parent.find_next_sibling()
                if value_tag is None:
                    # Label without a following value element: nothing to record.
                    continue

                # Strip the surrounding whitespace that comes with the tag text.
                isced_code = value_tag.get_text().strip()
                print(course_name, isced_code)
                writer.writerow([course_name, isced_code])
                # Keep the file on disk current while the folder is processed.
                csv_file.flush()

    print("Finished processing the files")

# Run the scrape with the default output file name and delimiter when this
# file is executed as a script (not when it is imported).
if __name__ == '__main__':
    main()