DataTraining/headers.py at master · DataPlumbers/DataTraining · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import csv
import spacys_mom as spm
import wordninja as wj
import numpy as np

### SPACY STUFF

nlp = spm.SpacyWrapper()

def find_similar(input, filepath):
    category = input[0]
    properties = input[1]
    # Load spaCy's NLP dictionaries
    # Slice and Lemmatize properties
    properties_sl = []
    for prop in properties:
        prop_sl = split_and_lemmatize(prop)
        properties_sl.append(prop_sl)
    # Get ALL column headers from file
    headers = get_headers(filepath)
    # Slice and Lemmatize all headers
    headers_sl = []
    for header in headers:
        header_sl = split_and_lemmatize(header)
        headers_sl.append(header_sl)
    # Compare each property to each header
    classification = {}
    for index in range(len(properties)):
        classification[properties[index]] = cmp_prop_to_headers(properties[index], headers, properties_sl[index], headers_sl)
    # Populate dict and return
    results = {}
    results["category"] = category
    results["classification"] = classification
    return results

### Comparison Functions

# NOTE: Both prop_sl and headers_sl may be broken
#       into multiple English words, so they are
#       ARRAYS not STRINGS
def cmp_prop_to_headers(orig_prop, orig_headers, prop_sl, headers_sl):
    # Loop over each property's words
    related_headers = []
    for index in range(len(orig_headers)):
        header_sl = headers_sl[index]
        is_related = cmp_prop_to_header(prop_sl, header_sl)
        if is_related:
            related_headers.append(orig_headers[index])
    return related_headers

def cmp_prop_to_header(prop_sl, header_sl):
    means_all = []
    for word in prop_sl:
        word_spacy = nlp.process(word)
        cmp_values = []
        for header in header_sl:
            header_spacy = nlp.process(header)
            sim_value = nlp.compare(word_spacy, header_spacy)
            # If a lemmatized word from both header and property
            # are very closely related, return it regardless
            # of overall average.
            if sim_value > 0.95:
                return True
            else:
                cmp_values.append(sim_value)
        mean_hdr = np.mean(cmp_values)
        means_all.append(mean_hdr)
    mean_all = np.mean(means_all)
    # TODO: Strictness of "relatedness" needs tweaking.
    if mean_all > 0.8:
        return True
    else:
        return False

### Util Functions

def split_and_lemmatize(input):
    split_words = slice_word(input)
    input_lemma = []
    for word in split_words:
        word_tok = nlp.process(word)
        word_lemma = lemmatize_word(word_tok)
        input_lemma.append(word_lemma)
    return input_lemma

# Params: input = a spaCy token
# Return: String
# EX: "reviews" -> "review"
# EX: "thought" -> "think"
def lemmatize_word(input):
    return input[0].lemma_

# Params: input = given String
# Return: [sub-word1, sub-word2, ...]
# EX: "review_date" -> ['review', 'date']
# EX: "reviewernameslast" -> ['reviewer', 'names', 'last']
def slice_word(input):
    return wj.split(input)

# Params: file = path to given CSV file as a String
# Return: [ header1, header2, ... ]
def get_headers(file):
    headers = []
    with open(file) as f:
        reader = csv.reader(f)
        headers = next(reader)
    return headers