#!/usr/bin/env python
"""Recursive affinity-propagation clustering of a pairwise similarity list.

Reconstructed from git patch 38c0534 ("Update sklearn.affinity.py add
recursive subcluster function.").

Usage::

    sklearn.affinity.py INPUT.csv OUTPUT.csv [key1:value1|key2:value2]

INPUT.csv
    First row: the number of distinct nodes (first column).  Every
    following row: ``nodeA,nodeB,similarity``.
OUTPUT.csv
    One row per node: the node identifier followed by the cluster label
    it received at each clustering level.

Recognised optional parameters (third argument):

damping
    Damping factor handed to AffinityPropagation (default 0.92).
network
    Path of a JSON file describing the clustered network as ``nodes`` and
    ``links``.  Nothing is written when the key is absent.

The code is written to run on both Python 2 and Python 3; only the way
csv files are opened differs between the two.
"""
import csv
import getopt
import json
import sys

import numpy as np
import scipy.sparse
from scipy.sparse import csr_matrix
# NOTE(review): the source patch elides two lines of the import block at
# this point.  AffinityPropagation is required by the code below; confirm
# nothing else was imported there.
from sklearn.cluster import AffinityPropagation
# NOTE(review): make_blobs and StandardScaler (like getopt and the scipy
# names above) are not referenced in this file; they are kept so the
# module's import surface does not change.  make_blobs is imported from the
# public ``sklearn.datasets`` package because the old
# ``sklearn.datasets.samples_generator`` module no longer exists in current
# scikit-learn releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Damping used when the caller does not supply one.
DEFAULT_DAMPING = .92
# A cluster is re-clustered only when it has MORE nodes than this.
SUBCLUSTER_THRESHOLD = 5

USAGE = ('usage: sklearn.affinity.py INPUT.csv OUTPUT.csv '
         '[key1:value1|key2:value2]')


def _open_csv(path, mode):
    """Open *path* the way the csv module expects on this interpreter."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode, newline translation left to the csv module.
        return open(path, mode, newline='')
    # Python 2: the csv module wants binary files.
    return open(path, mode + 'b')


def _parse_params(paramString):
    """Turn ``key1:value1|key2:value2`` into a dict of strings."""
    params = dict()
    for thisParam in paramString.split("|"):
        # Split on the first ':' only, so values such as Windows paths
        # ("network:D:/out.json") survive intact.
        key, sep, value = thisParam.partition(":")
        if sep:
            params[key] = value
    return params


def _read_similarities(inputFile):
    """Read the input csv.

    Returns ``(simMatrix, indexList, nodeMap, links)`` where

    * ``simMatrix`` is the symmetric similarity matrix with a diagonal of 1,
    * ``indexList[i]`` is the identifier of the node at matrix index ``i``,
    * ``nodeMap[identifier]`` is ``[matrixIndex]`` (cluster labels are
      appended to this list later),
    * ``links`` is a list of ``{'source', 'target', 'value'}`` dicts, one
      per input row.

    Raises ValueError when the file is empty or when the node count in the
    header disagrees with the number of distinct nodes in the rows.
    """
    nodeMap = dict()
    indexList = list()
    links = list()

    with _open_csv(inputFile, 'r') as csvfile:
        csvReader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # First line is the number of distinct nodes.
        headerRow = next(csvReader, None)
        if not headerRow:
            raise ValueError('%s: missing node-count header row' % inputFile)
        nodeCount = int(headerRow[0])
        simMatrix = np.zeros((nodeCount, nodeCount), dtype=float)

        def nodeIndex(name):
            # Matrix indices are handed out in order of first appearance.
            if name not in nodeMap:
                if len(indexList) >= nodeCount:
                    raise ValueError(
                        '%s: more than the %d distinct nodes announced in '
                        'the header' % (inputFile, nodeCount))
                nodeMap[name] = [len(indexList)]
                indexList.append(name)
            return nodeMap[name][0]

        for row in csvReader:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            thisI = nodeIndex(row[0])
            thisJ = nodeIndex(row[1])
            value = float(row[2])
            # matrix is symmetric
            simMatrix[thisI, thisJ] = value
            simMatrix[thisJ, thisI] = value
            links.append({'source': thisI, 'target': thisJ, 'value': value})

    if len(indexList) != nodeCount:
        raise ValueError(
            '%s: header announces %d nodes but the rows name %d'
            % (inputFile, nodeCount, len(indexList)))

    # Set all of the diagonals to 1 (done last, so it wins over any
    # self-similarity row in the input).
    for i in range(nodeCount):
        simMatrix[i, i] = 1.

    return simMatrix, indexList, nodeMap, links


def _fit_labels(matrix, damping):
    """Cluster a precomputed similarity matrix.

    Returns ``(labels, clusterCount)``; the label -1 (noise, if present) is
    not counted as a cluster.
    """
    db = AffinityPropagation(affinity='precomputed', damping=damping)
    labels = db.fit_predict(matrix)
    clusterCount = len(set(labels)) - (1 if -1 in labels else 0)
    return labels, clusterCount


def main(argv):
    """Cluster the nodes of INPUT.csv and write their labels to OUTPUT.csv.

    argv is the argument list WITHOUT the program name:
    ``[inputFile, outputFile]`` optionally followed by a parameter string
    in the format ``key1:value1|key2:value2`` (see the module docstring).
    Exits with a usage message when fewer than two arguments are given.
    """
    if len(argv) < 2:
        sys.exit(USAGE)
    inputFile = argv[0]
    outputFile = argv[1]
    params = _parse_params(argv[2]) if len(argv) > 2 else dict()
    # Fall back to the default when no (or no "damping") parameter is given.
    thisDamping = float(params.get('damping', DEFAULT_DAMPING))
    networkFile = params.get('network')

    print('Input file is: %s' % inputFile)
    print('Output file is: %s' % outputFile)
    print('thisDamping is: %s' % str(thisDamping))

    simMatrix, indexList, nodeMap, links = _read_similarities(inputFile)
    print('%d %d' % simMatrix.shape)

    if indexList:
        # do the top-level clustering
        labels, n_clusters_ = _fit_labels(simMatrix, thisDamping)
        print('Estimated number of clusters: %d' % n_clusters_)

        # Always record the top-level label, even when everything ended up
        # in one cluster: the output below reads it for every node.
        for nodeMapKey, label in zip(indexList, labels):
            nodeMap[nodeMapKey].append(int(label))

        # Only look for sub-clusters when there is more than one cluster.
        if n_clusters_ > 1:
            print('%s %d' % (labels, len(labels)))
            nodeMap = subcluster(labels, list(indexList), nodeMap,
                                 simMatrix, thisDamping)

    with _open_csv(outputFile, 'w') as csvoutfile:
        csvWriter = csv.writer(csvoutfile, delimiter=',', quotechar='|')
        for nodeMapKey in indexList:
            # identifier, then one label per clustering level
            csvWriter.writerow([nodeMapKey] + nodeMap[nodeMapKey][1:])

    if networkFile:
        networkdict = dict()
        networkdict['nodes'] = [
            {
                'name': nodeMapKey,
                'title': nodeMapKey,
                'group': nodeMap[nodeMapKey][1],
                'subgroup': nodeMap[nodeMapKey][2:],
                'URL': nodeMapKey,
                'description': nodeMapKey,
            }
            for nodeMapKey in indexList
        ]
        networkdict['links'] = links
        with open(networkFile, 'w') as f:
            json.dump(networkdict, f, sort_keys=True, indent=2)


def subcluster(labels, clusterNodeList0, nodeMap, simMatrix, thisDamping,
               sizeThreshold=SUBCLUSTER_THRESHOLD):
    """Recursively re-cluster every sufficiently large cluster.

    labels            cluster label of each node in clusterNodeList0
                      (same order); negative labels are skipped.
    clusterNodeList0  node identifiers that were clustered together.
    nodeMap           identifier -> [matrixIndex, label, label, ...];
                      updated in place, one appended label per level.
    simMatrix         the full similarity matrix, indexed by matrixIndex.
    thisDamping       damping factor for AffinityPropagation.
    sizeThreshold     a cluster is re-clustered only when it holds more
                      nodes than this.

    Returns nodeMap (the same object that was passed in).
    """
    # Group the node identifiers by label in a single pass.
    groups = dict()
    for nodeMapKey, label in zip(clusterNodeList0, labels):
        if label >= 0:
            groups.setdefault(int(label), list()).append(nodeMapKey)

    # go over all the subgroups, in ascending label order
    for label in sorted(groups):
        clusterNodeList = groups[label]
        if len(clusterNodeList) <= sizeThreshold:
            continue

        # Pick this cluster's rows and columns out of the full matrix.
        indices = [nodeMap[nodeMapKey][0] for nodeMapKey in clusterNodeList]
        simMatrix_i = np.asarray(simMatrix[np.ix_(indices, indices)],
                                 dtype=float)

        labels_i, n_clusters_ = _fit_labels(simMatrix_i, thisDamping)
        print('Estimated number of clusters: %d' % n_clusters_)

        # Recurse only when the cluster actually split; every resulting
        # sub-cluster is then strictly smaller, so the recursion ends.
        if n_clusters_ > 1:
            for nodeMapKey, label_i in zip(clusterNodeList, labels_i):
                nodeMap[nodeMapKey].append(int(label_i))
            subcluster(labels_i, clusterNodeList, nodeMap, simMatrix,
                       thisDamping, sizeThreshold)

    return nodeMap


if __name__ == "__main__":
    main(sys.argv[1:])