From 93e597dba2788f01931ba3f164aec56380aceee1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 4 Aug 2019 17:17:19 +0000 Subject: [PATCH 1/9] Ported to the latest gensim, output extended with .mat format --- requirements.txt | 4 ++-- src/graph_coarsening.py | 42 ++++++++++++++++++++--------------------- src/harp.py | 8 ++++++-- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5b8b4e4..a3ab3a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -gensim==0.13.1 -scipy==0.19.1 +gensim>=3.1.0 +scipy>=0.19.1 wheel>=0.23.0 Cython>=0.20.2 argparse>=1.2.1 diff --git a/src/graph_coarsening.py b/src/graph_coarsening.py index 139f7d4..62cad1c 100644 --- a/src/graph_coarsening.py +++ b/src/graph_coarsening.py @@ -328,8 +328,8 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer sg=1, hs=0) - for ind, vec in enumerate(gc_model[-1].syn0): - real_ind = reversed_mapping[int(gc_model[-1].index2word[ind])] + for ind, vec in enumerate(gc_model[-1].wv.syn0): + real_ind = reversed_mapping[int(gc_model[-1].wv.index2word[ind])] embeddings[real_ind] = vec return embeddings @@ -391,10 +391,10 @@ def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, **kwargs): model = skipgram.Word2Vec_hs_loss(None, sg=kwargs['sg'], size=kwargs['representation_size'], iter=kwargs['iter'][level], window=kwargs['window_size'], sample=sample, alpha=alpha_list[level], min_alpha=min_alpha_list[level]) # copy vocab / index2word from the coarser graph - model.vocab = copy.deepcopy(models[-1].vocab) - model.index2word = copy.deepcopy(models[-1].index2word) - model.syn0 = copy.deepcopy(models[-1].syn0) - model.syn0.resize(recursive_graphs[level].number_of_nodes(), kwargs['representation_size']) + model.wv.vocab = copy.deepcopy(models[-1].wv.vocab) + model.wv.index2word = copy.deepcopy(models[-1].wv.index2word) + model.wv.syn0 = copy.deepcopy(models[-1].wv.syn0) + model.wv.syn0.resize(recursive_graphs[level].number_of_nodes(), kwargs['representation_size']) model.syn0norm = None model.corpus_count = len(edges) @@ -414,22 +414,22 @@ def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, **kwargs): node_pool = [node, merged_node] prev_node = node - cur_index = len(models[-1].vocab) + cur_index = len(models[-1].wv.vocab) for node, merged_node in changed_merged_nodes: if node == merged_node: continue str_node, str_merged_node = str(node), str(merged_node) - word_index = model.vocab[str_merged_node].index - init_vec = model.syn0[word_index] + word_index = model.wv.vocab[str_merged_node].index + init_vec = model.wv.syn0[word_index] model.add_word(str_node, str_merged_node, init_vec, cur_index) cur_index += 1 model.add_word(str_merged_node, str_merged_node, init_vec, cur_index) - model.syn1 = np.zeros((len(model.vocab), model.layer1_size), dtype=np.float32) + model.syn1 = np.zeros((len(model.wv.vocab), model.layer1_size), dtype=np.float32) for i in range(len(models[-1].syn1)): model.syn1[i] = models[-1].syn1[i] - model.syn0_lockf = np.ones(len(model.vocab), dtype=np.float32) - model.train(edges) + model.syn0_lockf = np.ones(len(model.wv.vocab), dtype=np.float32) + model.train(edges, total_examples=model.corpus_count, epochs=model.iter) models.append(model) @@ -479,27 +479,27 @@ def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, **kwargs): else: model = Word2Vec(None, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], 
workers=20) model.build_vocab(edges) - model.reset_weights() + #model.reset_weights() # Not present in the latest gensim, and there is no point resetting the weights of an untrained model # init model weights with the previous one - prev_syn0 = {models[-1].index2word[ind]: vec for ind, vec in enumerate(models[-1].syn0)} - prev_syn1neg = {models[-1].index2word[ind]: vec for ind, vec in enumerate(models[-1].syn1neg)} - word2index = {model.index2word[ind]: ind for ind in range(recursive_graphs[level].number_of_nodes())} + prev_syn0 = {models[-1].wv.index2word[ind]: vec for ind, vec in enumerate(models[-1].wv.syn0)} + prev_syn1neg = {models[-1].wv.index2word[ind]: vec for ind, vec in enumerate(models[-1].syn1neg)} + word2index = {model.wv.index2word[ind]: ind for ind in range(recursive_graphs[level].number_of_nodes())} for ind in range(recursive_graphs[level].number_of_nodes()): - word = model.index2word[ind] + word = model.wv.index2word[ind] if word in prev_syn0: - model.syn0[ind] = prev_syn0[word] + model.wv.syn0[ind] = prev_syn0[word] model.syn1neg[ind] = prev_syn1neg[word] else: # if a is merged into b, then a should have identical weights in word2vec as b if int(word) in recursive_merged_nodes[level]: word_ind = word2index[word] merged_word = str(recursive_merged_nodes[level][int(word)]) - model.syn0[word_ind] = prev_syn0[merged_word] + model.wv.syn0[word_ind] = prev_syn0[merged_word] model.syn1neg[word_ind] = prev_syn1neg[merged_word] - model.syn0_lockf = np.ones(len(model.vocab), dtype=np.float32) + model.syn0_lockf = np.ones(len(model.wv.vocab), dtype=np.float32) - model.train(edges) + model.train(edges, total_examples=model.corpus_count, epochs=model.iter) models.append(model) diff --git a/src/harp.py b/src/harp.py index 2949205..5f23620 100644 --- a/src/harp.py +++ b/src/harp.py @@ -7,7 +7,7 @@ from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter from magicgraph import WeightedDiGraph, WeightedNode -from scipy.io import mmread, mmwrite, loadmat +from scipy.io import mmread, mmwrite, loadmat, savemat import graph_coarsening @@ -71,7 +71,11 @@ def main(): sfdp_path=args.sfdp_path, representation_size=64,window_size=1, lr_scheme='default',alpha=0.025,min_alpha=0.001,sg=1,hs=0,sample=0.001) - np.save(args.output, embeddings) + + if args.output.endswith('.mat'): + savemat(args.output, mdict={'embs': embeddings}) + else: + np.save(args.output, embeddings) if __name__ == '__main__': sys.exit(main()) From c693362c75a65653d15a854e67b2d4ddb2e8b0b0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 4 Aug 2019 17:32:33 +0000 Subject: [PATCH 2/9] Representation size fixed for the 'line' model --- src/harp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harp.py b/src/harp.py index 5f23620..b05cdf1 100644 --- a/src/harp.py +++ b/src/harp.py @@ -69,7 +69,7 @@ def main(): elif args.model == 'line': embeddings = graph_coarsening.skipgram_coarsening_disconnected(G,scale=1, iter_count=50, sfdp_path=args.sfdp_path, - representation_size=64,window_size=1, + representation_size=args.representation_size,window_size=1, lr_scheme='default',alpha=0.025,min_alpha=0.001,sg=1,hs=0,sample=0.001) if args.output.endswith('.mat'): savemat(args.output, mdict={'embs': embeddings}) else: np.save(args.output, embeddings) From acc6f870b374d7bfb463c71063d27ad0a029ae62 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 5 Aug 2019 06:48:46 +0000 Subject: [PATCH 3/9] Ported to the latest deepwalk, workers parameter now respected --- requirements.txt | 4 ++-- src/baseline.py | 10 ++++++--- src/graph_coarsening.py | 35 ++++++++++++++++------------- src/harp.py | 3 ++-
src/scoring.py | 2 +- src/skipgram.py | 49 +++++++++++++++++++++++++---------------- 6 files changed, 62 insertions(+), 41 deletions(-) diff --git a/requirements.txt b/requirements.txt index a3ab3a3..3a13de0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -gensim>=3.1.0 +gensim>=1.0.0 scipy>=0.19.1 wheel>=0.23.0 Cython>=0.20.2 @@ -6,4 +6,4 @@ argparse>=1.2.1 futures>=2.1.6 six>=1.7.3 psutil>=2.1.1 -deepwalk<=1.0.2 +deepwalk>=1.0.3 diff --git a/src/baseline.py b/src/baseline.py index e945e6b..ed433e0 100644 --- a/src/baseline.py +++ b/src/baseline.py @@ -1,7 +1,10 @@ -from gensim.models import Word2Vec +from gensim.models.word2vec import Word2Vec +from multiprocessing import cpu_count import graph_coarsening import numpy as np +_WORKERS = 1 + int(cpu_count() / 2) # At least half of the available logical cores + def skipgram_baseline(graph, **kwargs): scale = kwargs.get('scale', -1) representation_size = kwargs.get('representation_size', 128) @@ -14,12 +17,13 @@ def skipgram_baseline(graph, **kwargs): output = kwargs.get('output', 'default') edges = graph_coarsening.build_deepwalk_corpus(graph, num_paths, path_length, output) + workers=kwargs.get('workers', _WORKERS) if kwargs['hs'] == 0: print ('Training the Negative Sampling Model...') - model = Word2Vec(edges, size=representation_size, window=kwargs['window_size'], min_count=0, sg=1, hs=0, iter=kwargs['iter_count'], negative=5, workers=20) + model = Word2Vec(edges, size=representation_size, window=kwargs['window_size'], min_count=0, sg=1, hs=0, iter=kwargs['iter_count'], negative=5, workers=workers) else: print ('Training the Hierarchical Softmax Model...') - model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sg=1, hs=1, iter=kwargs['iter_count'], workers=20) + model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sg=1, hs=1, iter=kwargs['iter_count'], workers=workers) print ('Finish training the Skip-gram model.') return model diff --git a/src/graph_coarsening.py b/src/graph_coarsening.py index 62cad1c..b5ac6ee 100644 --- a/src/graph_coarsening.py +++ b/src/graph_coarsening.py @@ -13,16 +13,20 @@ import baseline import utils import numpy as np +import six from collections import defaultdict, deque from concurrent.futures import ProcessPoolExecutor from deepwalk import walks as serialized_walks -from gensim.models import Word2Vec +from gensim.models.word2vec import Word2Vec from magicgraph import WeightedDiGraph, WeightedNode from scipy.io import mmread, mmwrite +from multiprocessing import cpu_count + +_WORKERS = 1 + int(cpu_count() / 2) # At least half of the available logical cores class DoubleWeightedDiGraph(WeightedDiGraph): - def __init__(self, init_graph = None): + def __init__(self, init_graph=None): super(WeightedDiGraph, self).__init__(node_class=WeightedNode) self.weighted_nodes = magicgraph.WeightedNode() if init_graph is not None: @@ -246,7 +250,7 @@ def external_ec_coarsening(graph, sfdp_path, coarsening_scheme=2): return recursive_graphs, recursive_merged_nodes -def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_merged_nodes=None, **kwargs): +def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_merged_nodes=None, workers=_WORKERS, **kwargs): print (kwargs) if graph.is_connected(): print ('Connected graph.') @@ -276,6 +280,7 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer if not subgraph.is_connected(): gc_single_model = 
baseline.skipgram_baseline(subgraph, + workers=workers, scale=scale, num_paths=num_paths, path_length=path_length, @@ -295,7 +300,7 @@ recursive_graphs, recursive_merged_nodes = external_ec_coarsening(subgraph, sfdp_path) iter_counts = [iter_count for _ in range(len(recursive_graphs))] if hs == 1: - gc_model = skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, + gc_model = skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, workers=workers, scale=scale, iter=iter_counts, num_paths=num_paths, @@ -312,7 +317,7 @@ sample=sample) else: print ('Training negative sampling model...') - gc_model = skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, + gc_model = skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, workers=workers, scale=scale, iter=iter_counts, num_paths=num_paths, @@ -342,7 +347,7 @@ alpha_list.append(init_alpha * 1. * cur_iter_count / total_iter_count) return alpha_list -def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, **kwargs): +def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, workers=_WORKERS, **kwargs): print (kwargs) print ('Start building Skip-gram + Hierarchical Softmax model on the coarsened graphs...') models = [] @@ -382,7 +387,7 @@ path_length = kwargs.get('path_length', 10) num_paths = kwargs.get('num_paths', 40) output = kwargs.get('output', 'default') - edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output) + edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output, workers=workers) # the coarsest level if level == levels - 1: @@ -398,7 +403,7 @@ model.syn0norm = None model.corpus_count = len(edges) - cur_merged_nodes = [(node, merged_node) for node, merged_node in recursive_merged_nodes[level].iteritems() if node != merged_node] + cur_merged_nodes = [(node, merged_node) for node, merged_node in six.iteritems(recursive_merged_nodes[level]) if node != merged_node] cur_merged_nodes = sorted(cur_merged_nodes, key=operator.itemgetter(1)) changed_merged_nodes = [] @@ -436,7 +441,7 @@ print ('Finish building Skip-gram model on the coarsened graphs.') return models -def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, **kwargs): +def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, workers=_WORKERS, **kwargs): # print (kwargs) print ('Start building Skip-gram + Negative Sampling model on the coarsened graphs...') models = [] @@ -467,7 +472,7 @@ path_length = kwargs.get('path_length', 10) num_paths = kwargs.get('num_paths', 40) output = kwargs.get('output', 'default') - edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output) + edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output, workers=workers) # use adjacency matrix elif scale == 1: edges, weights = recursive_graphs[level].get_edges() @@ -475,9 +480,9 @@ # the
coarsest level if level == levels - 1: - model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=20) + model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=workers) else: - model = Word2Vec(None, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=20) + model = Word2Vec(None, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=workers) model.build_vocab(edges) #model.reset_weights() # Not present in the latest gensim, and there is no point resetting the weights of an untrained model @@ -523,7 +528,7 @@ def __iter__(self): yield line.split() # return self - def next(self): + def __next__(self): try: result = next(self.fp_iter).split() except: @@ -535,9 +540,9 @@ raise StopIteration return result -def build_deepwalk_corpus(G, num_paths, path_length, output, alpha=0): +def build_deepwalk_corpus(G, num_paths, path_length, output, alpha=0, workers=_WORKERS): walks_filebase = output + '.walks' walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=num_paths, path_length=path_length, alpha=alpha, rand=random.Random(random.randint(0, 2**31)), - num_workers=20) + num_workers=workers) return combine_files_iter(walk_files, G.number_of_nodes() * num_paths, path_length) diff --git a/src/harp.py b/src/harp.py index b05cdf1..c4ee850 100644 --- a/src/harp.py +++ b/src/harp.py @@ -55,7 +55,8 @@ def main(): print ('Underlying network embedding model: {}'.format(args.model)) if args.model == 'deepwalk': - embeddings = graph_coarsening.skipgram_coarsening_disconnected(G,scale=-1,iter_count=1, + embeddings = graph_coarsening.skipgram_coarsening_disconnected(G,workers=args.workers, + scale=-1,iter_count=1, sfdp_path=args.sfdp_path, num_paths=args.number_walks,path_length=args.walk_length, representation_size=args.representation_size,window_size=args.window_size, diff --git a/src/scoring.py b/src/scoring.py index 928ab39..666c566 100644 --- a/src/scoring.py +++ b/src/scoring.py @@ -17,7 +17,7 @@ from sklearn.preprocessing import MultiLabelBinarizer from collections import defaultdict -from gensim.models import Word2Vec +from gensim.models.word2vec import Word2Vec class TopKRanker(OneVsRestClassifier): def predict(self, X, top_k_list): diff --git a/src/skipgram.py b/src/skipgram.py index 4714d18..c2a0a9b 100644 --- a/src/skipgram.py +++ b/src/skipgram.py @@ -23,13 +23,15 @@ ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.models import Word2Vec +from gensim.models.word2vec import Word2Vec from gensim.models.word2vec import Vocab from six import iteritems, itervalues, string_types from six.moves import xrange from types import GeneratorType +from multiprocessing import cpu_count import random +_WORKERS = 1 + int(cpu_count() / 2) # At least half of the available logical cores logger = logging.getLogger(__name__) try: @@ -51,36 +53,45 @@ def __init__(self, sentences=None, **kwargs): kwargs["min_count"] = 0 kwargs["negative"] = 0 kwargs["sample"] = kwargs.get("sample", 1e-3) - kwargs["workers"] = kwargs.get("workers", 20) - super(self.__class__,
self).__init__(sentences, **kwargs) + kwargs["workers"] = kwargs.get("workers", _WORKERS) + try: + super(self.__class__, self).__init__(sentences, **kwargs) + except TypeError as err: + stmp = sentences + logger.error('ERROR: {}, kwargs: {}, sentences ({}): {} ...'.format(err, kwargs, type(stmp), + None if stmp is None else next(stmp))) + raise # add a word as the child of current word in the coarser graph def add_word(self, word, parent_word, emb, cur_index): fake_vocab_size = int(1e7) - word_index = len(self.vocab) + word_index = len(self.wv.vocab) inner_node_index = word_index - 1 - parent_index = self.vocab[parent_word].index + parent_index = self.wv.vocab[parent_word].index # add in the left subtree if word != parent_word: - self.vocab[word] = Vocab(index=word_index, count=fake_vocab_size-word_index,sample_int=(2**32)) + self.wv.vocab[word] = Vocab(index=word_index, count=fake_vocab_size-word_index,sample_int=(2**32)) if emb is not None: - self.syn0[cur_index] = emb + self.wv.syn0[cur_index] = emb else: - self.syn0[cur_index] = self.syn0[parent_index] + self.wv.syn0[cur_index] = self.wv.syn0[parent_index] # the node in the coarsened graph serves as an inner node now - self.index2word.append(word) - self.vocab[word].code = array(list(self.vocab[parent_word].code) + [0], dtype=uint8) - self.vocab[word].point = array(list(self.vocab[parent_word].point) + [inner_node_index], dtype=uint32) + self.wv.index2word.append(word) + self.wv.vocab[word].code = array(list(self.wv.vocab[parent_word].code) + [0], dtype=uint8) + self.wv.vocab[word].point = array(list(self.wv.vocab[parent_word].point) + [inner_node_index], dtype=uint32) self.inner_node_index_map[parent_word] = inner_node_index else: if emb is not None: - self.syn0[parent_index] = emb - self.vocab[word].code = array(list(self.vocab[word].code) + [1], dtype=uint8) - self.vocab[word].point = array(list(self.vocab[word].point) + [self.inner_node_index_map[word]], dtype=uint32) + self.wv.syn0[parent_index] = emb + self.wv.vocab[word].code = array(list(self.wv.vocab[word].code) + [1], dtype=uint8) + self.wv.vocab[word].point = array(list(self.wv.vocab[word].point) + [self.inner_node_index_map[word]], dtype=uint32) def train(self, sentences, total_words=None, word_count=0, - total_examples=None, queue_factor=2, report_delay=0.1): + total_examples=None, queue_factor=2, report_delay=0.1, **kwargs): + # sentences=sentences, corpus_file=corpus_file, total_examples=self.corpus_count, + # total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, + # end_alpha=self.min_alpha, compute_loss=compute_loss """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) 
@@ -102,13 +113,13 @@ def train(self, sentences, total_words=None, word_count=0, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s", - self.workers, len(self.vocab), self.layer1_size, self.sg, + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) - if not self.vocab: + if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before training the model") - if not hasattr(self, 'syn0'): - raise RuntimeError("you must first finalize vocabulary before training the model") + #if not hasattr(self, 'wv.syn0'): + # raise RuntimeError("you must first finalize vocabulary before training the model") if total_words is None and total_examples is None: if self.corpus_count: From 591b7fb8e3f0432df451d26919673bdd713158d7 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 5 Aug 2019 08:21:05 +0000 Subject: [PATCH 4/9] Temporary walk files cleaned up on completion --- src/graph_coarsening.py | 29 +++++++++++++++++++++-------- src/harp.py | 12 +++++++++--- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/graph_coarsening.py b/src/graph_coarsening.py index b5ac6ee..38df12f 100644 --- a/src/graph_coarsening.py +++ b/src/graph_coarsening.py @@ -250,7 +250,7 @@ def external_ec_coarsening(graph, sfdp_path, coarsening_scheme=2): return recursive_graphs, recursive_merged_nodes -def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_merged_nodes=None, workers=_WORKERS, **kwargs): +def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_merged_nodes=None, workers=_WORKERS, output='tmpbuf', **kwargs): print (kwargs) if graph.is_connected(): print ('Connected graph.') @@ -280,7 +280,7 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer if not subgraph.is_connected(): gc_single_model = baseline.skipgram_baseline(subgraph, - workers=workers, + workers=workers, output=output, scale=scale, num_paths=num_paths, path_length=path_length, @@ -300,7 +300,8 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer recursive_graphs, recursive_merged_nodes = external_ec_coarsening(subgraph, sfdp_path) iter_counts = [iter_count for _ in range(len(recursive_graphs))] if hs == 1: - gc_model = skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, workers=workers, + gc_model = skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, + workers=workers, output=output, scale=scale, iter=iter_counts, num_paths=num_paths, @@ -317,7 +318,8 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer sample=sample) else: print ('Training negative sampling model...') - gc_model = skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, workers=workers, + gc_model = skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, + workers=workers, output=output, scale=scale, iter=iter_counts, num_paths=num_paths, @@ -512,12 +514,15 @@ def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, workers=_W return models class combine_files_iter: + ctr = dict() + def __init__(self, file_list, length, path_length): - self.file_list = file_list + self.file_list = tuple(file_list) self.file_list_iter = iter(file_list) self.fp_iter = open(next(self.file_list_iter)) self.length = length self.path_length = path_length + combine_files_iter.ctr[self.file_list] = 1 + combine_files_iter.ctr.get(self.file_list, 0) def __len__(self): return 
self.length @@ -531,15 +536,23 @@ def __iter__(self): def __next__(self): try: result = next(self.fp_iter).split() - except: + except StopIteration: try: self.fp_iter.close() self.fp_iter = open(next(self.file_list_iter)) result = next(self.fp_iter).split() - except: - raise StopIteration + except StopIteration: + raise return result + def __del__(self): + # Remove the processed files + combine_files_iter.ctr[self.file_list] -= 1 + if not combine_files_iter.ctr[self.file_list]: + for fname in self.file_list: + os.remove(fname) + + def build_deepwalk_corpus(G, num_paths, path_length, output, alpha=0, workers=_WORKERS): walks_filebase = output + '.walks' walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=num_paths, diff --git a/src/harp.py b/src/harp.py index c4ee850..6af87fa 100644 --- a/src/harp.py +++ b/src/harp.py @@ -54,21 +54,27 @@ def main(): print ('Number of edges: {}'.format(G.number_of_edges())) print ('Underlying network embedding model: {}'.format(args.model)) + outpbase = os.path.splitext(args.output)[0] if args.model == 'deepwalk': - embeddings = graph_coarsening.skipgram_coarsening_disconnected(G,workers=args.workers, + embeddings = graph_coarsening.skipgram_coarsening_disconnected(G, + workers=args.workers,output=outpbase, scale=-1,iter_count=1, sfdp_path=args.sfdp_path, num_paths=args.number_walks,path_length=args.walk_length, representation_size=args.representation_size,window_size=args.window_size, lr_scheme='default',alpha=0.025,min_alpha=0.001,sg=1,hs=1,coarsening_scheme=2, sample=0.1) elif args.model == 'node2vec': - embeddings = graph_coarsening.skipgram_coarsening_disconnected(G,scale=-1,iter_count=1, + embeddings = graph_coarsening.skipgram_coarsening_disconnected(G, + workers=args.workers,output=outpbase, + scale=-1,iter_count=1, sfdp_path=args.sfdp_path, num_paths=args.number_walks,path_length=args.walk_length, representation_size=args.representation_size,window_size=args.window_size, lr_scheme='default',alpha=0.025,min_alpha=0.001,sg=1,hs=0,coarsening_scheme=2, sample=0.1) elif args.model == 'line': - embeddings = graph_coarsening.skipgram_coarsening_disconnected(G,scale=1, iter_count=50, + embeddings = graph_coarsening.skipgram_coarsening_disconnected(G, + workers=args.workers,output=outpbase, + scale=1, iter_count=50, sfdp_path=args.sfdp_path, representation_size=args.representation_size,window_size=1, lr_scheme='default',alpha=0.025,min_alpha=0.001,sg=1,hs=0,sample=0.001) From 87d41739c653dedf4e3e64316342bcbc18b294f8 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 5 Aug 2019 08:35:02 +0000 Subject: [PATCH 5/9] Workers described in the README --- README.md | 3 +++ src/harp.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cd3a2a1..d6ab82d 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,9 @@ Note that ``node2vec`` uses the default parameters, which is p=1.0 and q=1.0. Path to the binary file of SFDP, which is the module we used for graph coarsening. You can set it to ``sfdp_linux``, ``sfdp_osx`` or ``sfdp_windows.exe`` depending on your operating system. +**--workers** *procs_num=cpu_num* +The number of parallel executors; equals the number of logical CPUs by default. + **More options:** The full list of command line options is available with ``python src/harp.py --help``.
# Evaluation diff --git a/src/harp.py index 6af87fa..247ab5d 100644 --- a/src/harp.py +++ b/src/harp.py @@ -8,6 +8,7 @@ from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter from magicgraph import WeightedDiGraph, WeightedNode from scipy.io import mmread, mmwrite, loadmat, savemat +from multiprocessing import cpu_count import graph_coarsening @@ -35,7 +36,7 @@ def main(): help='Length of the random walk started at each node.') parser.add_argument('--window-size', default=10, type=int, help='Window size of the Skip-gram model.') - parser.add_argument('--workers', default=1, type=int, + parser.add_argument('--workers', default=cpu_count(), type=int, help='Number of parallel processes.') args = parser.parse_args() From cb48e6827058b6ceb8a53f859d6a9e7e6e9cb291 Mon Sep 17 00:00:00 2001 From: Artem V L Date: Mon, 5 Aug 2019 13:26:53 +0200 Subject: [PATCH 6/9] DeepWalk installation for HARP described in the README --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d6ab82d..e670661 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,20 @@ This code runs with Python 2. The following Python packages are required to install HARP. -[magicgraph](https://github.com/phanein/magic-graph) is a library for processing graph data. +[Magicgraph](https://github.com/phanein/magic-graph) is a library for processing graph data. To install, run the following commands: git clone https://github.com/phanein/magic-graph.git cd magic-graph python setup.py install +[DeepWalk](https://github.com/eXascaleInfolab/deepwalk) is an embedding learning library for graphs. +To install, run the following commands: + + git clone https://github.com/eXascaleInfolab/deepwalk.git + cd deepwalk + python setup.py install + Then, install HARP and the other requirements: git clone https://github.com/GTmac/HARP.git From a82bf889ff9552084efab5b566701d039d06c42a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Aug 2019 14:12:19 +0000 Subject: [PATCH 7/9] Output extension check made case-insensitive --- src/harp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harp.py b/src/harp.py index 247ab5d..3560f44 100644 --- a/src/harp.py +++ b/src/harp.py @@ -80,7 +80,7 @@ def main(): representation_size=args.representation_size,window_size=1, lr_scheme='default',alpha=0.025,min_alpha=0.001,sg=1,hs=0,sample=0.001) - if args.output.endswith('.mat'): + if args.output.lower().endswith('.mat'): savemat(args.output, mdict={'embs': embeddings}) else: np.save(args.output, embeddings) From 5290386c9a0109ef75a33c3d58b08401ecc6c867 Mon Sep 17 00:00:00 2001 From: Artem V L Date: Fri, 9 Aug 2019 06:30:54 +0200 Subject: [PATCH 8/9] Redundant space removed --- src/baseline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/baseline.py b/src/baseline.py index ed433e0..5b445db 100644 --- a/src/baseline.py +++ b/src/baseline.py @@ -1,4 +1,4 @@ -from gensim.models.word2vec import Word2Vec +from gensim.models.word2vec import Word2Vec from multiprocessing import cpu_count import graph_coarsening import numpy as np From 71c490c6d52c32716c28c4a909582ca2f6f28d04 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Aug 2019 14:12:06 +0000 Subject: [PATCH 9/9] Default number of workers set to 1, string formatting updated to the Py3 style --- requirements.txt | 2 +- src/graph_coarsening.py | 19 ++++++++++--------- src/harp.py | 13 +++++++++---- src/scoring.py | 4 ++-- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/requirements.txt
b/requirements.txt index 3a13de0..21a9d27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ argparse>=1.2.1 futures>=2.1.6 six>=1.7.3 psutil>=2.1.1 -deepwalk>=1.0.3 +#deepwalk>=1.0.3 # Note: the original DeepWalk 1.0.3 does not support numeric walk items, https://github.com/eXascaleInfolab/deepwalk provides this feature diff --git a/src/graph_coarsening.py b/src/graph_coarsening.py index 38df12f..8f136a3 100644 --- a/src/graph_coarsening.py +++ b/src/graph_coarsening.py @@ -1,3 +1,4 @@ +from __future__ import print_function import copy import glob import logging @@ -220,7 +221,7 @@ def external_ec_coarsening(graph, sfdp_path, coarsening_scheme=2): input_fname = os.path.join(temp_dir, temp_fname) mmwrite(open(os.path.join(input_fname), 'wb'), magicgraph.to_adjacency_matrix(graph)) sfdp_abs_path = os.path.abspath(sfdp_path) - subprocess.call('%s -g%d -v -u -Tc %s 2>x' % (sfdp_abs_path, coarsening_scheme, input_fname), shell=True, cwd=temp_dir) + subprocess.call('{:s} -g{:d} -v -u -Tc {:s} 2>x'.format(sfdp_abs_path, coarsening_scheme, input_fname), shell=True, cwd=temp_dir) recursive_graphs, recursive_merged_nodes = [], read_coarsening_info(temp_dir) subprocess.call(['rm', '-r', temp_dir]) cur_graph = graph @@ -233,15 +234,15 @@ def external_ec_coarsening(graph, sfdp_path, coarsening_scheme=2): for level in range(levels): if iter_round == 1: - print ('Original graph with %d nodes and %d edges' % \ - (cur_graph.number_of_nodes(), cur_graph.number_of_edges() ) ) + print ('Original graph with {:d} nodes and {:d} edges'.format( + cur_graph.number_of_nodes(), cur_graph.number_of_edges() ) ) recursive_graphs.append(DoubleWeightedDiGraph(cur_graph)) coarsened_graph = external_collapsing(cur_graph, recursive_merged_nodes[level]) cur_node_count = coarsened_graph.number_of_nodes() - print ('Coarsening Round %d:' % iter_round) - print ('Generate coarsened graph with %d nodes and %d edges' % \ - (coarsened_graph.number_of_nodes(), coarsened_graph.number_of_edges()) ) + print ('Coarsening Round {:d}:'.format(iter_round)) + print ('Generate coarsened graph with {:d} nodes and {:d} edges'.format( + coarsened_graph.number_of_nodes(), coarsened_graph.number_of_edges()) ) recursive_graphs.append(coarsened_graph) cur_graph = coarsened_graph @@ -276,7 +277,7 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer for subgraph, reversed_mapping in zip(subgraphs, reversed_mappings): count += 1 - print ('Subgraph %d with %d nodes and %d edges' % (count, subgraph.number_of_nodes(), subgraph.number_of_edges())) + print ('Subgraph {:d} with {:d} nodes and {:d} edges'.format(count, subgraph.number_of_nodes(), subgraph.number_of_edges())) if not subgraph.is_connected(): gc_single_model = baseline.skipgram_baseline(subgraph, @@ -381,7 +382,7 @@ def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, workers=_WO walks = kwargs['walks'] for level in range(levels - 1, -1, -1): - print ('Training on graph level %d...' % level) + print ('Training on graph level {:d}...'.format(level)) if scale == 1: edges, weights = recursive_graphs[level].get_edges() random.shuffle(edges) @@ -468,7 +469,7 @@ def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, workers=_W sample = kwargs.get('sample', 1e-3) for level in range(levels - 1, -1, -1): - print ('Training on graph level %d...' 
% level) + print ('Training on graph level {:d}...'.format(level)) # DeepWalk if scale == -1: path_length = kwargs.get('path_length', 10) diff --git a/src/harp.py b/src/harp.py index 3560f44..9d881f1 100644 --- a/src/harp.py +++ b/src/harp.py @@ -36,8 +36,8 @@ def main(): help='Length of the random walk started at each node.') parser.add_argument('--window-size', default=10, type=int, help='Window size of the Skip-gram model.') - parser.add_argument('--workers', default=cpu_count(), type=int, - help='Number of parallel processes.') + parser.add_argument('--workers', default=1, type=int, + help='Number of parallel processes, -1 to use all available logical CPUs (hardware threads).') args = parser.parse_args() # Process args @@ -48,8 +48,13 @@ def main(): elif args.format == 'edgelist': G = magicgraph.load_edgelist(args.input, undirected=True) else: - raise Exception("Unknown file format: '%s'. Valid formats: 'mat', 'adjlist', and 'edgelist'." - % args.format) + raise ValueError("Unknown file format: '{}'. Valid formats: 'mat', 'adjlist', and 'edgelist'.".format(args.format)) + if args.workers <= 0 or args.workers > cpu_count(): + if args.workers == -1: + args.workers = cpu_count() + else: + raise ValueError("Invalid number of workers: {} / {} CPUs".format(args.workers, cpu_count())) + G = graph_coarsening.DoubleWeightedDiGraph(G) print ('Number of nodes: {}'.format(G.number_of_nodes())) print ('Number of edges: {}'.format(G.number_of_edges())) diff --git a/src/scoring.py b/src/scoring.py index 666c566..b8ee319 100644 --- a/src/scoring.py +++ b/src/scoring.py @@ -60,7 +60,7 @@ def scoring(embeddings, graph_file, training_percents, network_name="network", l all_results = defaultdict(list) for train_percent in training_percents: - print ('Training Percent: %.2f' % train_percent) + print ('Training Percent: {:.1%}'.format(train_percent)) for shuf in shuffles: X, y = shuf @@ -121,4 +121,4 @@ def scoring(embeddings, graph_file, training_percents, network_name="network", l all_results, micro_f1, macro_f1 = scoring(args.embeddings, args.input, args.training_percents, network_name=args.adj_matrix_name, labels_name=args.label_name) for item in sorted(macro_f1): - print ('Training ratio is %.1f%%, macro F1 is %.2f%%' % (item * 100, macro_f1[item] * 100)) + print ('Training ratio is {:.1%}, macro F1 is {:.2%}'.format(item, macro_f1[item]))
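
Usage sketch (an addition for illustration, not one of the patches above): with the .mat output introduced in patches 1 and 7, downstream code has to handle two on-disk formats. A minimal loader is sketched below, assuming the saved embeddings form a 2-D array; the 'embs' key matches the savemat() call in src/harp.py, while the file name and the helper name are hypothetical:

    import numpy as np
    from scipy.io import loadmat

    def load_embeddings(path):
        # Embeddings written by src/harp.py: MATLAB .mat or numpy .npy format.
        if path.lower().endswith('.mat'):
            # The key matches savemat(args.output, mdict={'embs': embeddings})
            return loadmat(path)['embs']
        # np.save() appends '.npy' when the output path has no extension
        return np.load(path)

    embs = load_embeddings('embeddings.mat')  # hypothetical output file
    print(embs.shape)  # (number_of_nodes, representation_size)

The case-insensitive extension check mirrors the one introduced in patch 7, so a file saved as 'EMBS.MAT' is routed to loadmat() as well.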