12 changes: 11 additions & 1 deletion README.md
@@ -10,13 +10,20 @@ This code runs with Python 2.

The following Python packages are required to install HARP.

[magicgraph](https://github.com/phanein/magic-graph) is a library for processing graph data.
[Magicgraph](https://github.com/phanein/magic-graph) is a library for processing graph data.
To install, run the following commands:

git clone https://github.com/phanein/magic-graph.git
cd magic-graph
python setup.py install

[DeepWalk](https://github.com/eXascaleInfolab/deepwalk) is an embedding learning library for graphs.
To install, run the following commands:

git clone https://github.com/eXascaleInfolab/deepwalk.git
cd deepwalk
python setup.py install

Then, install HARP and the other requirements:

git clone https://github.com/GTmac/HARP.git
@@ -68,6 +75,9 @@ Note that ``node2vec`` uses the default parameters, which are p=1.0 and q=1.0.
Path to the binary file of SFDP, which is the module we used for graph coarsening.
You can set it to ``sfdp_linux``, ``sfdp_osx`` or ``sfdp_windows.exe`` depending on your operating system.

**--workers** *procs_num=cpu_num*
The number of parallel worker processes; defaults to the number of logical CPUs.

**More options:** The full list of command line options is available with ``python src/harp.py --help``.
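
For illustration, here is a hypothetical end-to-end invocation combining the options above. Only ``--sfdp-path`` and ``--workers`` are documented in this section; the ``--input``, ``--model`` and ``--output`` flags and their values are placeholders, so check ``python src/harp.py --help`` for the exact names:

    python src/harp.py --input example_graphs/example.mtx --model deepwalk --sfdp-path sfdp_linux --workers 8 --output example.embeddings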

# Evaluation
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,9 +1,9 @@
gensim==0.13.1
scipy==0.19.1
gensim>=1.0.0
scipy>=0.19.1
wheel>=0.23.0
Cython>=0.20.2
argparse>=1.2.1
futures>=2.1.6
six>=1.7.3
psutil>=2.1.1
deepwalk<=1.0.2
#deepwalk>=1.0.3 # Note: the original DeepWalk 1.0.3 does not support numeric walk items; https://github.com/eXascaleInfolab/deepwalk provides this feature
10 changes: 7 additions & 3 deletions src/baseline.py
@@ -1,7 +1,10 @@
from gensim.models import Word2Vec
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import graph_coarsening
import numpy as np

_WORKERS = 1 + int(cpu_count() / 2) # At least half of the available logical cores

def skipgram_baseline(graph, **kwargs):
scale = kwargs.get('scale', -1)
representation_size = kwargs.get('representation_size', 128)
@@ -14,12 +17,13 @@ def skipgram_baseline(graph, **kwargs):
output = kwargs.get('output', 'default')
edges = graph_coarsening.build_deepwalk_corpus(graph, num_paths, path_length, output)

workers=kwargs.get('workers', _WORKERS)
if kwargs['hs'] == 0:
print ('Training the Negative Sampling Model...')
model = Word2Vec(edges, size=representation_size, window=kwargs['window_size'], min_count=0, sg=1, hs=0, iter=kwargs['iter_count'], negative=5, workers=20)
model = Word2Vec(edges, size=representation_size, window=kwargs['window_size'], min_count=0, sg=1, hs=0, iter=kwargs['iter_count'], negative=5, workers=workers)
else:
print ('Training the Hierarchical Softmax Model...')
model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sg=1, hs=1, iter=kwargs['iter_count'], workers=20)
model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sg=1, hs=1, iter=kwargs['iter_count'], workers=workers)

print ('Finished training the Skip-gram model.')
return model
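
For orientation, a minimal sketch of driving the revised ``skipgram_baseline`` with the new ``workers`` keyword. The keyword names mirror those visible in this diff; ``magicgraph.load_edgelist`` and the file path are assumptions, not part of this change:

    import magicgraph
    import baseline

    # Hypothetical input; any magicgraph graph object should work here.
    graph = magicgraph.load_edgelist('example.edgelist', undirected=True)

    model = baseline.skipgram_baseline(
        graph,
        scale=-1,               # -1: train on a DeepWalk-style random-walk corpus
        num_paths=40,
        path_length=10,
        representation_size=128,
        window_size=10,
        iter_count=1,
        hs=0,                   # 0: the negative-sampling branch above
        workers=4,              # overrides the _WORKERS default
        output='example',
    )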
113 changes: 66 additions & 47 deletions src/graph_coarsening.py
@@ -1,3 +1,4 @@
from __future__ import print_function
import copy
import glob
import logging
@@ -13,16 +14,20 @@
import baseline
import utils
import numpy as np
import six

from collections import defaultdict, deque
from concurrent.futures import ProcessPoolExecutor
from deepwalk import walks as serialized_walks
from gensim.models import Word2Vec
from gensim.models.word2vec import Word2Vec
from magicgraph import WeightedDiGraph, WeightedNode
from scipy.io import mmread, mmwrite
from multiprocessing import cpu_count

_WORKERS = 1 + int(cpu_count() / 2) # At least half of the available logical cores

class DoubleWeightedDiGraph(WeightedDiGraph):
def __init__(self, init_graph = None):
def __init__(self, init_graph=None):
super(WeightedDiGraph, self).__init__(node_class=WeightedNode)
self.weighted_nodes = magicgraph.WeightedNode()
if init_graph is not None:
@@ -216,7 +221,7 @@ def external_ec_coarsening(graph, sfdp_path, coarsening_scheme=2):
input_fname = os.path.join(temp_dir, temp_fname)
mmwrite(open(os.path.join(input_fname), 'wb'), magicgraph.to_adjacency_matrix(graph))
sfdp_abs_path = os.path.abspath(sfdp_path)
subprocess.call('%s -g%d -v -u -Tc %s 2>x' % (sfdp_abs_path, coarsening_scheme, input_fname), shell=True, cwd=temp_dir)
subprocess.call('{:s} -g{:d} -v -u -Tc {:s} 2>x'.format(sfdp_abs_path, coarsening_scheme, input_fname), shell=True, cwd=temp_dir)
recursive_graphs, recursive_merged_nodes = [], read_coarsening_info(temp_dir)
subprocess.call(['rm', '-r', temp_dir])
cur_graph = graph
@@ -229,15 +234,15 @@ def external_ec_coarsening(graph, sfdp_path, coarsening_scheme=2):

for level in range(levels):
if iter_round == 1:
print ('Original graph with %d nodes and %d edges' % \
(cur_graph.number_of_nodes(), cur_graph.number_of_edges() ) )
print ('Original graph with {:d} nodes and {:d} edges'.format(
cur_graph.number_of_nodes(), cur_graph.number_of_edges() ) )
recursive_graphs.append(DoubleWeightedDiGraph(cur_graph))

coarsened_graph = external_collapsing(cur_graph, recursive_merged_nodes[level])
cur_node_count = coarsened_graph.number_of_nodes()
print ('Coarsening Round %d:' % iter_round)
print ('Generate coarsened graph with %d nodes and %d edges' % \
(coarsened_graph.number_of_nodes(), coarsened_graph.number_of_edges()) )
print ('Coarsening Round {:d}:'.format(iter_round))
print ('Generate coarsened graph with {:d} nodes and {:d} edges'.format(
coarsened_graph.number_of_nodes(), coarsened_graph.number_of_edges()) )

recursive_graphs.append(coarsened_graph)
cur_graph = coarsened_graph
@@ -246,7 +251,7 @@ def external_ec_coarsening(graph, sfdp_path, coarsening_scheme=2):

return recursive_graphs, recursive_merged_nodes

def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_merged_nodes=None, **kwargs):
def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_merged_nodes=None, workers=_WORKERS, output='tmpbuf', **kwargs):
print (kwargs)
if graph.is_connected():
print ('Connected graph.')
@@ -272,10 +277,11 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer

for subgraph, reversed_mapping in zip(subgraphs, reversed_mappings):
count += 1
print ('Subgraph %d with %d nodes and %d edges' % (count, subgraph.number_of_nodes(), subgraph.number_of_edges()))
print ('Subgraph {:d} with {:d} nodes and {:d} edges'.format(count, subgraph.number_of_nodes(), subgraph.number_of_edges()))

if not subgraph.is_connected():
gc_single_model = baseline.skipgram_baseline(subgraph,
workers=workers, output=output,
scale=scale,
num_paths=num_paths,
path_length=path_length,
@@ -296,6 +302,7 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer
iter_counts = [iter_count for _ in range(len(recursive_graphs))]
if hs == 1:
gc_model = skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes,
workers=workers, output=output,
scale=scale,
iter=iter_counts,
num_paths=num_paths,
@@ -313,6 +320,7 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer
else:
print ('Training negative sampling model...')
gc_model = skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes,
workers=workers, output=output,
scale=scale,
iter=iter_counts,
num_paths=num_paths,
@@ -328,8 +336,8 @@ def skipgram_coarsening_disconnected(graph, recursive_graphs=None, recursive_mer
sg=1,
hs=0)

for ind, vec in enumerate(gc_model[-1].syn0):
real_ind = reversed_mapping[int(gc_model[-1].index2word[ind])]
for ind, vec in enumerate(gc_model[-1].wv.syn0):
real_ind = reversed_mapping[int(gc_model[-1].wv.index2word[ind])]
embeddings[real_ind] = vec
return embeddings

@@ -342,7 +350,7 @@ def gen_alpha(init_alpha, recursive_graphs, iter_counts):
alpha_list.append(init_alpha * 1. * cur_iter_count / total_iter_count)
return alpha_list
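
A worked example of the schedule above, assuming ``total_iter_count`` is the sum of the per-level epoch counts (the surrounding lines are elided in this diff, so that reading is an assumption):

    # gen_alpha splits init_alpha across coarsening levels in
    # proportion to each level's epoch count.
    init_alpha, iter_counts = 0.025, [1, 1, 2]
    total_iter_count = sum(iter_counts)   # 4
    alpha_list = [init_alpha * 1. * c / total_iter_count for c in iter_counts]
    # -> [0.00625, 0.00625, 0.0125]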

def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, **kwargs):
def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, workers=_WORKERS, **kwargs):
print (kwargs)
print ('Start building Skip-gram + Hierarchical Softmax model on the coarsened graphs...')
models = []
@@ -374,15 +382,15 @@ def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, **kwargs):
walks = kwargs['walks']

for level in range(levels - 1, -1, -1):
print ('Training on graph level %d...' % level)
print ('Training on graph level {:d}...'.format(level))
if scale == 1:
edges, weights = recursive_graphs[level].get_edges()
random.shuffle(edges)
elif scale == -1:
path_length = kwargs.get('path_length', 10)
num_paths = kwargs.get('num_paths', 40)
output = kwargs.get('output', 'default')
edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output)
edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output, workers=workers)

# the coarsest level
if level == levels - 1:
Expand All @@ -391,14 +399,14 @@ def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, **kwargs):
model = skipgram.Word2Vec_hs_loss(None, sg=kwargs['sg'], size=kwargs['representation_size'], iter=kwargs['iter'][level], window=kwargs['window_size'], sample=sample, alpha=alpha_list[level], min_alpha=min_alpha_list[level])

# copy vocab / index2word from the coarser graph
model.vocab = copy.deepcopy(models[-1].vocab)
model.index2word = copy.deepcopy(models[-1].index2word)
model.syn0 = copy.deepcopy(models[-1].syn0)
model.syn0.resize(recursive_graphs[level].number_of_nodes(), kwargs['representation_size'])
model.wv.vocab = copy.deepcopy(models[-1].wv.vocab)
model.wv.index2word = copy.deepcopy(models[-1].wv.index2word)
model.wv.syn0 = copy.deepcopy(models[-1].wv.syn0)
model.wv.syn0.resize(recursive_graphs[level].number_of_nodes(), kwargs['representation_size'])
model.syn0norm = None
model.corpus_count = len(edges)

cur_merged_nodes = [(node, merged_node) for node, merged_node in recursive_merged_nodes[level].iteritems() if node != merged_node]
cur_merged_nodes = [(node, merged_node) for node, merged_node in six.iteritems(recursive_merged_nodes[level]) if node != merged_node]
cur_merged_nodes = sorted(cur_merged_nodes, key=operator.itemgetter(1))

changed_merged_nodes = []
@@ -414,29 +422,29 @@ def skipgram_coarsening_hs(recursive_graphs, recursive_merged_nodes, **kwargs):
node_pool = [node, merged_node]
prev_node = node

cur_index = len(models[-1].vocab)
cur_index = len(models[-1].wv.vocab)
for node, merged_node in changed_merged_nodes:
if node == merged_node:
continue
str_node, str_merged_node = str(node), str(merged_node)
word_index = model.vocab[str_merged_node].index
init_vec = model.syn0[word_index]
word_index = model.wv.vocab[str_merged_node].index
init_vec = model.wv.syn0[word_index]
model.add_word(str_node, str_merged_node, init_vec, cur_index)
cur_index += 1
model.add_word(str_merged_node, str_merged_node, init_vec, cur_index)

model.syn1 = np.zeros((len(model.vocab), model.layer1_size), dtype=np.float32)
model.syn1 = np.zeros((len(model.wv.vocab), model.layer1_size), dtype=np.float32)
for i in range(len(models[-1].syn1)):
model.syn1[i] = models[-1].syn1[i]
model.syn0_lockf = np.ones(len(model.vocab), dtype=np.float32)
model.train(edges)
model.syn0_lockf = np.ones(len(model.wv.vocab), dtype=np.float32)
model.train(edges, total_examples=model.corpus_count, epochs=model.iter)

models.append(model)

print ('Finished building the Skip-gram model on the coarsened graphs.')
return models
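
The ``model.wv.*`` accesses and the ``train(..., total_examples=..., epochs=...)`` call above follow the gensim >= 1.0 API this PR migrates to: in 0.13.x the vocabulary and vectors lived directly on the model, and ``train(corpus)`` needed no size arguments. A standalone sketch of the new contract, on a toy corpus rather than HARP data:

    from gensim.models.word2vec import Word2Vec

    sentences = [['0', '1', '2'], ['2', '1', '3']]  # toy walks
    # gensim 1.x-3.x argument names, matching this PR
    model = Word2Vec(sentences, size=8, min_count=0, sg=1, hs=1, iter=5)

    # gensim >= 1.0: vocab / index2word / syn0 live on model.wv
    print(len(model.wv.vocab), model.wv.syn0.shape)

    # continued training requires explicit corpus size and epoch count
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)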

def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, **kwargs):
def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, workers=_WORKERS, **kwargs):
# print (kwargs)
print ('Start building Skip-gram + Negative Sampling model on the coarsened graphs...')
models = []
Expand All @@ -461,58 +469,61 @@ def skipgram_coarsening_neg(recursive_graphs, recursive_merged_nodes, **kwargs):
sample = kwargs.get('sample', 1e-3)

for level in range(levels - 1, -1, -1):
print ('Training on graph level %d...' % level)
print ('Training on graph level {:d}...'.format(level))
# DeepWalk
if scale == -1:
path_length = kwargs.get('path_length', 10)
num_paths = kwargs.get('num_paths', 40)
output = kwargs.get('output', 'default')
edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output)
edges = build_deepwalk_corpus(recursive_graphs[level], num_paths, path_length, output, workers=workers)
# use adjacency matrix
elif scale == 1:
edges, weights = recursive_graphs[level].get_edges()
random.shuffle(edges)

# the coarsest level
if level == levels - 1:
model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=20)
model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=workers)
else:
model = Word2Vec(None, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=20)
model = Word2Vec(None, size=kwargs['representation_size'], window=kwargs['window_size'], min_count=0, sample=sample, sg=1, hs=0, iter=kwargs['iter'][level], workers=workers)
model.build_vocab(edges)
model.reset_weights()
#model.reset_weights() # Not present in the latest gensim, and there is no point resetting the weights of an untrained model

# init model weights from the previous (coarser) level's model
prev_syn0 = {models[-1].index2word[ind]: vec for ind, vec in enumerate(models[-1].syn0)}
prev_syn1neg = {models[-1].index2word[ind]: vec for ind, vec in enumerate(models[-1].syn1neg)}
word2index = {model.index2word[ind]: ind for ind in range(recursive_graphs[level].number_of_nodes())}
prev_syn0 = {models[-1].wv.index2word[ind]: vec for ind, vec in enumerate(models[-1].wv.syn0)}
prev_syn1neg = {models[-1].wv.index2word[ind]: vec for ind, vec in enumerate(models[-1].syn1neg)}
word2index = {model.wv.index2word[ind]: ind for ind in range(recursive_graphs[level].number_of_nodes())}
for ind in range(recursive_graphs[level].number_of_nodes()):
word = model.index2word[ind]
word = model.wv.index2word[ind]
if word in prev_syn0:
model.syn0[ind] = prev_syn0[word]
model.wv.syn0[ind] = prev_syn0[word]
model.syn1neg[ind] = prev_syn1neg[word]
else:
# if a is merged into b, then a should have the same word2vec weights as b
if int(word) in recursive_merged_nodes[level]:
word_ind = word2index[word]
merged_word = str(recursive_merged_nodes[level][int(word)])
model.syn0[word_ind] = prev_syn0[merged_word]
model.wv.syn0[word_ind] = prev_syn0[merged_word]
model.syn1neg[word_ind] = prev_syn1neg[merged_word]
model.syn0_lockf = np.ones(len(model.vocab), dtype=np.float32)
model.syn0_lockf = np.ones(len(model.wv.vocab), dtype=np.float32)

model.train(edges)
model.train(edges, total_examples=model.corpus_count, epochs=model.iter)

models.append(model)

print ('Finished building the Skip-gram model on the coarsened graphs.')
return models
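
The initialization loop above is HARP's prolongation step: every node at the finer level inherits the vector of the node it was merged into at the coarser level. A toy sketch of just that copy, with hypothetical data and plain numpy (no gensim):

    import numpy as np

    # Coarser level: node 1 was merged into node 0; node 2 survived as itself.
    prev_syn0 = {'0': np.ones(4), '2': np.zeros(4)}
    merged = {1: 0, 2: 2}

    fine_nodes = ['0', '1', '2']
    syn0 = np.empty((len(fine_nodes), 4), dtype=np.float32)
    for ind, word in enumerate(fine_nodes):
        if word in prev_syn0:          # node already existed one level down
            syn0[ind] = prev_syn0[word]
        elif int(word) in merged:      # merged-away node: inherit its target's vector
            syn0[ind] = prev_syn0[str(merged[int(word)])]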

class combine_files_iter:
ctr = dict()
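# Class-level refcount of live iterators per walk-file tuple: __del__ below
# deletes the files only when the last iterator over them is released.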

def __init__(self, file_list, length, path_length):
self.file_list = file_list
self.file_list = tuple(file_list)
self.file_list_iter = iter(file_list)
self.fp_iter = open(next(self.file_list_iter))
self.length = length
self.path_length = path_length
combine_files_iter.ctr[self.file_list] = 1 + combine_files_iter.ctr.get(self.file_list, 0)

def __len__(self):
return self.length
@@ -523,21 +534,29 @@ def __iter__(self):
yield line.split()
# return self

def next(self):
def __next__(self):
try:
result = next(self.fp_iter).split()
except:
except StopIteration:
try:
self.fp_iter.close()
self.fp_iter = open(next(self.file_list_iter))
result = next(self.fp_iter).split()
except:
raise StopIteration
except StopIteration:
raise
return result

def build_deepwalk_corpus(G, num_paths, path_length, output, alpha=0):
def __del__(self):
# Remove the processed files
combine_files_iter.ctr[self.file_list] -= 1
if not combine_files_iter.ctr[self.file_list]:
for fname in self.file_list:
os.remove(fname)


def build_deepwalk_corpus(G, num_paths, path_length, output, alpha=0, workers=_WORKERS):
walks_filebase = output + '.walks'
walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=num_paths,
path_length=path_length, alpha=alpha, rand=random.Random(random.randint(0, 2**31)),
num_workers=20)
num_workers=workers)
return combine_files_iter(walk_files, G.number_of_nodes() * num_paths, path_length)
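
To close the loop, a minimal sketch of consuming this corpus, using only names defined in this file; the graph ``G`` and the parameter values are placeholders. Note the cleanup added above: once the last ``combine_files_iter`` over a given set of walk files is garbage-collected, its ``__del__`` removes those files:

    # Stream the on-disk walks straight into Word2Vec.
    corpus = build_deepwalk_corpus(G, num_paths=40, path_length=10,
                                   output='example', workers=4)
    model = Word2Vec(corpus, size=128, window=10, min_count=0,
                     sg=1, hs=0, iter=5, workers=4)
    # The temporary .walks files are removed once `corpus` (the last
    # iterator over them) is garbage-collected.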