diff --git a/deepwalk/__main__.py b/deepwalk/__main__.py index c331aef..13c3a56 100644 --- a/deepwalk/__main__.py +++ b/deepwalk/__main__.py @@ -88,7 +88,7 @@ def process(args): size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) - model.save_word2vec_format(args.output) + model.save_word2vec_format(args.output, binary=args.binary) def main(): @@ -143,6 +143,9 @@ def main(): parser.add_argument('--workers', default=1, type=int, help='Number of parallel processes.') + parser.add_argument('--binary', action='store_true', + help='Save the resulting vectors in binary mode; default is False (off)') + args = parser.parse_args() numeric_level = getattr(logging, args.log.upper(), None) diff --git a/deepwalk/graph.py b/deepwalk/graph.py index ef1f2d7..29fefbb 100644 --- a/deepwalk/graph.py +++ b/deepwalk/graph.py @@ -18,6 +18,7 @@ from itertools import product,permutations from scipy.io import loadmat from scipy.sparse import issparse +from gensim.utils import smart_open from concurrent.futures import ProcessPoolExecutor @@ -33,7 +34,7 @@ LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" class Graph(defaultdict): - """Efficient basic implementation of nx `Graph' – Undirected graphs with self loops""" + """Efficient basic implementation of nx `Graph' – Undirected graphs with self loops""" def __init__(self): super(Graph, self).__init__(list) @@ -45,22 +46,22 @@ def adjacency_iter(self): def subgraph(self, nodes={}): subgraph = Graph() - + for n in nodes: if n in self: subgraph[n] = [x for x in self[n] if x in nodes] - + return subgraph def make_undirected(self): - + t0 = time() for v in self.keys(): for other in self[v]: if v != other: self[other].append(v) - + t1 = time() logger.info('make_directed: added missing edges {}s'.format(t1-t0)) @@ -71,7 +72,7 @@ def make_consistent(self): t0 = time() for k in iterkeys(self): self[k] = list(sorted(set(self[k]))) - + t1 = time() logger.info('make_consistent: made 
consistent in {}s'.format(t1-t0)) @@ -85,10 +86,10 @@ def remove_self_loops(self): t0 = time() for x in self: - if x in self[x]: + if x in self[x]: self[x].remove(x) removed += 1 - + t1 = time() logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0))) @@ -99,7 +100,7 @@ def check_self_loops(self): for y in self[x]: if x == y: return True - + return False def has_edge(self, v1, v2): @@ -115,7 +116,7 @@ def degree(self, nodes=None): def order(self): "Returns the number of nodes in the graph" - return len(self) + return len(self) def number_of_edges(self): "Returns the number of nodes in the graph" @@ -157,12 +158,12 @@ def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, walks = [] nodes = list(G.nodes()) - + for cnt in range(num_paths): rand.shuffle(nodes) for node in nodes: walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) - + return walks def build_deepwalk_corpus_iter(G, num_paths, path_length, alpha=0, @@ -194,7 +195,7 @@ def parse_adjacencylist(f): row = [introw[0]] row.extend(set(sorted(introw[1:]))) adjlist.extend([row]) - + return adjlist def parse_adjacencylist_unchecked(f): @@ -202,7 +203,7 @@ def parse_adjacencylist_unchecked(f): for l in f: if l and l[0] != "#": adjlist.extend([[int(x) for x in l.strip().split()]]) - + return adjlist def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True): @@ -218,13 +219,13 @@ def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True) t0 = time() - with open(file_) as f: + with smart_open(file_) as f: with ProcessPoolExecutor(max_workers=cpu_count()) as executor: - total = 0 + total = 0 for idx, adj_chunk in enumerate(executor.map(parse_func, grouper(int(chunksize), f))): adjlist.extend(adj_chunk) total += len(adj_chunk) - + t1 = time() logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0)) @@ -241,12 +242,12 @@ def load_adjacencylist(file_, undirected=False, chunksize=10000, 
unchecked=True) t1 = time() logger.info('Made graph undirected in {}s'.format(t1-t0)) - return G + return G def load_edgelist(file_, undirected=True): G = Graph() - with open(file_) as f: + with smart_open(file_) as f: for l in f: x, y = l.strip().split()[:2] x = int(x) @@ -254,7 +255,7 @@ def load_edgelist(file_, undirected=True): G[x].append(y) if undirected: G[y].append(x) - + G.make_consistent() return G @@ -298,7 +299,7 @@ def from_numpy(x, undirected=True): def from_adjlist(adjlist): G = Graph() - + for row in adjlist: node = row[0] neighbors = row[1:] @@ -309,7 +310,7 @@ def from_adjlist(adjlist): def from_adjlist_unchecked(adjlist): G = Graph() - + for row in adjlist: node = row[0] neighbors = row[1:]