diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1777d96 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +sitemap_urls.dat +sitemap_layers.csv +sitemap_graph* \ No newline at end of file diff --git a/README.md b/README.md index 1607e9b..4484fa5 100644 --- a/README.md +++ b/README.md @@ -81,16 +81,14 @@ The code can run in Python 2 or 3 and the external library dependencies are as f Once you have Python, these libraries can most likely be installed on any operating system with the following terminal commands: ``` -pip install requests -pip install beautifulsoup4 -pip install pandas +pip install -r requirements.txt ``` -The Graphviz library is more difficult to install. On Mac it can be done with the help of homebrew: +If you are on Mac, make sure to install Graphviz with Homebrew before running the pip command: ``` brew install graphviz -pip install graphviz +pip install -r requirements.txt ``` For other operating systems or alternate methods, check out the [installation instructions in the Graphviz documentation](http://graphviz.readthedocs.io/en/latest/manual.html). diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8d75786 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +requests +beautifulsoup4 +pandas +graphviz diff --git a/visualize_urls.py b/visualize_urls.py index 9fa7ca3..4943340 100644 --- a/visualize_urls.py +++ b/visualize_urls.py @@ -27,6 +27,7 @@ size = '8,5' # Size of rendered graph output_format = 'pdf' # Format of rendered image - pdf,png,tiff skip = '' # List of branches to restrict from expanding +only = '' # List of branches to visualize (all others are left unexpanded) # Import external library dependencies @@ -48,6 +49,8 @@ help='Format of the graph you want to save. Allowed formats are jpg, png, pdf or tif') parser.add_argument('--skip', type=str, default=skip, help="List of branches that you do not want to expand. Comma separated: e.g. 
--skip 'news,events,datasets'") +parser.add_argument('--only', type=str, default=only, + help="List of branches that you want to visualize. Comma separated: e.g. --only 'news,events,datasets'") args = parser.parse_args() @@ -60,10 +63,11 @@ size = args.size output_format = args.output_format skip = args.skip.split(',') +only = args.only.split(',') # Main script functions -def make_sitemap_graph(df, layers=graph_depth, limit=limit, size=size, output_format=output_format, skip=skip): +def make_sitemap_graph(df, layers=graph_depth, limit=limit, size=size, output_format=output_format, skip=skip, only=only): ''' Make a sitemap graph up to a specified layer depth. sitemap_layers : DataFrame @@ -82,6 +86,9 @@ def make_sitemap_graph(df, layers=graph_depth, limit=limit, size=size, output_fo skip : list List of branches that you do not want to expand. + + only : list + List of branches that you want to visualize. ''' @@ -123,7 +130,7 @@ def add_branch(f, names, vals, limit, connect_to=''): f.attr('node', shape='oval') # Plot nodes as ovals f.graph_attr.update() - + # Loop over each layer adding nodes and edges to prior nodes for i in range(1, layers+1): cols = [str(i_) for i_ in range(i)] @@ -138,15 +145,14 @@ def add_branch(f, names, vals, limit, connect_to=''): # Select the data then count branch size, sort, and truncate data = df[mask].groupby([str(i)])['counts'].sum()\ .reset_index().sort_values(['counts'], ascending=False) - # Add to the graph unless specified that we do not want to expand k-1 - if (not skip) or (k[-1] not in skip): - add_branch(f, - names=data[str(i)].values, - vals=data['counts'].values, - limit=limit, - connect_to='-'.join(['%s']*i) % tuple(k)) - + if (skip == ['']) or (k[-1] not in skip): + if (only == ['']) or (k[-1] in only) or (i == 1): + add_branch(f, + names=data[str(i)].values, + vals=data['counts'].values, + limit=limit, + connect_to='-'.join(['%s']*i) % tuple(k)) print(('Built graph up to node %d / %d in layer %d' % (j, len(nodes), i))\ 
.ljust(50), end='\r') @@ -240,7 +246,7 @@ def main(): print('Building %d layer deep sitemap graph' % graph_depth) f = make_sitemap_graph(sitemap_layers, layers=graph_depth, - limit=limit, size=size, output_format=output_format, skip=skip) + limit=limit, size=size, output_format=output_format, skip=skip, only=only) f = apply_style(f, style=style, title=title) f.render(cleanup=True)