Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sitemap_urls.dat
sitemap_layers.csv
sitemap_graph*
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,14 @@ The code can run in Python 2 or 3 and the external library dependencies are as f
Once you have Python, these libraries can most likely be installed on any operating system with the following terminal commands:

```
pip install requests
pip install beautifulsoup4
pip install pandas
pip install -r requirements.txt
```

The Graphviz library is more difficult to install. On Mac it can be done with the help of homebrew:
If you are on Mac, make sure to install Graphviz with Homebrew before running the pip command:

```
brew install graphviz
pip install graphviz
pip install -r requirements.txt
```

For other operating systems or alternate methods, check out the [installation instructions in the Graphviz documentation](http://graphviz.readthedocs.io/en/latest/manual.html).
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
requests
beautifulsoup4
pandas
graphviz
28 changes: 17 additions & 11 deletions visualize_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
size = '8,5' # Size of rendered graph
output_format = 'pdf' # Format of rendered image - pdf,png,tiff
skip = '' # Comma-separated list of branches that should not be expanded
only = '' # Comma-separated list of branches to visualize exclusively (empty string = no restriction)

# Import external library dependencies

Expand All @@ -48,6 +49,8 @@
help='Format of the graph you want to save. Allowed formats are jpg, png, pdf or tif')
parser.add_argument('--skip', type=str, default=skip,
help="List of branches that you do not want to expand. Comma separated: e.g. --skip 'news,events,datasets'")
parser.add_argument('--only', type=str, default=only,
help="List of branches that you want to visualize. Comma separated: e.g. --only 'news,events,datasets'")
args = parser.parse_args()


Expand All @@ -60,10 +63,11 @@
size = args.size
output_format = args.output_format
skip = args.skip.split(',')
only = args.only.split(',')

# Main script functions

def make_sitemap_graph(df, layers=graph_depth, limit=limit, size=size, output_format=output_format, skip=skip):
def make_sitemap_graph(df, layers=graph_depth, limit=limit, size=size, output_format=output_format, skip=skip, only=only):
''' Make a sitemap graph up to a specified layer depth.

sitemap_layers : DataFrame
Expand All @@ -82,6 +86,9 @@ def make_sitemap_graph(df, layers=graph_depth, limit=limit, size=size, output_fo

skip : list
List of branches that you do not want to expand.

only : list
List of branches that you want to visualize.
'''


Expand Down Expand Up @@ -123,7 +130,7 @@ def add_branch(f, names, vals, limit, connect_to=''):

f.attr('node', shape='oval') # Plot nodes as ovals
f.graph_attr.update()

# Loop over each layer adding nodes and edges to prior nodes
for i in range(1, layers+1):
cols = [str(i_) for i_ in range(i)]
Expand All @@ -138,15 +145,14 @@ def add_branch(f, names, vals, limit, connect_to=''):
# Select the data then count branch size, sort, and truncate
data = df[mask].groupby([str(i)])['counts'].sum()\
.reset_index().sort_values(['counts'], ascending=False)

# Add to the graph unless specified that we do not want to expand k-1
if (not skip) or (k[-1] not in skip):
add_branch(f,
names=data[str(i)].values,
vals=data['counts'].values,
limit=limit,
connect_to='-'.join(['%s']*i) % tuple(k))

if (skip == ['']) or (k[-1] not in skip):
if (only == ['']) or (k[-1] in only) or (i == 1):
add_branch(f,
names=data[str(i)].values,
vals=data['counts'].values,
limit=limit,
connect_to='-'.join(['%s']*i) % tuple(k))
print(('Built graph up to node %d / %d in layer %d' % (j, len(nodes), i))\
.ljust(50), end='\r')

Expand Down Expand Up @@ -240,7 +246,7 @@ def main():

print('Building %d layer deep sitemap graph' % graph_depth)
f = make_sitemap_graph(sitemap_layers, layers=graph_depth,
limit=limit, size=size, output_format=output_format, skip=skip)
limit=limit, size=size, output_format=output_format, skip=skip, only=only)
f = apply_style(f, style=style, title=title)

f.render(cleanup=True)
Expand Down