-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebpage2json.py
More file actions
executable file
·50 lines (42 loc) · 1.73 KB
/
webpage2json.py
File metadata and controls
executable file
·50 lines (42 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#! /usr/bin/env python3
''' Summarize webpage
Read the webpage http://www-01.ibm.com/support/docview.wss?uid=nas4PSPbyNum,
and create a shortened version of the table found there.
Output is JSON containing a list of dictionaries, each dictionary
containing entries for release, title, group, level, and updated date.
Output is to stdout, so that it can be fed into another program, or
saved to a file. The JSON is formatted for easy (human) reading.
'''
import sys
import requests
import json
from bs4 import BeautifulSoup
# Obtain the page HTML: either from a saved local copy given as the first
# command-line argument, or by fetching the live IBM PSP page.
if len(sys.argv) > 1:
    # A previously-saved copy of the page was supplied on the command line.
    with open(sys.argv[1], "r") as f:
        text = f.read()
    # Parse and break down the HTML found on the page.
    soup = BeautifulSoup(text, features="lxml")
else:
    # Retrieve the web page.  The timeout prevents the script from hanging
    # forever if the server is unreachable.
    page = requests.get(
        "http://www-01.ibm.com/support/docview.wss?uid=nas4PSPbyNum",
        timeout=30,
    )
    if page.status_code != 200:
        print("Could not fetch webpage. Status code = {}".format(page.status_code))
        # sys.exit, not the interactive-only `exit` builtin, so the script
        # terminates correctly when run non-interactively (e.g. under -S
        # or frozen).
        sys.exit(1)
    # Parse and break down the HTML found on the page.
    soup = BeautifulSoup(page.text, features="lxml")

# Find the division which contains the table.  Fail with a clear message
# (rather than an IndexError) if the page layout has changed.
div = soup.find_all("div", {"class": "dblue-table-container"})
if not div:
    print("Could not find the PSP table on the page.")
    sys.exit(1)

# Collect one dictionary per data row.  Header rows contain only <th>
# cells, so they produce an empty <td> list and are skipped; requiring all
# five columns also guards against IndexError on malformed/partial rows.
collected = []
for trow in div[0].find_all("tr"):
    tdr = trow.find_all("td")
    if len(tdr) >= 5:
        collected.append({
            "release": tdr[0].text,
            "title": tdr[1].text,
            "group": tdr[2].text,
            "level": tdr[3].text,
            "updated": tdr[4].text,
        })

# Emit human-readable JSON on stdout so it can be piped into another
# program or redirected to a file.
print(json.dumps(collected, indent=2))