forked from xharaken/step2
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwikipedia.py
More file actions
138 lines (115 loc) · 4.38 KB
/
wikipedia.py
File metadata and controls
138 lines (115 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import sys
import collections
class Wikipedia:
    """A directed graph of Wikipedia pages.

    Pages are identified by integer IDs; an edge src -> dst means the page
    'src' contains a hyperlink to the page 'dst'.
    """

    # Initialize the graph of pages.
    def __init__(self, pages_file, links_file):
        """Load the graph from two text files.

        'pages_file': one page per line, formatted "<id> <title>".
        'links_file': one link per line, formatted "<src_id> <dst_id>".
        """
        # A mapping from a page ID (integer) to the page title.
        # For example, self.titles[1234] returns the title of the page whose
        # ID is 1234.
        self.titles = {}
        # A set of page links.
        # For example, self.links[1234] returns an array of page IDs linked
        # from the page whose ID is 1234.
        self.links = {}
        # Read the pages file into self.titles.
        with open(pages_file) as file:
            for line in file:
                (page_id, title) = line.rstrip().split(" ")
                page_id = int(page_id)
                # Page IDs must be unique.
                assert page_id not in self.titles, page_id
                self.titles[page_id] = title
                self.links[page_id] = []
        print("Finished reading %s" % pages_file)
        # Read the links file into self.links.
        with open(links_file) as file:
            for line in file:
                (src, dst) = line.rstrip().split(" ")
                (src, dst) = (int(src), int(dst))
                # Both endpoints must be pages we already know about.
                assert src in self.titles, src
                assert dst in self.titles, dst
                self.links[src].append(dst)
        print("Finished reading %s" % links_file)
        print()

    def _title_to_id(self, title):
        """Return the ID of the page titled 'title', or None if absent."""
        for page_id, page_title in self.titles.items():
            if page_title == title:
                return page_id
        return None

    # Example: Find the longest titles.
    def find_longest_titles(self):
        """Print the 15 longest titles that contain no underscore."""
        titles = sorted(self.titles.values(), key=len, reverse=True)
        print("The longest titles are:")
        count = 0
        index = 0
        while count < 15 and index < len(titles):
            if "_" not in titles[index]:
                print(titles[index])
                count += 1
            index += 1
        print()

    # Example: Find the most linked pages.
    def find_most_linked_pages(self):
        """Print every page whose inbound-link count is the maximum."""
        # Start every page at zero so pages with no inbound links count too.
        link_count = dict.fromkeys(self.titles, 0)
        for destinations in self.links.values():
            for dst in destinations:
                link_count[dst] += 1
        print("The most linked pages are:")
        link_count_max = max(link_count.values())
        for dst, count in link_count.items():
            if count == link_count_max:
                print(self.titles[dst], link_count_max)
        print()

    # Homework #1: Find the shortest path.
    # 'start': A title of the start page.
    # 'goal': A title of the goal page.
    def find_shortest_path(self, start, goal):
        """Find a shortest path from 'start' to 'goal' with breadth-first
        search.

        Prints and returns the path as a list of page titles, or prints
        "Not found" and returns None when either title is unknown or no
        path exists.
        """
        start_id = self._title_to_id(start)
        goal_id = self._title_to_id(goal)
        if start_id is None or goal_id is None:
            print("Not found")
            print()
            return None
        # 'previous' doubles as the visited set and records each node's BFS
        # parent so the path can be reconstructed once the goal is reached.
        previous = {start_id: None}
        queue = collections.deque([start_id])
        while queue:
            node = queue.popleft()
            if node == goal_id:
                path = []
                while node is not None:
                    path.append(self.titles[node])
                    node = previous[node]
                path.reverse()
                print(" -> ".join(path))
                print()
                return path
            for child in self.links[node]:
                if child not in previous:
                    previous[child] = node
                    queue.append(child)
        print("Not found")
        print()
        return None

    # Homework #2: Calculate the page ranks and print the most popular pages.
    def find_most_popular_pages(self, damping=0.85, tolerance=0.01):
        """Compute PageRank by power iteration and print the top 10 pages.

        Every page starts with rank 1.0.  On each step a page sends
        'damping' of its rank along its outgoing links and shares the rest
        evenly among all pages; a page with no outgoing links shares its
        whole rank evenly (so total rank mass is conserved).  Iteration
        stops when the squared rank change falls below 'tolerance'.
        Returns the final {page_id: rank} mapping.
        """
        total = len(self.titles)
        if total == 0:
            return {}
        ranks = dict.fromkeys(self.titles, 1.0)
        while True:
            new_ranks = dict.fromkeys(self.titles, 0.0)
            shared = 0.0  # rank mass to be divided evenly among all pages
            for page_id, rank in ranks.items():
                destinations = self.links[page_id]
                if destinations:
                    shared += (1 - damping) * rank
                    contribution = damping * rank / len(destinations)
                    for dst in destinations:
                        new_ranks[dst] += contribution
                else:
                    # Dangling page: spread its entire rank evenly.
                    shared += rank
            bonus = shared / total
            for page_id in new_ranks:
                new_ranks[page_id] += bonus
            # Squared L2 distance between successive iterations.
            delta = sum((new_ranks[p] - ranks[p]) ** 2 for p in ranks)
            ranks = new_ranks
            if delta < tolerance:
                break
        print("The most popular pages are:")
        top = sorted(ranks.items(), key=lambda item: item[1], reverse=True)
        for page_id, rank in top[:10]:
            print(self.titles[page_id], rank)
        print()
        return ranks

    # Homework #3 (optional):
    # Search the longest path with heuristics.
    # 'start': A title of the start page.
    # 'goal': A title of the goal page.
    def find_longest_path(self, start, goal):
        """Search for a long simple path from 'start' to 'goal'.

        The exact longest-path problem is NP-hard, so this runs a
        depth-first search over simple paths with two heuristics: neighbors
        with many outgoing links are explored first, and the search stops
        after a fixed node-expansion budget.  Prints and returns the best
        path found as a list of titles, or None.
        NOTE(review): recursive DFS — may hit the recursion limit on paths
        longer than ~1000 pages; acceptable at homework scale.
        """
        start_id = self._title_to_id(start)
        goal_id = self._title_to_id(goal)
        # assert_path requires start != goal, so treat that case as absent.
        if start_id is None or goal_id is None or start_id == goal_id:
            print("Not found")
            print()
            return None
        best = None
        budget = 1000000  # expansion cap keeps the search bounded
        path = [start_id]
        on_path = {start_id}

        def dfs(node):
            nonlocal best, budget
            if budget <= 0:
                return
            budget -= 1
            if node == goal_id:
                # A simple path ends at the goal; keep it if longest so far.
                if best is None or len(path) > len(best):
                    best = list(path)
                return
            # Heuristic: explore neighbors with many outgoing links first.
            neighbors = sorted(self.links[node],
                               key=lambda child: len(self.links[child]),
                               reverse=True)
            for child in neighbors:
                if child not in on_path:
                    on_path.add(child)
                    path.append(child)
                    dfs(child)
                    path.pop()
                    on_path.remove(child)

        dfs(start_id)
        if best is None:
            print("Not found")
            print()
            return None
        self.assert_path(best, start, goal)
        titles = [self.titles[page_id] for page_id in best]
        print(" -> ".join(titles))
        print()
        return titles

    # Helper function for Homework #3:
    # Please use this function to check if the found path is well formed.
    # 'path': An array of page IDs that stores the found path.
    #     path[0] is the start page. path[-1] is the goal page.
    #     path[0] -> path[1] -> ... -> path[-1] is the path from the start
    #     page to the goal page.
    # 'start': A title of the start page.
    # 'goal': A title of the goal page.
    def assert_path(self, path, start, goal):
        """Assert that 'path' is a well-formed simple path whose endpoints
        are titled 'start' and 'goal'."""
        assert start != goal
        assert len(path) >= 2
        assert self.titles[path[0]] == start
        assert self.titles[path[-1]] == goal
        # Every consecutive pair must be an actual link.
        for i in range(len(path) - 1):
            assert path[i + 1] in self.links[path[i]]
        # The path must not visit any page twice.
        visited = set()
        for node in path:
            assert node not in visited
            visited.add(node)
if __name__ == "__main__":
    # Usage: python wikipedia.py <pages_file> <links_file>
    if len(sys.argv) != 3:
        print("usage: %s pages_file links_file" % sys.argv[0])
        # Use sys.exit: the bare 'exit' builtin is injected by site.py and
        # is not guaranteed to exist (e.g. under 'python -S' or when frozen).
        sys.exit(1)
    wikipedia = Wikipedia(sys.argv[1], sys.argv[2])
    # Example
    wikipedia.find_longest_titles()
    # Example
    wikipedia.find_most_linked_pages()
    # Homework #1
    wikipedia.find_shortest_path("渋谷", "パレートの法則")
    # Homework #2
    wikipedia.find_most_popular_pages()
    # Homework #3 (optional)
    wikipedia.find_longest_path("渋谷", "池袋")