-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLPJob.py
More file actions
105 lines (101 loc) · 3.49 KB
/
NLPJob.py
File metadata and controls
105 lines (101 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
__author__ = "SakuraYUI"
# -*- coding:utf-8 -*-
import urllib
import urllib2
#def __init__(self):
# self.suffix = ["nlp", "machine-learning", "data-mining", "search-enging", "recommend-system", "compute-ad", "big-data", "others"]
# self.user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
# self.headers = {"User-Agent" : self.user_agent}
# self.enable = False
def getPage():
    # Job-category slugs crawled on nlpjob.com; each slug gets its own
    # paginated walk via getPageIndex.
    # NOTE(review): "search-enging" looks like a typo for "search-engine",
    # but it is a URL path segment — confirm against the live site before
    # changing it.
    categories = ["nlp", "machine-learning", "data-mining", "search-enging", "recommend-system", "compute-ad", "big-data", "others"]
    for category in categories:
        print('starting to deal with:' + category + '...')
        getPageIndex(category)
def getPageIndex(index):
    """Walk every listing page for one job-category slug.

    Fetches http://www.nlpjob.com/jobs/<index>/shixi/?p=<n> page by page,
    hands each page's HTML to getPageItem, and stops once the pager no
    longer links a next page (see outOfPage).  On a URL error, prints the
    reason (when available) and returns None.
    """
    try:
        pageNum = 1
        while True:
            # str(), not bytes(): on Python 3 bytes(int) builds a
            # zero-filled buffer; on Python 2 both spell the decimal digits.
            url = "http://www.nlpjob.com/jobs/" + index + "/shixi/?p=" + str(pageNum)
            urlNext = "http://www.nlpjob.com/jobs/" + index + "/shixi/?p=" + str(pageNum + 1)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            pageCode = response.read().decode('utf-8')
            getPageItem(pageCode)
            pageNum = pageNum + 1
            # Stop when the pager does not reference the next page's URL.
            if outOfPage(urlNext, pageCode):
                break
    except urllib2.URLError as e:  # "as" syntax works on Python 2.6+ and 3
        if hasattr(e, "reason"):
            print("Connecting failed " + str(e.reason))
        return None
def outOfPage(url, pageCode):
    """Return True when *pageCode*'s pager does not link *url* as a
    current page — i.e. the listing has run out of pages."""
    marker = 'class="current_page" href="' + url
    return marker not in pageCode
def getPageItem(pageCode):
    """Collect every job-detail URL from one listing page's HTML and
    forward the list to getItem.

    NOTE(review): the anchor pattern keeps the original's space after '<'
    ('< a href=') — verify it actually matches the site's markup.
    """
    itemList = []
    offset = pageCode.find('<div class="row">')
    anchor = '< a href="http://www.nlpjob.com/job/'
    while True:
        offset = pageCode.find(anchor, offset)
        if offset == -1:
            break
        # The opening quote sits 9 chars past the match start, so the URL
        # text begins 10 chars in.  (The original advanced only 9, landed on
        # the quote itself, and appended an empty string for every item.)
        start = offset + 10
        end = pageCode.find('"', start)
        if end == -1:  # unterminated attribute: nothing more to collect
            break
        itemList.append(pageCode[start:end])
        offset = end
    getItem(itemList)
def _readUntil(page, offset, stop='<'):
    """Return the text from *offset* up to (not including) the next *stop*
    character, or the rest of *page* if *stop* never occurs.

    Replaces the original char-by-char loops, which raised IndexError when
    the stop character was missing before end-of-string.
    """
    end = page.find(stop, offset)
    if end == -1:
        return page[offset:]
    return page[offset:end]


def getItem(itemList):
    """Fetch each job-detail URL, scrape category/title/company/city out of
    the page HTML, print them, and dump the list of dicts to the result file.

    itemList -- job-detail page URLs gathered by getPageItem.
    """
    dictArr = []
    for itemUrl in itemList:
        request = urllib2.Request(itemUrl)
        response = urllib2.urlopen(request)
        page = response.read().decode('utf-8')
        # Category: text of the first <span> after the selected nav tab.
        offset = page.find('class="selected"')
        offset = page.find('<span>', offset) + 6
        index = _readUntil(page, offset)
        # Title: text inside the tag following <h2>; the +10 skip and the
        # '>' search mirror the original offset arithmetic.
        offset = page.find('<h2>') + 10
        offset = page.find('>', offset) + 1
        title = _readUntil(page, offset)
        # Company and city: the next two <strong> elements.
        offset = page.find('<strong>', offset) + 8
        company = _readUntil(page, offset)
        offset = page.find('<strong>', offset) + 8
        city = _readUntil(page, offset)
        print(index.strip())
        print(title.strip())
        print(company.strip())
        print(city.strip())
        dictArr.append({
            'index': index.strip(),
            'title': title.strip(),
            'company': company.strip(),
            'city': city.strip(),
        })
    # 'with' closes the file even on error.  The original misspelled write()
    # as wirte() (AttributeError) and passed a list, which write() rejects —
    # serialize to str first.
    with open('/home/lzy/workspace/pythonbug/res.txt', 'w') as f:
        f.write(str(dictArr))