-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinfomatch.py
More file actions
97 lines (81 loc) · 3.22 KB
/
infomatch.py
File metadata and controls
97 lines (81 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#coding=utf-8
# Name:
# Purpose:
# Author: libin
# Created:
#-----------------------------------------------------------------------------------------------------------------------
import os
import re
import urllib2
class infoMatcher(object):
    """Pull the key profile fields out of the HTML of one user page.

    ``domatch()`` is the entry point. It returns ``self.info``, a list that
    starts with the user id and is extended with one of:

    * ``sex, age, xinzuo, location, height, degree`` when the profile
      summary is found (text fields are unicode text, ``age`` and ``height``
      are ints, or ``None`` when the page does not contain them);
    * ``"closed"`` when the page carries the "user id wrong or does not
      exist" notice;
    * ``"other reasons"`` otherwise, in which case the raw page is also
      dumped to ``<record_dir>/<id>.txt`` for later inspection.

    The page may be given as bytes or as text. It is coerced once to the
    interpreter's native ``str`` type so that the (native ``str``) patterns
    below can be applied on both Python 2 and Python 3; bytes are assumed to
    be UTF-8, the encoding of this source file.
    """

    # Default directory for dumps of pages that could not be classified.
    record_dir = "D:\\other_reasons\\"

    # Patterns are compiled once for the class instead of on every call.
    _RE_CLOSED = re.compile("您查找的用户ID有误或不存在,请尝试其他方式搜索", re.DOTALL)
    _RE_PROFILE = re.compile("查看详细信息>></a(.*?)婚", re.DOTALL)
    _RE_SEX = re.compile(">(.*?),", re.DOTALL)
    _RE_AGE = re.compile(",([0-9][0-9])岁", re.DOTALL)
    _RE_XINZUO = re.compile("岁,(.*?),", re.DOTALL)
    _RE_LOCATION = re.compile("来自(.*?)<", re.DOTALL)
    _RE_HEIGHT = re.compile("身高:</b>(.*?)厘米", re.DOTALL)
    _RE_DEGREE = re.compile("学历:</b>(.*?)<", re.DOTALL)

    def __init__(self, web, id, record_dir=None):
        """Store the page and the user id it belongs to.

        web        -- page content, bytes or text.
        id         -- user id; becomes the first element of ``info``.
        record_dir -- optional override of the directory used by
                      ``record_otherReasons``; the class default is kept
                      when it is None.
        """
        self.id = id
        self.web = web
        if record_dir is not None:
            self.record_dir = record_dir
        self.info = [self.id]

    @staticmethod
    def _native(value):
        """Return value as the interpreter's native ``str`` type."""
        if isinstance(value, str):
            return value
        if isinstance(value, bytes):
            # Python 3 bytes -> text.
            return value.decode("utf-8")
        # Python 2 unicode -> byte string.
        return value.encode("utf-8")

    @staticmethod
    def _text(value):
        """Return value as unicode text, decoding UTF-8 bytes if needed."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    @staticmethod
    def _first(pattern, haystack):
        """Return group 1 of the first match of pattern, or "" if none."""
        found = pattern.search(haystack)
        return found.group(1) if found else ""

    @staticmethod
    def _to_int(digits):
        """Return int(digits), or None when digits is not a number."""
        try:
            return int(digits)
        except ValueError:
            return None

    def record_otherReasons(self, web_record, id_record):
        """Dump web_record to ``<record_dir>/<id_record>.txt``.

        The directory is created when missing. The content is written in
        binary mode (text is encoded as UTF-8) so the dump does not depend
        on the platform's default encoding or newline translation, and the
        file is closed even if the write fails.
        """
        directory = self.record_dir
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        path_record = os.path.join(directory, str(id_record) + ".txt")
        if not isinstance(web_record, bytes):
            web_record = web_record.encode("utf-8")
        with open(path_record, "wb") as handle:
            handle.write(web_record)

    def filter_closed(self):
        """Set ``content0`` to the "id wrong or does not exist" notice, or ""."""
        found = self._RE_CLOSED.search(self._native(self.web))
        self.content0 = found.group(0) if found else ""

    def filter(self):
        """Set ``content`` to the profile summary snippet, or "" if absent."""
        self.content = self._first(self._RE_PROFILE, self._native(self.web))

    def getsex(self):
        """Set ``sex`` from ``content`` ("" when not found)."""
        self.sex = self._first(self._RE_SEX, self.content)

    def getage(self):
        """Set ``age`` (int) from ``content``; None when not found."""
        self.age = self._to_int(self._first(self._RE_AGE, self.content))

    def getxinzuo(self):
        """Set ``xinzuo`` from the text after the age ("" when not found)."""
        self.xinzuo = self._first(self._RE_XINZUO, self.content)

    def getlocation(self):
        """Set ``location`` from ``content`` ("" when not found)."""
        self.location = self._first(self._RE_LOCATION, self.content)

    def getheight(self):
        """Set ``height`` (int) from ``content``; None when missing or not numeric."""
        self.height = self._to_int(self._first(self._RE_HEIGHT, self.content))

    def getdegree(self):
        """Set ``degree`` from ``content`` ("" when not found)."""
        self.degree = self._first(self._RE_DEGREE, self.content)

    def domatch(self):
        """Classify the page, extend ``info`` accordingly and return it."""
        self.filter()
        if self.content:
            self.getsex()
            self.getage()
            self.getxinzuo()
            self.getlocation()
            self.getheight()
            self.getdegree()
            # Store the text fields as unicode text so they can go straight
            # into a database or an xls file.
            self.info += [
                self._text(self.sex),
                self.age,
                self._text(self.xinzuo),
                self._text(self.location),
                self.height,
                self._text(self.degree),
            ]
        else:
            self.filter_closed()
            if self.content0:
                self.info += ["closed"]
            else:
                self.info += ["other reasons"]
                self.record_otherReasons(self.web, self.id)
        return self.info
def main():
    """Run the matcher on a canned string for user id 110 and print the result."""
    # A live run would fetch "http://www.jiayuan.com/" + str(user_id) with
    # urllib2 and hand the response body to infoMatcher instead of the
    # sample text used here.
    user_id = 110
    sample_page = "欺诈行为"
    matcher = infoMatcher(sample_page, user_id)
    print(matcher.domatch())


if __name__ == "__main__":
    main()