forked from nukeador/20up
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMyHTMLParser.py
More file actions
73 lines (61 loc) · 2.49 KB
/
MyHTMLParser.py
File metadata and controls
73 lines (61 loc) · 2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""
Copyright (C) 2013 Borja Menendez Moreno
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Authors: Borja Menéndez Moreno <tuentiup@gmail.com>
Program for the backup of Tuenti, a Spanish social network.
This program downloads all of the photos, comments, private messages and
friends' information of a specific user.
"""
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
class MyHTMLParser(HTMLParser):
    """Copy selected text nodes of an HTML document into a text file.

    Two flags decide what ``handle_data`` does with the next text node:

    * ``writeData`` -- write the text as a plain line.
    * ``writeUser`` -- write a dashed separator line, then the text
      followed by a colon.

    Start tags switch the flags on (a ``<small>`` tag, or an attribute
    whose value contains ``user_id`` or equals ``time``) or off (an
    attribute whose value contains ``box``).  The ``small``, ``div`` and
    ``a`` end tags switch both flags off.  ``setFile`` must be called
    before any text is written; the file is closed on ``</html>``.
    """

    def __init__(self):
        """Create a parser with no output file and both flags cleared."""
        HTMLParser.__init__(self)
        # Stays '' until setFile() replaces it with an open file object.
        self.fileToWrite = ''
        self.writeData = False
        self.writeUser = False

    def _closeFile(self):
        """Close the current output file, if one was ever opened."""
        # The initial '' placeholder is falsy, so it is skipped here;
        # closing an already-closed file object is a harmless no-op.
        if self.fileToWrite:
            self.fileToWrite.close()

    def _setFlags(self, writeData, writeUser):
        """Set both output flags in one step."""
        self.writeData = writeData
        self.writeUser = writeUser

    def setFile(self, fileName):
        """Open ``fileName + '.txt'`` for writing as the output file.

        Any file opened by an earlier call is closed first so that its
        handle is not leaked when the parser is reused.
        """
        self._closeFile()
        self.fileToWrite = open(fileName + '.txt', 'w')

    def handle_starttag(self, tag, attrs):
        """Update the output flags from a start tag and its attributes.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser.  Every attribute is examined in order, so a later
        matching attribute overrides an earlier one.
        """
        if tag == 'small':
            self._setFlags(True, False)

        for attr in attrs:
            value = attr[1]
            if value is None:
                # Valueless attributes (e.g. <input disabled>) carry no
                # text to inspect; previously this raised AttributeError.
                continue
            # Plain substring/equality tests work for both byte and
            # unicode strings, so no per-attribute encoding is needed.
            if 'user_id' in value:
                self._setFlags(False, True)
            elif 'box' in value:
                self._setFlags(False, False)
            elif value == 'time':
                self._setFlags(True, False)

    def handle_endtag(self, tag):
        """Clear the flags on ``small``/``div``/``a``; close on ``html``."""
        if tag in ('small', 'div', 'a'):
            # The 'a' branch used to assign to a misspelled attribute
            # ('writedata'), leaving writeData switched on after </a>.
            self._setFlags(False, False)
        elif tag == 'html':
            self._closeFile()

    def handle_data(self, data):
        """Write ``data`` to the output file according to the flags.

        Nothing is written when both flags are cleared.  ``writeData``
        takes precedence over ``writeUser``.
        """
        if self.writeData:
            self.fileToWrite.write(data.encode('utf-8') + '\r\n')
        elif self.writeUser:
            self.fileToWrite.write('----------------------------------------\r\n')
            self.fileToWrite.write(data.encode('utf-8') + ':\r\n')