-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLinkSpider.py
More file actions
40 lines (35 loc) · 1.48 KB
/
LinkSpider.py
File metadata and controls
40 lines (35 loc) · 1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# -*- coding: utf-8 -*-
import scrapy
from tld import get_tld
from linksCrawl.items import LinkscrawlItem
from linksCrawl import settings
from urlparse import urlparse
class LinkspiderSpider(scrapy.Spider):
    """Crawl outward from settings.START_URLS, yielding external links.

    For every anchor on a page:
      * links whose registered domain differs from the current page's are
        emitted as LinkscrawlItem(link=<target url>, source=<scheme://netloc
        of the current page>);
      * same-domain absolute links and root-relative ("/...") links are
        followed recursively with this same callback.
    """

    name = "linkSpider"
    #allowed_domains = ["msn.com"]
    start_urls = settings.START_URLS

    def parse(self, response):
        """Extract hrefs from *response*; yield items and follow-up Requests.

        Yields:
            LinkscrawlItem for cross-domain links, scrapy.Request for
            same-domain or relative links.
        """
        domain = get_tld(response.url)
        url_result = urlparse(response.url)
        # Base used both as item['source'] and to absolutize "/..." links.
        top_domain = url_result.scheme + '://' + url_result.netloc
        for sel in response.xpath('//a/@href'):
            link = sel.extract()
            if link.startswith(('http://', 'https://', 'www.')):
                # Schemeless "www." links are not fetchable by Scrapy and
                # break get_tld; give them an explicit scheme first.
                if link.startswith('www.'):
                    link = 'http://' + link
                try:
                    target_domain = get_tld(link)
                except Exception:
                    # Narrow try: only the domain extraction can legitimately
                    # fail here; log and skip rather than swallowing all
                    # errors from the whole loop body.
                    print("The url can't get the domain. Ignored..." + link)
                    continue
                if domain != target_domain:
                    # External link: record it instead of following it.
                    item = LinkscrawlItem()
                    item['link'] = link
                    item['source'] = top_domain
                    yield item
                else:
                    # Same registered domain: keep crawling.
                    yield scrapy.Request(link, callback=self.parse)
            elif link.startswith('/'):
                # Root-relative link: absolutize against the current page.
                yield scrapy.Request(top_domain + link, callback=self.parse)