How to make a Spider automatically crawl Douban group pages in Scrapy

1. Import CrawlSpider, another of Scrapy's predefined spiders.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

(These import paths are for the Scrapy releases of the time; on Scrapy 1.0 and later, scrapy.contrib.spiders became scrapy.spiders and SgmlLinkExtractor was superseded by scrapy.linkextractors.LinkExtractor.)
2. Define a new class, GroupSpider, based on CrawlSpider, and add the corresponding crawl rules.
class GroupSpider(CrawlSpider):
    name = "Group"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://www.douban.com/group/explore?tag=%E8%B4%AD%E7%89%A9",
        "http://www.douban.com/group/explore?tag=%E7%94%9F%E6%B4%BB",
        "http://www.douban.com/group/explore?tag=%E7%A4%BE%E4%BC%9A",
        "http://www.douban.com/group/explore?tag=%E8%89%BA%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E5%AD%A6%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E6%83%85%E6%84%9F",
        "http://www.douban.com/group/explore?tag=%E9%97%B2%E8%81%8A",
        "http://www.douban.com/group/explore?tag=%E5%85%B4%E8%B6%A3"
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )),
             callback='parse_group_home_page', process_request='add_cookie'),
        Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )),
             follow=True, process_request='add_cookie'),
    ]
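With the class and its rules in place, you can already launch the spider from the project root for a quick smoke test. This assumes the standard layout created by scrapy startproject:

scrapy crawl Group

The argument is the name attribute declared on GroupSpider.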
The rules definition is the most important part of a CrawlSpider. It can be read as: when the spider encounters a given type of page, this is how it should be handled.
For example, the following rule handles pages whose URL ends in /group/XXXX/: it invokes parse_group_home_page as the callback, and calls add_cookie before each request is sent to attach cookie information.
Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )), callback='parse_group_home_page', process_request='add_cookie'),
The second rule has no callback; it simply sets follow=True so that the spider keeps following the /group/explore?tag... listing pages to discover more groups, again attaching cookies via add_cookie:

Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )), follow=True, process_request='add_cookie'),
How to add cookies
Define the following function and, as described above, set process_request='add_cookie' in the Rule definition.
def add_cookie(self, request):
    # Request.replace() returns a new request rather than modifying
    # the original in place, so its result must be returned.
    return request.replace(cookies=[
        {'name': 'COOKIE_NAME', 'value': 'VALUE',
         'domain': '.douban.com', 'path': '/'},
    ])
How to keep the spider from being banned by the site
First, try attaching a logged-in user's cookies when fetching pages. Even if you are only crawling public pages, sending cookies may keep the spider from being blocked at the application layer. I have not verified this in practice, but it certainly does no harm.
Second, even as an authorized user, your IP may be banned if you make requests too frequently, so you generally need the spider to pause for 1-2 seconds between requests.
Finally, configure the User-Agent, and rotate among different User-Agent strings where possible.
In the Scrapy project's settings.py, add the following settings:
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
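The USER_AGENT setting above pins a single browser string. To actually rotate User-Agents as suggested, one option is a small downloader middleware. This is a minimal sketch under my own assumptions, not part of the original setup: the class name RotateUserAgentMiddleware and its agent list are illustrative, and the module path registered in DOWNLOADER_MIDDLEWARES must match wherever you put the class.

# middlewares.py (hypothetical helper module)
import random

class RotateUserAgentMiddleware(object):
    # A few sample desktop browser strings; extend the list as needed.
    user_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request.
        request.headers['User-Agent'] = random.choice(self.user_agents)

Then register it in settings.py so it runs in the downloader middleware chain:

DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RotateUserAgentMiddleware': 400,
}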
================
At this point, the spider that crawls Douban group pages is complete. Next, you can follow the same pattern to define a Spider that scrapes data from the group discussion pages (see the sketch after the listing below), and then turn the spider loose. Have Fun!
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from douban.items import DoubanItem

import re

class GroupSpider(CrawlSpider):
    name = "Group"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://www.douban.com/group/explore?tag=%E8%B4%AD%E7%89%A9",
        "http://www.douban.com/group/explore?tag=%E7%94%9F%E6%B4%BB",
        "http://www.douban.com/group/explore?tag=%E7%A4%BE%E4%BC%9A",
        "http://www.douban.com/group/explore?tag=%E8%89%BA%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E5%AD%A6%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E6%83%85%E6%84%9F",
        "http://www.douban.com/group/explore?tag=%E9%97%B2%E8%81%8A",
        "http://www.douban.com/group/explore?tag=%E5%85%B4%E8%B6%A3"
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )),
             callback='parse_group_home_page', process_request='add_cookie'),
        # Rule(SgmlLinkExtractor(allow=('/group/[^/]+/discussion\?start\=(\d{1,4})$', )), callback='parse_group_topic_list', process_request='add_cookie'),
        Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )),
             follow=True, process_request='add_cookie'),
    ]

    def __get_id_from_group_url(self, url):
        # Extract the group id from a group home page URL.
        m = re.search("^http://www.douban.com/group/([^/]+)/$", url)
        if m:
            return m.group(1)
        else:
            return 0

    def add_cookie(self, request):
        # Request.replace() returns a new request; return it instead of
        # the unmodified original.
        return request.replace(cookies=[])

    def parse_group_topic_list(self, response):
        self.log("Fetch group topic list page: %s" % response.url)

    def parse_group_home_page(self, response):
        self.log("Fetch group home page: %s" % response.url)

        hxs = HtmlXPathSelector(response)
        item = DoubanItem()

        # get group name
        item['groupName'] = hxs.select('//h1/text()').re("^\s+(.*)\s+$")[0]

        # get group id
        item['groupURL'] = response.url
        groupid = self.__get_id_from_group_url(response.url)

        # get group members number
        members_url = "http://www.douban.com/group/%s/members" % groupid
        members_text = hxs.select('//a[contains(@href, "%s")]/text()' % members_url).re("\((\d+)\)")
        item['totalNumber'] = members_text[0]

        # get relative groups
        item['RelativeGroups'] = []
        groups = hxs.select('//div[contains(@class, "group-list-item")]')
        for group in groups:
            url = group.select('div[contains(@class, "title")]/a/@href').extract()[0]
            item['RelativeGroups'].append(url)
        #item['RelativeGroups'] = ','.join(relative_groups)

        return item
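As a starting point for that discussion-page Spider, the stubbed parse_group_topic_list above could be fleshed out along these lines. Note that the XPath here is my assumption about the topic-list markup and has not been verified against Douban's actual pages:

def parse_group_topic_list(self, response):
    self.log("Fetch group topic list page: %s" % response.url)
    hxs = HtmlXPathSelector(response)
    # Assumed markup: rows of the topic table link to /group/topic/<id>/
    for href in hxs.select('//a[contains(@href, "/group/topic/")]/@href').extract():
        self.log("Found topic: %s" % href)

Re-enabling the commented-out discussion Rule in the rules list would then route those pages into this callback.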