创建scrapy工程
1
2
|
cd C:\Spider_dev\app\scrapyprojects
scrapy startproject renren
|
创建定向爬虫
1
2
|
cd renren
scrapy genspider Person renren.com
|
查看目录结构
定义items
1
2
3
4
5
6
|
class RenrenItem(scrapy.Item):
    """Item holding the basic-profile fields scraped from a renren.com page."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    sex = scrapy.Field()       # gender
    birthday = scrapy.Field()  # date of birth
    addr = scrapy.Field()      # hometown
|
编写爬虫
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
# -*- coding: gbk -*-
import scrapy
# 导入items中的数据项定义模块
from
renren.items import RenrenItem
class PersonSpider(scrapy.Spider):
    """Log in to renren.com, then scrape sex/birthday/hometown from a profile page.

    Flow: start_requests() POSTs the login form -> login() issues requests for
    start_urls -> parse() extracts the fields into a RenrenItem.
    """
    name = "Person"
    allowed_domains = ['renren.com']
    # Profile page to fetch after a successful login.
    start_urls = ['http://www.renren.com/913043576/profile?v=info_timeline']

    def start_requests(self):
        """Entry point: submit the login form; scraping continues in login()."""
        # NOTE(review): credentials are hard-coded — move them to settings or
        # environment variables before sharing/deploying this spider.
        return [scrapy.FormRequest(
            'http://www.renren.com/PLogin.do',
            formdata={'email': '15201417639',
                      'password': 'kongzhagen.com'},
            callback=self.login)]

    def login(self, response):
        """After logging in, request each start URL (default callback: parse)."""
        for url in self.start_urls:
            # make_requests_from_url() is deprecated and was removed in
            # Scrapy 2.0 — build the Request explicitly instead.
            # dont_filter=True mirrors the old helper's behavior.
            yield scrapy.Request(url, dont_filter=True)

    def parse(self, response):
        """Extract the basic-info fields from the profile page.

        Returns a RenrenItem with 'sex', 'birthday' and 'addr' populated
        (empty string when a field is missing from the page).
        """
        item = RenrenItem()
        basicInfo = response.xpath('//div[@id="basicInfo"]')
        # extract_first(default) avoids an IndexError when the node is absent,
        # unlike extract()[0].
        item['sex'] = basicInfo.xpath(
            'div[2]/dl[1]/dd/text()').extract_first('')
        # Birthday may be split across several <a> text nodes; join them.
        item['birthday'] = ''.join(
            basicInfo.xpath('div[2]/dl[2]/dd/a/text()').extract())
        item['addr'] = basicInfo.xpath(
            'div[2]/dl[3]/dd/text()').extract_first('')
        return item
|
解释:
allowed_domains:定义允许访问的域名
start_urls:登录人人网后要访问的URL
start_requests:爬虫的入口函数,FormRequest定义了scrapy如何以POST方式提交表单数据,返回请求列表或迭代器,回调函数为login。
login:登录人人网之后的处理函数,make_requests_from_url为start_urls中的每个URL生成请求,其默认的回调函数为parse
parse:处理make_requests_from_url函数返回的结果
执行爬虫
1
|
scrapy crawl Person -o person.csv
|