1.Python爬虫实战一之爬取糗事百科段子
(http://cuiqingcai.com/990.html)
2.在工作目录创建myproject
scrapy startproject myproject
3.编写/myproject/myproject/items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
#Item是需要的数据进行格式化,方便后期处理
class MyItem(scrapy.Item):
    """Container for the three values scraped per post."""

    # Name taken from the post's author header.
    user = scrapy.Field()
    # Text of the post itself.
    content = scrapy.Field()
    # Fragment captured from the post's "main-text" block.
    godComment = scrapy.Field()
4.编写/myproject/myproject/spiders/MySpider.py
# -*- coding:utf-8 -*-
import scrapy
import re
from myproject.items import MyItem
#Spider是指定URL,发送请求和接收原始数据,再根据Item进行数据操作
class MySpider(scrapy.Spider):
    """Spider that scrapes a single page of the qiushibaike.com "hot" list.

    The page number is supplied as a spider argument, e.g.
    ``scrapy crawl myspider -a pageIndex=1``. Every regex match on the
    downloaded page is yielded as a ``MyItem``.
    """

    name = 'myspider'

    # Compiled once at class-definition time rather than on every parse()
    # call. re.S lets '.' match newlines, because one post's markup spans
    # many lines of HTML.
    _item_pattern = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>'
        r'.*?'
        r'<div class="content">.*?<span>(.*?)</span>.*?</div>'
        r'.*?'
        r'<div class="main-text">(.*?)<div class="likenum">',
        re.S,
    )

    def __init__(self, pageIndex=None, *args, **kwargs):
        """Build the start URL from the requested page number.

        Args:
            pageIndex: Page of the "hot" list to fetch. Falls back to the
                first page when omitted, so the URL never ends in "None".
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super(MySpider, self).__init__(*args, **kwargs)
        if pageIndex is None:
            pageIndex = 1
        self.start_urls = ['http://www.qiushibaike.com/hot/page/%s' % pageIndex]

    def parse(self, response):
        """Extract (user, content, godComment) triples from the page.

        Args:
            response: The downloaded page; its body is decoded with the
                encoding reported by the response itself.

        Yields:
            MyItem: One item per regex match, holding the captured groups
            exactly as matched (the printed copies are stripped, the
            stored values are not).
        """
        html = response.body.decode(response.encoding)
        matches = self._item_pattern.findall(html)
        print("lin len: %d" % len(matches))
        for user, content, god_comment in matches:
            print("lin User: %s" % user.strip())
            print("lin Content: %s" % content.strip())
            print("lin God comments: %s" % god_comment.strip())
            yield MyItem(user=user, content=content, godComment=god_comment)
5.设置/myproject/myproject/settings.py的headers
# Default headers for outgoing requests: identify as an (old) MSIE browser.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
6.执行myspider,传入pageIndex=1,结果数据保存为items.json
scrapy crawl myspider -a pageIndex=1 -o items.json
7.控制台输出和items.json中的中文显示为\uXXXX形式的转义序列(即Export Unicode字符集问题)
8.Scrapy中关于Export Unicode字符集问题解决
(http://blog.csdn.net/peihaozhu/article/details/53022236)
8.1设置/myproject/myproject/settings.py
from scrapy.exporters import JsonLinesItemExporter
class CustomJsonLinesItemExporter(JsonLinesItemExporter):
    """JSON-lines exporter that writes non-ASCII text without escaping it."""

    def __init__(self, file, **kwargs):
        """Create the exporter with ``ensure_ascii`` defaulting to False.

        Args:
            file: Output file object, passed through to the base exporter.
            **kwargs: Extra exporter options, forwarded to the base class.
                An explicit ``ensure_ascii`` from the caller is honoured
                instead of raising a duplicate-keyword TypeError.
        """
        # setdefault (not a second keyword argument) so that a caller who
        # already supplies ensure_ascii does not trigger a TypeError.
        kwargs.setdefault('ensure_ascii', False)
        super(CustomJsonLinesItemExporter, self).__init__(file, **kwargs)
#这里只需要将超类的ensure_ascii属性设置为False即可
#同时要在settings.py文件中启用新的Exporter类
# Route the 'json' feed format to the custom exporter class by dotted path.
FEED_EXPORTERS = {
    'json': 'myproject.settings.CustomJsonLinesItemExporter',
}
8.2再次执行第6步的命令,items.json中的中文即可正常显示(Export Unicode字符集问题解决),items.json位于/myproject目录下