1、创建工程
scrapy startproject sinablog
2、修改items.py
import scrapy
class SinablogItem(scrapy.Item):
    """Container for the data scraped from one blog article page."""

    # Title text of the article.
    article_name = scrapy.Field()
    # URL of the article page.
    article_url = scrapy.Field()
3、修改pipelines.py
import json
import codecs
class SinablogPipeline(object):
    """Write every scraped item to a UTF-8 text file, one JSON object per line.

    Args:
        filename: Path of the output file. Defaults to ``sinablog_data.json``
            (relative to the current working directory).
    """

    def __init__(self, filename='sinablog_data.json'):
        # codecs.open() encodes the text passed to write() as UTF-8.
        self.file = codecs.open(filename, mode='wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Append *item* to the output file as a single JSON line.

        Args:
            item: The scraped item; anything accepted by ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, unchanged, so later pipelines can process it.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the file while
        # leaving JSON's own escapes (\\, \", \n, ...) intact. Decoding the
        # dump with "unicode_escape" would also undo those escapes and
        # produce invalid JSON.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        if isinstance(line, bytes):
            # Python 2 returns a UTF-8 byte string when the item holds byte
            # strings; the codecs writer expects text.
            line = line.decode('utf-8')
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file; Scrapy calls this hook when the spider closes."""
        self.file.close()
4、修改settings.py
# Scrapy settings for the sinablog project.
BOT_NAME = 'sinablog'

NEWSPIDER_MODULE = 'sinablog.spiders'
SPIDER_MODULES = ['sinablog.spiders']

# Disable cookies to reduce the chance of being banned.
COOKIES_ENABLED = False

# Item pipelines to enable, each mapped to its order value.
ITEM_PIPELINES = {
    'sinablog.pipelines.SinablogPipeline': 300,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'sinablog (+http://www.yourdomain.com)'
5、在spiders目录下新建sinablog_spider.py
from scrapy.spider import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from sinablog.items import SinablogItem
class SinablogSpider(Spider):
    """Spider that walks a Sina blog one article page at a time.

    For every article page it yields a ``SinablogItem`` holding the article
    title(s) and URL, then requests the article links found in the page's
    ``articalfrontback`` navigation block.
    """

    name = "sinablog"
    # Wait 1 second between requests to slow the crawl down.
    download_delay = 1
    allowed_domains = ["blog.sina.com.cn"]
    start_urls = [
        # URL of the first article.
        "http://blog.sina.com.cn/s/blog_65db99840100h2jj.html",
    ]

    def parse(self, response):
        """Extract one article's title and URL, then follow its article links.

        Args:
            response: The downloaded article page.

        Yields:
            One ``SinablogItem`` for the page, followed by one ``Request``
            (handled by this same method) per extracted link.
        """
        sel = Selector(response)

        # Keep the values as text: encoding is the job of whatever writes the
        # item out, and byte strings are not JSON-serializable on Python 3.
        item = SinablogItem()
        item['article_name'] = sel.xpath('//h2[@class="titName SG_txta"]/text()').extract()
        item['article_url'] = response.url
        yield item

        # NOTE(review): the sample run scrapes the start URL twice; presumably
        # the initial request bypasses the duplicate filter, so a later link
        # back to it is followed once more -- confirm before relying on
        # unique output.
        links = sel.xpath('//div[@class="articalfrontback SG_j_linedot1 clearfix"]/div/a/@href').extract()
        for link in links:
            self.log("Following article link: %s" % link)
            yield Request(link, callback=self.parse)
6、运行爬虫
scrapy crawl sinablog
执行结果(保存在sinablog_data.json文件中)如下:
{"article_name": ["欢迎您在新浪博客安家"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100h2jj.html"}
{"article_name": ["WIN7系统有杂音"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100jpb8.html"}
{"article_name": ["近视手术,想好再做"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100jphn.html"}
{"article_name": ["欢迎您在新浪博客安家"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100h2jj.html"}
{"article_name": ["wxToolBar::Realize()"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100jwav.html"}
{"article_name": ["指针类型转换---小问题害死人"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100jyha.html"}
{"article_name": ["Windows下OverSim和PeerSim的安装"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100k287.html"}
{"article_name": ["C语言读取文件一行以及换行符的问题"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kguj.html"}
{"article_name": ["C语言写一行及写换行符的问题"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100khaf.html"}
{"article_name": ["文本模式读写文件中\r和\n的问题"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kidc.html"}
{"article_name": ["string和char*"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kie9.html"}
{"article_name": ["标准输入输出流是二进制流"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kiem.html"}
{"article_name": ["extern C"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kjr2.html"}
{"article_name": ["opera unite"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kjsd.html"}
{"article_name": ["error C2146: 语法错误 : 缺少“;”"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kkiy.html"}
{"article_name": ["ip地址什么时候要用16位char数组存放"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kkni.html"}
{"article_name": ["LNK2019 MeridianClient.def"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kl2n.html"}
{"article_name": ["无法定位xxx于动态链接库xxx.dll上"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kl5k.html"}
{"article_name": ["关山派出所办证大厅工作时间"], "article_url": "http://blog.sina.com.cn/s/blog_65db99840100kl9b.html"}