scrapy框架使用
安装
使用
scrapy startproject csdn
- 创建爬虫,进入项目目录执行
scrapy genspider csdn_spider csdn.net
- 编辑items.py文件,添加需要获取的字段属性
import scrapy
class CsdnItem(scrapy.Item):
    """Container for one scraped CSDN article."""

    # Article headline text.
    title = scrapy.Field()
    # URL of the article.
    href = scrapy.Field()
- 编辑爬虫文件csdn_spider.py
import scrapy
from csdn.items import CsdnItem
class CsdnSpiderSpider(scrapy.Spider):
    """Spider that collects article titles and links from the CSDN news feed."""

    # Unique name used by `scrapy crawl csdn_spider`.
    name = 'csdn_spider'
    # Only follow links within this domain (subdomains included).
    allowed_domains = ['csdn.net']
    # Entry-point page to scrape.
    start_urls = ['http://blog.csdn.net/nav/news']

    def parse(self, response):
        """Yield one CsdnItem per article anchor found on the page.

        First selects every div with class "list_con", then the nested
        div with class "title oneline" that holds the article link.
        """
        body = response.xpath('//div[@class="list_con"]//div[@class="title oneline"]')
        for value in body:
            item = CsdnItem()
            try:
                # Indexing with [0] raises IndexError when the <a> tag is
                # missing; catch only that so real bugs stay visible
                # instead of being swallowed by a broad `except Exception`.
                item['title'] = value.xpath('./h2/a/text()')[0].extract().strip()
                item['href'] = value.xpath('./h2/a/@href')[0].extract()
            except IndexError as e:
                print(e)
            else:
                print(item['title'] + ':' + item['href'])
                yield item
- 保存爬取的内容,编辑settings.py文件
取消注释:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'csdn.pipelines.CsdnPipeline': 300,
#}
变成:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'csdn.pipelines.CsdnPipeline': 300,
}
之后编辑pipelines.py文件
class CsdnPipeline(object):
    """Append every scraped item to a local text file."""

    def process_item(self, item, spider):
        """Write the item's title and href to csdn.txt, one per line.

        Returns the item, as Scrapy's pipeline contract requires: a
        pipeline must return the item (or raise DropItem) so that any
        later pipelines keep receiving it.
        """
        # utf-8 is explicit so Chinese titles are written correctly
        # regardless of the platform's default encoding.
        with open('csdn.txt', 'a', encoding='utf-8') as fp:
            fp.write('{0} {1}\n'.format(item['title'], item['href']))
        return item
- 添加启动脚本
# Launch helper: runs the crawler in-process, so it can be started
# (and debugged) from an IDE instead of the command line.
from scrapy.cmdline import execute

# Equivalent to running `scrapy crawl csdn_spider --nolog` in a shell.
execute(['scrapy', 'crawl', 'csdn_spider', '--nolog'])