A complete Scrapy workflow
1. Write the configuration (enable the item pipeline in settings.py)
ITEM_PIPELINES = {
    'test1.pipelines.Test1Pipeline': 300,
}
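The value 300 is the pipeline's order (by convention an integer between 0 and 1000); when several pipelines are enabled, items flow through them from the lowest number to the highest. A minimal sketch, with a second, purely hypothetical pipeline added only to illustrate the ordering:

ITEM_PIPELINES = {
    'test1.pipelines.Test1Pipeline': 300,         # runs first (lower number)
    # 'test1.pipelines.SomeOtherPipeline': 800,   # hypothetical; would receive items after Test1Pipeline
}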
2. Write the items (items.py)
import scrapy


class ItcastItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    name = scrapy.Field()
    tag1 = scrapy.Field()
    tag2 = scrapy.Field()
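An ItcastItem behaves like a dict restricted to the declared fields; assigning an undeclared key raises a KeyError. A quick sketch of that behaviour, using the class above:

item = ItcastItem()
item['title'] = 'some quote'      # fine: 'title' is a declared field
item['name'] = 'some author'
print(dict(item))                 # {'title': 'some quote', 'name': 'some author'}
# item['author'] = 'x'            # would raise KeyError: 'author' is not a declared field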
3. Write the spider
import scrapy
from test1.items import ItcastItem


class ItcastSpider(scrapy.Spider):
    name = "itcast"  # the name that identifies this spider
    # domain scope of the crawl: only pages under this domain are fetched
    allowed_domains = ['lab.scrapyd.cn']
    start_urls = [
        'http://lab.scrapyd.cn/page/1/',
    ]

    # parse the response content and return the parsed, structured data
    def parse(self, response):
        # (debug) dump the raw page: open("teacher.html", "wb").write(response.body)
        items = []
        ls_list = response.xpath("//div[@class='col-mb-12 col-8']/div")
        print(len(ls_list))
        i = 1
        for ls in ls_list:
            item = ItcastItem()
            print('Scraping record {} ----------'.format(i))
            # author
            name = ls.xpath(".//span[2]/small/text()").extract()[0]
            # quote text
            title = ls.xpath(".//span[1]/text()").extract()[0]
            # first two tags
            tag1 = ls.xpath(".//a[1]/text()").extract()[0]
            tag2 = ls.xpath(".//a[2]/text()").extract()[0]
            item["name"] = name
            item["title"] = title
            item["tag1"] = tag1
            item["tag2"] = tag2
            items.append(item)
            i += 1
        return items
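Returning a list of items works, but the more common Scrapy style is to yield each item as soon as it is filled in, so results stream into the pipeline instead of being buffered in memory. A sketch of the same parse method written that way (same XPaths as above):

    # drop-in replacement for the parse() above, inside the same spider class
    def parse(self, response):
        for ls in response.xpath("//div[@class='col-mb-12 col-8']/div"):
            item = ItcastItem()
            item["name"] = ls.xpath(".//span[2]/small/text()").extract_first()
            item["title"] = ls.xpath(".//span[1]/text()").extract_first()
            item["tag1"] = ls.xpath(".//a[1]/text()").extract_first()
            item["tag2"] = ls.xpath(".//a[2]/text()").extract_first()
            yield item  # handed to the pipeline immediately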
4. Write the pipeline (pipelines.py)
import json


class Test1Pipeline(object):
    def open_spider(self, spider):  # runs only once, when the spider is opened
        if spider.name == 'itcast':
            self.f = open('json.txt', 'a', encoding='utf-8')

    def close_spider(self, spider):  # runs only once, when the spider is closed
        if spider.name == 'itcast':
            self.f.close()

    def process_item(self, item, spider):
        # ensure_ascii=False so the Chinese text is written as real characters
        if spider.name == 'itcast':
            self.f.write(json.dumps(dict(item), ensure_ascii=False, indent=2) + ',\n')
        # without the return, pipelines that run after this one would never receive the item
        return item
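Because each record is written with a trailing ',\n', json.txt is not itself a valid JSON document (it is a sequence of objects separated by commas). If you want output that is easy to parse later, a common alternative is the JSON Lines format, one object per line; a sketch of the same pipeline written that way (the filename items.jl is only an illustration):

import json


class Test1Pipeline(object):
    def open_spider(self, spider):
        if spider.name == 'itcast':
            self.f = open('items.jl', 'a', encoding='utf-8')  # one JSON object per line

    def close_spider(self, spider):
        if spider.name == 'itcast':
            self.f.close()

    def process_item(self, item, spider):
        if spider.name == 'itcast':
            self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item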
5. Results
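To produce the results, run the spider from the project root (the directory containing scrapy.cfg). The -o option can additionally let Scrapy's built-in feed export write the items to a file, independently of the custom pipeline above:

scrapy crawl itcast                 # run the spider; Test1Pipeline appends records to json.txt
scrapy crawl itcast -o items.json   # optionally also write a JSON file via the built-in feed export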