创建一个新的项目
$ scrapy startproject jianshu
编辑需要爬取的参数
class JianshuItem(scrapy.Item):
    """Item holding the fields scraped from one jianshu.com listing page."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # post title
    desc = scrapy.Field()  # post abstract / description
    img = scrapy.Field()  # thumbnail image URL
    author = scrapy.Field()  # author nickname
    like_nums = scrapy.Field()  # number of likes
    jsb = scrapy.Field()  # Jianshu coins earned by the post
    ds = scrapy.Field()  # rewards (tips) received by the post
settings文件中配置user-agent等请求头
# Default HTTP headers sent with every request (Scrapy settings.py).
# A desktop-Chrome user agent plus typical browser Accept-* headers so
# the crawler's traffic resembles an ordinary browser session.
DEFAULT_REQUEST_HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.132 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
}
编写爬取代码
class JianShu(scrapy.Spider):
    """Spider that scrapes the jianshu.com home-page post listing.

    Returns a single JianshuItem per page whose fields are parallel
    lists, one entry per post <li>.
    """
    name = 'jianshu'
    allowed_domains = ['jianshu.com']  # restrict crawling to this domain
    start_urls = [
        'https://www.jianshu.com/'
    ]

    def parse(self, response):
        """Extract title/abstract/image/author/likes/coins/reward for each
        post under #list-container and return them bundled in one item.
        """
        sel = scrapy.selector.Selector(response)
        sites = sel.xpath('//div[@id="list-container"]/ul/li')
        item = JianshuItem()
        title_list = []      # post titles
        desc_list = []       # abstracts
        img_list = []        # thumbnail image URLs
        author_list = []     # author nicknames
        like_nums_list = []  # like counts
        jsb_list = []        # Jianshu coin counts
        ds_list = []         # rewards (span may be absent on a post)
        for site in sites:
            title_list.append(site.xpath('div[@class="content"]/a[@class="title"]/text()').extract())
            desc_list.append(site.xpath('div[@class="content"]/p[@class="abstract"]/text()').extract())
            img_list.append(site.xpath('a[@class="wrap-img"]/img/@src').extract())
            author_list.append(
                site.xpath('div[@class="content"]/div[@class="meta"]/a[@class="nickname"]/text()').extract())
            # Extract the meta <span> texts once instead of three times.
            # Index-based access assumes the spans appear in the order
            # [.., coins, likes, reward] -- TODO confirm against the live
            # markup; a layout change silently shifts these fields.
            meta_spans = site.xpath('div[@class="content"]/div[@class="meta"]/span/text()').extract()
            jsb_list.append(meta_spans[1])
            like_nums_list.append(meta_spans[2])
            # The reward span only exists for posts that received rewards,
            # so index 3 can legitimately be missing.
            try:
                ds_list.append(meta_spans[3])
            except IndexError:
                ds_list.append([])  # placeholder keeps the lists parallel
        item['img'] = img_list
        item['title'] = title_list
        item['desc'] = desc_list
        item['author'] = author_list
        item['like_nums'] = like_nums_list
        item['jsb'] = jsb_list
        item['ds'] = ds_list
        return item
运行,保存数据为json并存储到本地
scrapy crawl jianshu -o jian_shu_home.json
图片显示