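# When scraping Douyu, inspecting the site shows that the URL does not change
# when moving between pages, so the real request URL can be captured with Fiddler.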
import scrapy
import json
from Douyu.items import DouyuItem
class DouyuSpider(scrapy.Spider):
name = 'douyu'
allowed_domains = ['douyu.com']
offset = 0
url = 'https://www.douyu.com/gapi/rkc/directory/0_0/'
# 'https://www.douyu.com/directory/all/gapi/rkc/directory/0_0/' is the real URL behind the page navigation
"""直接json格式链接"""
"""https://www.douyu.com/gapi/rkc/directory/0_0/0"""
start_urls = ['https://www.douyu.com/gapi/rkc/directory/0_0/0'] #这个是json格式的
def parse(self, response):
json_text = json.loads(response.text)#将字符串格式的转化为json数据
total_data = len(json_text['data']['rl'])
for i in range(0,total_data):