创建一个 Scrapy 项目,该项目的爬虫代码如下:
import scrapy
from selenium import webdriver
from wangyiyunPro.items import WangyiyunproItem #调用item模块
# Scrapy spider whose crawl starts from the URL listed in start_urls.
class WangyiyunSpider(scrapy.Spider):
name = 'wangyiyun'
# allowed_domains = ['www.xxx.com'] # domains the spider is allowed to crawl
start_urls = ['https://news.163.com/']
# Class-level list; parse() appends to it through self.models_urls.
models_urls = [] # URLs scraped from the static home page are stored in this list
def __init__(self, *args, **kwargs):
    """Initialise the spider and start the Chrome browser kept in ``self.bro``.

    Args:
        *args: Positional arguments forwarded unchanged to ``scrapy.Spider``.
        **kwargs: Keyword arguments forwarded unchanged to ``scrapy.Spider``
            (Scrapy passes spider arguments to the constructor this way).
    """
    # Run the base-class initialiser so Scrapy's own setup is not bypassed
    # and so constructing the spider with arguments does not raise TypeError.
    super().__init__(*args, **kwargs)
    # Selenium-driven Chrome instance stored on the spider.
    # NOTE(review): nothing in the visible code quits this browser - confirm
    # that a closed() hook elsewhere calls self.bro.quit().
    # NOTE(review): passing the driver path positionally is version-dependent
    # in Selenium - confirm against the installed Selenium release.
    self.bro = webdriver.Chrome('chromedriver.exe')
def parse(self, response):
    """Collect selected section URLs from the home page and request each one.

    The ``<li>`` nodes of the home-page navigation list are selected, the
    link of each entry at the hard-coded positions is read, stored in
    ``self.models_urls`` and then requested with ``parse_model`` as callback.

    Args:
        response: The downloaded home-page response; only ``xpath`` is used.

    Yields:
        scrapy.Request: One request per URL held in ``self.models_urls``.
    """
    li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
    # Only the entries at these positions of li_list are crawled.
    alist = [3, 4, 6, 7]
    for index in alist:
        # The page layout may change; skip a missing entry instead of
        # aborting the whole callback with IndexError.
        if index >= len(li_list):
            self.logger.warning('section index %d not found on %s', index, getattr(response, 'url', None))
            continue
        model_url = li_list[index].xpath('./a/@href').extract_first()
        # extract_first() returns None when the node has no link; a None
        # URL would make scrapy.Request raise.
        if not model_url:
            self.logger.warning('section index %d has no href', index)
            continue
        # models_urls is shared at class level, so avoid piling up duplicates.
        if model_url not in self.models_urls:
            self.models_urls.append(model_url)
    # Request every collected section page; parse_model handles the response.
    for url in self.models_urls:
        yield scrapy.Request(url, callback=self.parse_model)
def parse_model(self,response):
div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div[1]/div/ul/li/div/div')
for div in div_list:
name = div.xpath('./div/div[1]/h3/a/text()').extract_first() #将标题解析下来