import scrapy
from sina.items import MovieItem
from scrapy_splash import SplashRequest
import time
import re
class SinaspiderSpider(scrapy.Spider):
    """Crawl the Sina entertainment movie index via ``SplashRequest``.

    Every listing page is requested through ``SplashRequest``; one item is
    emitted per ``ul.tv-list li`` entry, and further listing pages are
    requested for as long as the page exposes a "next page" link.
    """

    name = 'sinaspider'
    allowed_domains = ['ent.sina.com.cn']
    start_urls = ['http://ent.sina.com.cn/ku/movie_search_index.d.html?page=1&cTime=1546971817&pre=next']

    def start_requests(self):
        """Yield one ``SplashRequest`` for each URL in ``start_urls``.

        No callback is given, so responses are handled by ``parse``
        (Scrapy's default callback).
        """
        for url in self.start_urls:
            # `args` is forwarded to Splash unchanged.
            yield SplashRequest(url, args={'images': 0, 'timeout': 3})

    def parse(self, response):
        """Extract items from a listing page and schedule the next page.

        1. Yield a ``{'director': ...}`` dict for every ``ul.tv-list li``
           entry (the value is ``None`` when the selector matches nothing).
        2. If a "next page" link is present, read the page number from its
           ``href``, build the follow-up listing URL and yield a
           ``SplashRequest`` for it; that response is handled by ``parse``
           again.

        Pagination stops when the link is missing or when no page number
        can be found in its ``href``.
        """
        for sel in response.css('ul.tv-list li'):
            director = sel.css('.item-intro.left p:nth-child(3)::text').extract_first()
            yield {'director': director}

        href = response.css('.next-t.nextPage::attr(href)').extract_first()
        if not href:
            return

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        match = re.match(r'.*page=(\d+).*', href)
        if match is None:
            # Without a page number there is nothing to follow; stop
            # paginating instead of failing with AttributeError.
            self.logger.warning('No page number found in next-page href: %r', href)
            return

        # NOTE(review): the number found in the link is incremented by one
        # before being requested -- confirm the link carries the current page
        # number rather than the next one, otherwise pages are skipped.
        next_page = int(match.group(1)) + 1
        # Current Unix time in milliseconds, sent as the `cTime` parameter.
        timestamp = str(int(time.time() * 1000))
        url = (
            'http://ent.sina.com.cn/ku/movie_search_index.d.html'
            '?page={}&cTime={}&pre=next'
        ).format(next_page, timestamp)
        yield SplashRequest(url, args={'images': 0, 'timeout': 3})