方法一:用页面循环去做
抛弃 start_urls,通过重写自带方法 start_requests 实现功能
# -*- coding: utf-8 -*-
import scrapy
class ExampleSpider(scrapy.Spider):
    """Method 1: paginate by generating every listing-page request up front.

    ``start_urls`` is abandoned; ``start_requests`` is overridden instead so
    the spider emits one request per listing page. Each listing page is
    handled by :meth:`parse`, and each book's reading page by
    :meth:`get_content`.
    """

    name = 'example'
    # allowed_domains = ['ffffffff']
    # start_urls = ['http://www.quanshuwang.com/list/4_1.html']

    def start_requests(self):
        """Yield a request for every listing page.

        No ``callback`` is given, so Scrapy routes the responses to
        :meth:`parse` by default.
        """
        for page in range(1, 1332):
            yield scrapy.Request(url='http://www.quanshuwang.com/list/4_{}.html'.format(page))

    def get_content(self, response):
        """Extract the book text and attach it to the item built in parse().

        Yields the completed item dict (title/author/image_url/content).
        """
        items = response.meta['items']
        details = response.xpath('//div[@id="waa"]/text()').extract()
        # Strip newlines, spaces and non-breaking spaces in one C-level pass,
        # then join once — replaces the original quadratic `+=` accumulation
        # and the redundant `.__str__()` on values that are already str.
        cleaner = str.maketrans('', '', '\n \xa0')
        items['content'] = ''.join(part.translate(cleaner) for part in details)
        yield items

    def parse(self, response):
        """Collect book metadata from one listing page and follow each book.

        For every book found, a partial item dict is passed through
        ``meta['items']`` to :meth:`get_content`, which fills in the content.
        """
        title_list = response.xpath('//ul[@class="seeWell cf"]/li/span[@class="l"]/a/@title').extract()
        author_list = response.xpath('//span[@class="l"]/a[2]/text()').extract()
        image_url_list = response.xpath('//a[@class="l mr10"]/img/@src').extract()
        content_url_list = response.xpath('//a[@class="readTo"]/@href').extract()
        # zip truncates to the shortest list, so a page with a missing field
        # never raises — rows simply pair up positionally.
        for title, author, image_url, content_url in zip(title_list, author_list, image_url_list, content_url_list):
            items = {'title': title, 'author': author, 'image_url': image_url}
            yield scrapy.Request(url=content_url, callback=self.get_content, meta={'items': items})
方法二:通过page进行循环
# -*- coding: utf-8 -*-
import scrapy
class ExampleSpider(scrapy.Spider):
    """Method 2: start from page 1, then enqueue further pages by number.

    ``start_urls`` seeds the first listing page; :meth:`parse` scrapes it and
    yields requests for the remaining numbered pages (Scrapy's duplicate
    filter drops pages already seen).
    """

    name = 'example'
    # allowed_domains = ['ffffffff']
    start_urls = ['http://www.quanshuwang.com/list/4_1.html']

    def get_content(self, response):
        """Extract the book text and attach it to the item built in parse().

        Yields the completed item dict (title/author/image_url/content).
        """
        items = response.meta['items']
        details = response.xpath('//div[@id="waa"]/text()').extract()
        # Strip newlines, spaces and non-breaking spaces in one C-level pass,
        # then join once — replaces the original quadratic `+=` accumulation
        # and the redundant `.__str__()` on values that are already str.
        cleaner = str.maketrans('', '', '\n \xa0')
        items['content'] = ''.join(part.translate(cleaner) for part in details)
        yield items

    def parse(self, response):
        """Scrape the current listing page, then enqueue the remaining pages.

        BUG FIX: the original nested all of the extraction below inside the
        pagination loop, so every iteration re-extracted and re-yielded every
        item request; the loop only needs to emit next listing-page URLs.
        """
        title_list = response.xpath('//ul[@class="seeWell cf"]/li/span[@class="l"]/a/@title').extract()
        author_list = response.xpath('//span[@class="l"]/a[2]/text()').extract()
        image_url_list = response.xpath('//a[@class="l mr10"]/img/@src').extract()
        content_url_list = response.xpath('//a[@class="readTo"]/@href').extract()
        for title, author, image_url, content_url in zip(title_list, author_list, image_url_list, content_url_list):
            items = {'title': title, 'author': author, 'image_url': image_url}
            yield scrapy.Request(url=content_url, callback=self.get_content, meta={'items': items})
        # NOTE(review): range(2, 3) covers only page 2 — presumably truncated
        # for the demo; widen the range to crawl the full listing.
        for page in range(2, 3):
            yield scrapy.Request(url='http://www.quanshuwang.com/list/4_{}.html'.format(page))
方法三:在每个页面有自己的跳转键(跳转到下一个页面)
# -*- coding: utf-8 -*-
import scrapy
class ExampleSpider(scrapy.Spider):
    """Method 3: follow each page's own "next" link until none is present.

    ``start_urls`` seeds the first listing page; :meth:`parse` scrapes it and
    then chases the page's next-page anchor, so pagination ends naturally when
    the last page has no such link.
    """

    name = 'example'
    # allowed_domains = ['ffffffff']
    start_urls = ['http://www.quanshuwang.com/list/4_1.html']

    def get_content(self, response):
        """Extract the book text and attach it to the item built in parse().

        Yields the completed item dict (title/author/image_url/content).
        """
        items = response.meta['items']
        details = response.xpath('//div[@id="waa"]/text()').extract()
        # Strip newlines, spaces and non-breaking spaces in one C-level pass,
        # then join once — replaces the original quadratic `+=` accumulation
        # and the redundant `.__str__()` on values that are already str.
        cleaner = str.maketrans('', '', '\n \xa0')
        items['content'] = ''.join(part.translate(cleaner) for part in details)
        yield items

    def parse(self, response):
        """Scrape one listing page, then follow its "next page" link if any."""
        title_list = response.xpath('//ul[@class="seeWell cf"]/li/span[@class="l"]/a/@title').extract()
        author_list = response.xpath('//span[@class="l"]/a[2]/text()').extract()
        image_url_list = response.xpath('//a[@class="l mr10"]/img/@src').extract()
        content_url_list = response.xpath('//a[@class="readTo"]/@href').extract()
        for title, author, image_url, content_url in zip(title_list, author_list, image_url_list, content_url_list):
            items = {'title': title, 'author': author, 'image_url': image_url}
            yield scrapy.Request(url=content_url, callback=self.get_content, meta={'items': items})
        next_url = response.xpath('//a[@class="next"]/@href').extract()
        # Spider logger instead of bare print — respects Scrapy's log settings.
        self.logger.debug('next page link: %s', next_url)
        if next_url:
            # urljoin makes the request correct whether the scraped href is
            # absolute or relative to the current page.
            yield scrapy.Request(url=response.urljoin(next_url[0]))
源码链接
链接:https://pan.baidu.com/s/1X7Tnyvd9wq8G1BhSNQC8ag 密码:oz8h