I won't go over the configuration of the other project files here; let's go straight to the spider code:
# coding=utf-8
import re
import time

import scrapy

from cn56_net.items import Cn56NetItem


class WlzxSpider(scrapy.Spider):
    name = 'wlzx'
    allowed_domains = ['babasuper.com']
    custom_settings = {
        # Logging: write DEBUG-level output to a timestamped file
        'LOG_LEVEL': 'DEBUG',
        'LOG_FILE': 'wlzx_log_%s.txt' % time.time(),
        # Default request headers: impersonate a desktop Chrome browser
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        },
    }
    start_urls = [
        'https://www.babasuper.com/news-2.html',
        'https://www.babasuper.com/news-3.html',
        'https://www.babasuper.com/news-5.html',
        'https://www.babasuper.com/news-6.html',
    ]

    def parse(self, response):
        base_url = 'https://www.babasuper.com'
        # View counts of all articles on this list page
        read_amount = response.xpath('//span[@class="list-span2"]//text()').extract()
        # Relative URLs of the detail pages
        details_url = response.xpath('//ul[@class="list"]//li/a/@href').extract()
        # News column (section) name, taken from the breadcrumb
        column = response.xpath('//div[@class="baba-nav-text"]//a[3]/text()').extract_first()
        # Request every detail page, passing its view count and column along in meta
        for url, each_read in zip(details_url, read_amount):
            full_details_url = base_url + url
            # Drop thousands separators, then keep the leading digits,
            # e.g. '1,234次' -> '1234'
            each_read = re.findall(r'(\d+).*', each_read.replace(',', ''))[0]
            meta = {
                'read_amount': each_read,
                'column': column,
            }
            yield scrapy.Request(url=full_details_url, callback=self.details_infos, meta=meta)
        # Follow the "下一页" (next page) link; on the last page its href
        # degrades to 'javascript:void(0);'
        next_url = response.xpath('//a//em[contains(text(),"下一页")]/../@href').extract_first()
        if next_url and next_url != 'javascript:void(0);':
            yield scrapy.Request(url=base_url + next_url, callback=self.parse)

    def details_infos(self, response):
        babawuliu = Cn56NetItem()
        infos_meta = response.meta
        babawuliu['title'] = response.xpath('//h2//text()').extract_first()
        # The info line carries the publish date and the source,
        # e.g. '2018-07-20  来源:xxx'
        infos = response.xpath('//p[@class="article-box-p"]//text()').extract_first()
        if infos:
            publish_time = re.findall(r'(\d+-\d+-\d+).*', infos)
            babawuliu['pub_time'] = publish_time[0] if publish_time else ''
            source = re.findall(r'.*来源:(\w+)', infos)
            babawuliu['source'] = source[0] if source else ''
        # Assemble the rest of the item
        babawuliu['content'] = response.xpath('//div[@class="article-con"]').extract_first()
        babawuliu['url'] = response.url
        babawuliu['read_amount'] = infos_meta['read_amount']
        babawuliu['update_time'] = time.strftime('%Y-%m-%d')
        babawuliu['web'] = '叭叭物流'
        babawuliu['create_time'] = time.strftime('%Y-%m-%d')
        babawuliu['column'] = infos_meta['column']
        babawuliu['author'] = ''
        babawuliu['forward_amount'] = 0
        babawuliu['comment_amount'] = 0
        yield babawuliu
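Since the other project files are skipped above, here is a minimal sketch of what cn56_net/items.py would need to declare for the field assignments in the spider to work. The field names are taken directly from the spider code; the file layout itself is an assumption, not the author's actual project file:

# coding=utf-8
# Hypothetical cn56_net/items.py — field names mirror the assignments in WlzxSpider
import scrapy


class Cn56NetItem(scrapy.Item):
    title = scrapy.Field()           # article title
    pub_time = scrapy.Field()        # publish date parsed from the info line
    source = scrapy.Field()          # source parsed after '来源:'
    content = scrapy.Field()         # raw HTML of the article body
    url = scrapy.Field()             # detail-page URL
    read_amount = scrapy.Field()     # view count carried over via meta
    update_time = scrapy.Field()     # date the record was scraped
    web = scrapy.Field()             # site name, fixed to '叭叭物流'
    create_time = scrapy.Field()     # date the record was created
    column = scrapy.Field()          # news column carried over via meta
    author = scrapy.Field()          # left empty by this spider
    forward_amount = scrapy.Field()  # repost count, fixed to 0
    comment_amount = scrapy.Field()  # comment count, fixed to 0

With an item class like this in place, the spider can be started from the project root with "scrapy crawl wlzx", and the DEBUG log ends up in the timestamped wlzx_log_*.txt file configured in custom_settings.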
That's the simple version of my share. If you need the complete code, leave a comment and add me as a friend, and I'll provide my complete Scrapy project.