Scrapy: collecting news data from the 巴巴物流 logistics site (babasuper.com)

I won't walk through the configuration of the other project files here; let's go straight to the spider code:

# coding=utf-8
import scrapy
import re
import time
from cn56_net.items import Cn56NetItem

class WlzxSpider(scrapy.Spider):
    name = 'wlzx'
    allowed_domains = ['babasuper.com']
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',
        'LOG_FILE': 'wlzx_log_%s.txt' % time.time(),  # log file settings
        "DEFAULT_REQUEST_HEADERS": {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        },  # request header settings
    }
    # One start URL per news column to crawl
    start_urls = [
        'https://www.babasuper.com/news-2.html',
        'https://www.babasuper.com/news-3.html',
        'https://www.babasuper.com/news-5.html',
        'https://www.babasuper.com/news-6.html',
    ]

    def parse(self, response):
        base_url = 'https://www.babasuper.com'
        # View counts for every article on the list page
        read_amount = response.xpath('//span[@class="list-span2"]//text()').extract()
        # Relative URLs of the article detail pages
        details_url = response.xpath('//ul[@class="list"]//li/a/@href').extract()
        # Name of the news column this list page belongs to
        column = response.xpath('//div[@class="baba-nav-text"]//a[3]/text()').extract_first()
        # Request every detail page, passing the view count and column along in meta
        for url, reads in zip(details_url, read_amount):
            full_details_url = base_url + url
            # Keep only the digits of the view count, e.g. '1,234' -> '1234'
            reads = re.findall(r'(\d+).*', reads.replace(',', ''))[0]
            meta = {
                'read_amount': reads,
                'column': column,
            }
            yield scrapy.Request(url=full_details_url, callback=self.details_infos, meta=meta)
        # Follow the "next page" link; on the last page its href is 'javascript:void(0);'
        next_url = response.xpath('//a//em[contains(text(),"下一页")]/../@href').extract_first()
        if next_url and next_url != 'javascript:void(0);':
            yield scrapy.Request(url=base_url + next_url, callback=self.parse)

    def details_infos(self, response):
        babawuliu = Cn56NetItem()
        infos_meta = response.meta
        read_amount = infos_meta['read_amount']

        title = response.xpath('//h2//text()').extract_first()
        babawuliu['title'] = title
        # The first paragraph holds the publish date and source, e.g. '2018-07-20 ... 来源:xxx'
        infos = response.xpath('//p[@class="article-box-p"]//text()').extract_first()
        publish_time = ''
        source = ''
        if infos:
            found_time = re.findall(r'(\d+-\d+-\d+).*', infos)
            publish_time = found_time[0] if found_time else ''
            found_source = re.findall(r'.*来源:(\w+)', infos)
            source = found_source[0] if found_source else ''
        babawuliu['pub_time'] = publish_time
        babawuliu['source'] = source
        # Keep the article body as raw HTML
        content = response.xpath('//div[@class="article-con"]').extract_first()
        link_url = response.url
        update_time = time.strftime('%Y-%m-%d')
        # Populate the remaining item fields
        babawuliu['content'] = content
        babawuliu['url'] = link_url
        babawuliu['read_amount'] = read_amount
        babawuliu['update_time'] = update_time
        babawuliu['web'] = '叭叭物流'
        babawuliu['create_time'] = time.strftime('%Y-%m-%d')
        babawuliu['column'] = infos_meta['column']
        babawuliu['author'] = ''
        babawuliu['forward_amount'] = 0
        babawuliu['comment_amount'] = 0
        yield babawuliu
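
The spider imports Cn56NetItem from cn56_net.items, but that file isn't shown in this post. For completeness, here is a minimal sketch of what items.py would need to contain, with the field names inferred from the assignments above (the actual project file may define more than this):

# cn56_net/items.py -- minimal sketch, fields inferred from the spider's assignments
import scrapy

class Cn56NetItem(scrapy.Item):
    title = scrapy.Field()
    pub_time = scrapy.Field()
    source = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
    read_amount = scrapy.Field()
    update_time = scrapy.Field()
    web = scrapy.Field()
    create_time = scrapy.Field()
    column = scrapy.Field()
    author = scrapy.Field()
    forward_amount = scrapy.Field()
    comment_amount = scrapy.Field()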

That's the quick version. If you need the complete code, leave a comment and add me as a friend, and I'll share the author's full Scrapy project.
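
To try the spider, run it from the project root with Scrapy's command-line tool; the standard -o option exports the scraped items to a file (the output filename here is just an example):

# Run from the directory containing scrapy.cfg
scrapy crawl wlzx -o babasuper_news.json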
