Scrapy: crawling logistics content from Baidu Tieba.

Today I'm sharing part of a public-opinion monitoring system I built at work: a Scrapy project that crawls posts from the Baidu Tieba forums of various logistics companies. I don't use Scrapy all that much, so the code may be a bit rough; please bear with me. The code is as follows:
The definitions in the items.py file:

import scrapy


class LouzhuItem(scrapy.Item):
    """Record thread-starter (楼主) information"""
    lzhu_name = scrapy.Field()     # thread starter's name
    lzhu_id = scrapy.Field()       # thread starter's ID
    lzhu_level = scrapy.Field()    # thread starter's level
    title = scrapy.Field()         # post title
    title_id = scrapy.Field()      # ID of the post title
    tieba_name = scrapy.Field()    # name of the tieba (forum)
    lcreate_time = scrapy.Field()  # time the post was created
    tz_url = scrapy.Field()        # post URL
    pages = scrapy.Field()         # total number of pages in the post
    update_time = scrapy.Field()   # time the post record was last updated



class CengzhuItem(scrapy.Item):
    """Record replier (层主) information"""
    czhu_name = scrapy.Field()     # replier's name
    czhu_id = scrapy.Field()       # replier's ID
    czhu_level = scrapy.Field()    # replier's level
    content = scrapy.Field()       # text content of this reply
    content_id = scrapy.Field()    # ID of this reply's content
    tieba_name = scrapy.Field()    # name of the tieba (forum)
    ccreate_time = scrapy.Field()  # time this reply was created
    tiezi_num = scrapy.Field()     # number of replies to the post
    url = scrapy.Field()           # URL of the thread
    update_time = scrapy.Field()   # time the record was last updated
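
Scrapy Items behave like dicts with a fixed set of keys; as a quick illustration (not part of the project), you can check a field assignment in a Python shell:

lou_zhu = LouzhuItem()
lou_zhu['lzhu_name'] = 'some_user'   # fields are read and written like dict keys
print(dict(lou_zhu))                 # {'lzhu_name': 'some_user'}
# lou_zhu['unknown'] = 1             # raises KeyError: only declared Fields are allowed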
I didn't modify the middlewares.py file here; you can simply use the default one.
pipelines.py handles inserting the data into the database; the code is as follows:
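
If the spider ever starts getting blocked (the settings section below mentions 403 responses), one option is a small downloader middleware that rotates the User-Agent header. This is only a sketch of my own, not part of the original project:

import random

class RandomUserAgentMiddleware(object):
    """Assign a random User-Agent to each outgoing request (hypothetical helper)."""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None     # let Scrapy continue handling the request normally

It would then be enabled in settings.py via DOWNLOADER_MIDDLEWARES, e.g. {'cn56_net.middlewares.RandomUserAgentMiddleware': 543} (the module path assumes the standard project layout).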
import re
import time
import json
import sqlalchemy
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from cn56_net.items import Cn56NetItem,LouzhuItem,CengzhuItem
from cn56_net.manage_db.models import LouZhuInfo, CengZhuInfo, News

# engine = create_engine("mysql+pymysql://root:root@ip:3306/test?charset=utf8")
engine = create_engine("mysql+pymysql://root:@ip:3306/test?charset=utf8")
Session = sessionmaker(bind=engine)


class MysqlPipeline(object):
    def __init__(self):
        self.session = Session()

    @staticmethod
    def filter_emoji(desstr, restr=''):
        # Filter out emoji (not actually used here; on Python 2 this approach can be used)
        try:
            co = re.compile('[\U00010000-\U0010ffff]')
        except re.error:
            co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return co.sub(restr, desstr)

    def process_item(self, item, spider):
        if isinstance(item, LouzhuItem):
            # safe_name = self.filter_emoji(item['lzhu_name'])
            # safe_title = self.filter_emoji(item['title'])
            safe_name = item['lzhu_name']
            safe_title = item['title']
            add_lz = LouZhuInfo(lz_name=safe_name,
                                lz_id=item['lzhu_id'],
                                lz_level=item['lzhu_level'],
                                title=safe_title,
                                tieba_name=item['tieba_name'],
                                create_time=item['lcreate_time'],
                                url=item['tz_url'],
                                pages=item['pages'],
                                title_id=item['title_id'],
                                update_time = item['update_time'])
            self.session.add(add_lz)
            self.session.commit()
            self.session.close()
        elif isinstance(item, CengzhuItem):
            # safe_name = self.filter_emoji(item['czhu_name'])
            safe_name = item['czhu_name']
            # safe_content = self.filter_emoji(item['content'])
            safe_content = item['content']
            add_cz = CengZhuInfo(czhu_name=safe_name,
                                 czhu_id=item['czhu_id'],
                                 czhu_level=item['czhu_level'],
                                 content=safe_content,
                                 content_id = item['content_id'],
                                 tieba_name=item['tieba_name'],
                                 ccreate_time=item['ccreate_time'],
                                 tiezi_num=item['tiezi_num'],
                                 url=item['url'],
                                 update_time=item['update_time'])
            self.session.add(add_cz)
            self.session.commit()
            self.session.close()
        elif isinstance(item, Cn56NetItem):
            query = self.session.query(News)
            r = query.filter_by(url=item['url'], pub_time=item["pub_time"]).first()
            if r:
                print ("\n\n", '*'*10, 'NewsItem no changes, update', '*'*10)
                pass
            else:
                create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                add_news = News(source=item["source"],
                                title=item["title"],
                                web=item["web"],
                                column=item["column"],
                                author=item["author"],
                                content=item["content"],
                                forward_amount=item["forward_amount"],
                                comment_amount=item["comment_amount"],
                                read_amount=item["read_amount"],
                                url=item["url"],
                                pub_time=item["pub_time"],
                                create_time=create_time,
                                update_time=item['update_time'])
                self.session.add(add_news)
                self.session.commit()
                self.session.close()
        return item
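
The models imported from cn56_net.manage_db.models are not shown in this post. As a rough sketch (my own assumption, not the author's actual file), the declarative models might look roughly like this, with column names matching the keyword arguments used in the pipeline above:

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class LouZhuInfo(Base):
    """Thread starter (楼主) table"""
    __tablename__ = 'louzhu_info'          # table name is assumed
    id = Column(Integer, primary_key=True, autoincrement=True)
    lz_name = Column(String(64))
    lz_id = Column(String(32))
    lz_level = Column(String(8))
    title = Column(String(255))
    title_id = Column(String(32))
    tieba_name = Column(String(64))
    create_time = Column(String(32))
    url = Column(String(255))
    pages = Column(String(8))
    update_time = Column(String(32))


class CengZhuInfo(Base):
    """Replier (层主) table; the News model follows the same pattern with the fields used above"""
    __tablename__ = 'cengzhu_info'         # table name is assumed
    id = Column(Integer, primary_key=True, autoincrement=True)
    czhu_name = Column(String(64))
    czhu_id = Column(String(32))
    czhu_level = Column(String(8))
    content = Column(Text)
    content_id = Column(String(32))
    tieba_name = Column(String(64))
    ccreate_time = Column(String(32))
    tiezi_num = Column(String(8))
    url = Column(String(255))
    update_time = Column(String(32))

# Base.metadata.create_all(engine)   # run once against the engine above to create the tables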
The settings that need to be changed in settings.py:

'''Throttle the crawl: delay 1 second between requests'''
DOWNLOAD_DELAY = 1

ITEM_PIPELINES = {
   # 'cn56_net.pipelines.Cn56NetPipeline': 300,
    'cn56_net.pipelines.MysqlPipeline': 300,
}

# MySQL connection info added to the settings
MYSQL_HOST='IP'
MYSQL_DBNAME='test'
MYSQL_USER='root'
MYSQL_PASSWORD='root'

# Content I added myself
HTTPERROR_ALLOWED_CODES = [403]  # the error reported was 403, so 403 is whitelisted here


# Shut the spider down on a timer
CLOSESPIDER_TIMEOUT = 82800  # stop the spider after 23 hours
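
Note that the MYSQL_* values above are defined in settings.py, but the pipeline shown earlier hard-codes the connection string. If you prefer to build the engine from the settings instead, a minimal sketch (my own suggestion, not the original code) would be:

from scrapy.utils.project import get_project_settings
from sqlalchemy import create_engine

settings = get_project_settings()
engine = create_engine(
    'mysql+pymysql://{user}:{pwd}@{host}:3306/{db}?charset=utf8'.format(
        user=settings.get('MYSQL_USER'),
        pwd=settings.get('MYSQL_PASSWORD'),
        host=settings.get('MYSQL_HOST'),
        db=settings.get('MYSQL_DBNAME'),
    )
)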

With the above in place, we can move on to the spider. The code of the spiders.py file is as follows:

# -*- coding: utf-8 -*-
import scrapy
import time
import demjson
import re
from cn56_net.items import LouzhuItem, CengzhuItem


class TiebaSpider(scrapy.Spider):
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',
        'LOG_FILE': 'tieba_log_%s.txt' % time.time(),
    }
    start_urls = ['http://tieba.baidu.com/f?ie=utf-8&kw=中通&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=德邦物流&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=圆通&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=申通&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=韵达&fr=search',
                  'https://tieba.baidu.com/f?ie=utf-8&kw=顺丰&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=天天快递&fr=search',
                  'https://tieba.baidu.com/f?ie=utf-8&kw=ems&fr=search',
                  'https://tieba.baidu.com/f?ie=utf-8&kw=百事汇通&fr=search',]

    def parse(self, response):
        # Name of the tieba (forum)
        tieba_name1 = response.xpath('//div[@class="card_title"]/a/text()').extract_first().strip()
        tieba_name = {
            'tieba_name': tieba_name1
        }
        # Collect the URLs of the individual threads
        details_url = response.xpath('//div[@class="threadlist_title pull_left j_th_tit "]//a[@rel="noreferrer"]/@href').extract()
        for each_url in details_url:
            infos_url = 'http://tieba.baidu.com' + each_url
            yield scrapy.Request(url=infos_url, callback=self.details_infos, meta=tieba_name)

        # Get the URL of the next page
        next_url = response.xpath('//a[@class="next pagination-item "]/@href').extract_first()
        self.logger.info('Next page URL: %s' % next_url)
        if next_url:
            next_url = 'http:' + next_url
            yield scrapy.Request(url=next_url, callback=self.parse)

    def details_infos(self, details_response):
        tieba_meta = details_response.meta
        # Name of the tieba (forum)
        pre_tieba_name = tieba_meta['tieba_name']
        print(pre_tieba_name)
        lou_zhu = LouzhuItem()
        ceng_zhu = CengzhuItem()
        # Extract the thread starter's (楼主) information
        # lou_zhu_infos = details_response.xpath('//div[@class="l_post j_l_post l_post_bright noborder "]/@data-field').extract_first()
        lou_zhu_infos = details_response.xpath('//div[@id="j_p_postlist"]/div[1]/@data-field').extract_first()
        try:
            lou_zhu_infos = demjson.decode(lou_zhu_infos)
        except BaseException:
            self.logger.exception("demjson.decode error: %s" % details_response.url)
            return
        # Thread starter's ID
        lou_zhu_id = lou_zhu_infos['author']
        lou_zhu_id = lou_zhu_id.get('user_id')
        # Thread starter's name
        lou_zhu_name = lou_zhu_infos['author']
        lou_zhu_name = lou_zhu_name.get('user_name')
        # Thread starter's level
        lou_zhu_level = lou_zhu_infos['author']
        lou_zhu_level = lou_zhu_level.get('level_id')
        # Title of the thread posted by the thread starter
        # title = details_response.xpath('//h1//text()').extract_first()
        title = details_response.xpath('//div[@id="j_core_title_wrap"]//h3/text()|//div[@id="j_core_title_wrap"]//h1/text()').extract_first()
        # ID of the title content
        content = lou_zhu_infos['content']
        title_id = content.get('post_id')
        # Creation time of the thread
        create_time = lou_zhu_infos['content']
        create_time = create_time.get('date')
        # Total number of pages of replies in the thread
        lou_zhu_page = details_response.xpath('//span[@class="red"]//text()').extract()
        lou_zhu_page = lou_zhu_page[-1]
        lou_zhu_url = details_response.url
        '''Populate the thread starter item'''
        lou_zhu['lzhu_name'] = lou_zhu_name
        lou_zhu['lzhu_id'] = lou_zhu_id
        lou_zhu['lzhu_level'] = lou_zhu_level
        lou_zhu['title'] = title
        lou_zhu['title_id'] = title_id
        lou_zhu['lcreate_time'] = create_time
        lou_zhu['tieba_name'] = pre_tieba_name
        lou_zhu['pages'] = lou_zhu_page
        lou_zhu['tz_url'] = lou_zhu_url
        lou_zhu['update_time'] = time.strftime('%Y-%m-%d')
        yield lou_zhu

        # Extract information from the replies
        reply_infos = details_response.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
        if reply_infos:
            for each_cengzhu in reply_infos:
                # Get each replier's (层主) info
                ceng_zhu_infos = each_cengzhu.xpath('./@data-field').extract_first()
                ceng_zhu_infos = demjson.decode(ceng_zhu_infos)
                # Replier's name
                ceng_zhu_name = ceng_zhu_infos['author']
                ceng_zhu_name = ceng_zhu_name.get('user_name') if ceng_zhu_name else ''
                # Replier's ID
                ceng_zhu_id = ceng_zhu_infos['author']
                ceng_zhu_id = ceng_zhu_id.get('user_id') if ceng_zhu_id else ''
                # Replier's level
                ceng_zhu_level = ceng_zhu_infos['author']
                ceng_zhu_level = ceng_zhu_level.get('level_id')
                # Number of comments on this reply
                ceng_zhu_comment_num = ceng_zhu_infos['content']
                ceng_zhu_comment_num = ceng_zhu_comment_num.get('comment_num')
                # Text content of the reply
                ceng_zhu_content = each_cengzhu.xpath('.//div[@class="d_post_content j_d_post_content  clearfix"]//text()').extract_first('')
                ceng_zhu_content = ceng_zhu_content.strip()     # strip surrounding whitespace
                # ID of the content
                content_id = ceng_zhu_infos['content']
                content_id = content_id.get('post_id')
                # Creation time of the reply
                ceng_zhu_time = ceng_zhu_infos['content']
                ceng_zhu_time = ceng_zhu_time.get('date')
                # URL of the thread
                ceng_zhu_adress = details_response.url
                '''Populate the replier item'''
                ceng_zhu['tieba_name'] = pre_tieba_name
                ceng_zhu['czhu_name'] = ceng_zhu_name
                ceng_zhu['czhu_id'] = ceng_zhu_id
                ceng_zhu['czhu_level'] = ceng_zhu_level
                ceng_zhu['content'] = ceng_zhu_content
                ceng_zhu['content_id'] = content_id
                ceng_zhu['ccreate_time'] = ceng_zhu_time
                ceng_zhu['tiezi_num'] = ceng_zhu_comment_num
                ceng_zhu['url'] = ceng_zhu_adress
                ceng_zhu['update_time'] = time.strftime('%Y-%m-%d')

                yield ceng_zhu
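
Since the spider above is named 'tieba', the whole project can be started from the project root in the usual way (demjson, used to parse the data-field attribute, may need to be installed first, e.g. with pip install demjson):

scrapy crawl tieba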

That's the end of the code share. My use of Scrapy may still be a bit rough, since I don't get to use it much at work, but I'll keep studying and improving.
