Today I'm sharing the public-opinion monitoring system I built at work: a Scrapy crawler that collects posts from the Baidu Tieba forums of various logistics companies. I don't use Scrapy all that much, so the code may be a bit rough; bear with me. The code follows below.
First, the items.py file:
import scrapy

class LouzhuItem(scrapy.Item):
    """Original poster (OP) info"""
    lzhu_name = scrapy.Field()     # OP's name
    lzhu_id = scrapy.Field()       # OP's ID
    lzhu_level = scrapy.Field()    # OP's level
    title = scrapy.Field()         # thread title
    title_id = scrapy.Field()      # thread title ID
    tieba_name = scrapy.Field()    # name of the tieba (forum)
    lcreate_time = scrapy.Field()  # thread creation time
    tz_url = scrapy.Field()        # thread URL
    pages = scrapy.Field()         # total number of pages in the thread
    update_time = scrapy.Field()   # time the record was scraped/updated

class CengzhuItem(scrapy.Item):
    """Reply (floor) poster info"""
    czhu_name = scrapy.Field()     # replier's name
    czhu_id = scrapy.Field()       # replier's ID
    czhu_level = scrapy.Field()    # replier's level
    content = scrapy.Field()       # text content of the floor
    content_id = scrapy.Field()    # ID of the floor's content
    tieba_name = scrapy.Field()    # name of the tieba (forum)
    ccreate_time = scrapy.Field()  # floor creation time
    tiezi_num = scrapy.Field()     # number of comments on the floor
    url = scrapy.Field()           # thread URL
    update_time = scrapy.Field()   # time the record was scraped/updated
No changes were made to the middlewares.py file here; the default one works as-is.
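That said, Tieba does sometimes block scrapers (the settings below whitelist 403 responses for exactly that reason). If blocking becomes a problem, a downloader middleware that rotates the User-Agent header is a common remedy. A minimal sketch; the RandomUserAgentMiddleware name and the USER_AGENTS list are my own illustration, not part of the original project:

import random

# Hypothetical UA pool -- substitute whatever strings you prefer
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
]

class RandomUserAgentMiddleware(object):
    """Attach a randomly chosen User-Agent to every outgoing request."""
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

To enable it, register it in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'cn56_net.middlewares.RandomUserAgentMiddleware': 543}.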
The pipelines.py file handles inserting the scraped data into MySQL; here is the code:
import re
import time
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from cn56_net.items import Cn56NetItem, LouzhuItem, CengzhuItem
from cn56_net.manage_db.models import LouZhuInfo, CengZhuInfo, News

# engine = create_engine("mysql+pymysql://root:root@ip:3306/test?charset=utf8")
engine = create_engine("mysql+pymysql://root:@ip:3306/test?charset=utf8")
Session = sessionmaker(bind=engine)

class MysqlPipeline(object):
    def __init__(self):
        self.session = Session()

    @staticmethod
    def filter_emoji(desstr, restr=''):
        # Strip emoji; unused here, but the fallback pattern also works on Python 2
        try:
            co = re.compile('[\U00010000-\U0010ffff]')
        except re.error:
            co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return co.sub(restr, desstr)

    def process_item(self, item, spider):
        if isinstance(item, LouzhuItem):
            # safe_name = self.filter_emoji(item['lzhu_name'])
            # safe_title = self.filter_emoji(item['title'])
            safe_name = item['lzhu_name']
            safe_title = item['title']
            add_lz = LouZhuInfo(lz_name=safe_name,
                                lz_id=item['lzhu_id'],
                                lz_level=item['lzhu_level'],
                                title=safe_title,
                                tieba_name=item['tieba_name'],
                                create_time=item['lcreate_time'],
                                url=item['tz_url'],
                                pages=item['pages'],
                                title_id=item['title_id'],
                                update_time=item['update_time'])
            self.session.add(add_lz)
            self.session.commit()
            self.session.close()
        elif isinstance(item, CengzhuItem):
            # safe_name = self.filter_emoji(item['czhu_name'])
            safe_name = item['czhu_name']
            # safe_content = self.filter_emoji(item['content'])
            safe_content = item['content']
            add_cz = CengZhuInfo(czhu_name=safe_name,
                                 czhu_id=item['czhu_id'],
                                 czhu_level=item['czhu_level'],
                                 content=safe_content,
                                 content_id=item['content_id'],
                                 tieba_name=item['tieba_name'],
                                 ccreate_time=item['ccreate_time'],
                                 tiezi_num=item['tiezi_num'],
                                 url=item['url'],
                                 update_time=item['update_time'])
            self.session.add(add_cz)
            self.session.commit()
            self.session.close()
        elif isinstance(item, Cn56NetItem):
            # Skip news records already stored (matched on URL + publish time)
            query = self.session.query(News)
            r = query.filter_by(url=item['url'], pub_time=item['pub_time']).first()
            if r:
                print("\n\n", '*' * 10, 'NewsItem already stored, skipping', '*' * 10)
            else:
                create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                add_news = News(source=item['source'],
                                title=item['title'],
                                web=item['web'],
                                column=item['column'],
                                author=item['author'],
                                content=item['content'],
                                forward_amount=item['forward_amount'],
                                comment_amount=item['comment_amount'],
                                read_amount=item['read_amount'],
                                url=item['url'],
                                pub_time=item['pub_time'],
                                create_time=create_time,
                                update_time=item['update_time'])
                self.session.add(add_news)
                self.session.commit()
                self.session.close()
        return item
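The LouZhuInfo, CengZhuInfo, and News models imported above live in cn56_net/manage_db/models.py, which the original post doesn't show. For reference, here is a sketch of what LouZhuInfo might look like as a SQLAlchemy declarative model; the table name, column types, and lengths are my guesses, not the project's actual schema:

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class LouZhuInfo(Base):
    """OP records; CengZhuInfo and News would follow the same pattern."""
    __tablename__ = 'louzhu_info'  # hypothetical table name
    id = Column(Integer, primary_key=True, autoincrement=True)
    lz_name = Column(String(64))
    lz_id = Column(String(32))
    lz_level = Column(String(8))
    title = Column(Text)
    title_id = Column(String(32))
    tieba_name = Column(String(64))
    create_time = Column(String(32))
    url = Column(String(255))
    pages = Column(String(8))
    update_time = Column(String(32))

The tables can then be created once with Base.metadata.create_all(engine) from a small init script.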
The settings.py file needs the following changes:
# Throttle the crawl: one request per second
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
    # 'cn56_net.pipelines.Cn56NetPipeline': 300,
    'cn56_net.pipelines.MysqlPipeline': 300,
}
# MySQL settings
MYSQL_HOST='IP'
MYSQL_DBNAME='test'
MYSQL_USER='root'
MYSQL_PASSWORD='root'
# Added by hand:
HTTPERROR_ALLOWED_CODES = [403]  # the crawl was reporting 403, so 403 is let through; add whatever status you actually hit
# Shut the crawler down on a schedule
CLOSESPIDER_TIMEOUT = 82800  # stop the spider after 23 hours
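One thing to note: as written, the MYSQL_* settings above are never actually read, because pipelines.py hardcodes its own connection string. To have the pipeline build the connection from settings.py instead, Scrapy's from_crawler hook is the usual approach. A sketch, reusing the create_engine/sessionmaker imports already at the top of pipelines.py (the URL template is my own):

class MysqlPipeline(object):
    def __init__(self, mysql_url):
        engine = create_engine(mysql_url)
        self.session = sessionmaker(bind=engine)()

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        # Assemble the connection URL from the MYSQL_* settings above
        url = "mysql+pymysql://{u}:{p}@{h}:3306/{db}?charset=utf8".format(
            u=s.get('MYSQL_USER'), p=s.get('MYSQL_PASSWORD'),
            h=s.get('MYSQL_HOST'), db=s.get('MYSQL_DBNAME'))
        return cls(url)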
With everything above in place, the spider itself (spiders.py) comes next:
# -*- coding: utf-8 -*-
import time

import demjson
import scrapy

from cn56_net.items import LouzhuItem, CengzhuItem

class TiebaSpider(scrapy.Spider):
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',
        'LOG_FILE': 'tieba_log_%s.txt' % time.time(),
    }
    # One list-page entry point per logistics-company tieba
    start_urls = ['http://tieba.baidu.com/f?ie=utf-8&kw=中通&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=德邦物流&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=圆通&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=申通&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=韵达&fr=search',
                  'https://tieba.baidu.com/f?ie=utf-8&kw=顺丰&fr=search',
                  'http://tieba.baidu.com/f?ie=utf-8&kw=天天快递&fr=search',
                  'https://tieba.baidu.com/f?ie=utf-8&kw=ems&fr=search',
                  'https://tieba.baidu.com/f?ie=utf-8&kw=百事汇通&fr=search']

    def parse(self, response):
        # Name of the tieba (forum)
        tieba_name1 = response.xpath('//div[@class="card_title"]/a/text()').extract_first().strip()
        tieba_name = {
            'tieba_name': tieba_name1
        }
        # Collect the URL of every thread on this list page
        details_url = response.xpath('//div[@class="threadlist_title pull_left j_th_tit "]//a[@rel="noreferrer"]/@href').extract()
        for each_url in details_url:
            infos_url = 'http://tieba.baidu.com' + each_url
            yield scrapy.Request(url=infos_url, callback=self.details_infos, meta=tieba_name)
        # Follow the link to the next list page
        next_url = response.xpath('//a[@class="next pagination-item "]/@href').extract_first()
        self.logger.info('Next page URL: %s' % next_url)
        if next_url:
            next_url = 'http:' + next_url
            yield scrapy.Request(url=next_url, callback=self.parse)

    def details_infos(self, details_response):
        # The tieba name rides along in the request meta
        pre_tieba_name = details_response.meta['tieba_name']
        print(pre_tieba_name)
        lou_zhu = LouzhuItem()
        # The first post's data-field attribute carries the OP's metadata as JSON
        # lou_zhu_infos = details_response.xpath('//div[@class="l_post j_l_post l_post_bright noborder "]/@data-field').extract_first()
        lou_zhu_infos = details_response.xpath('//div[@id="j_p_postlist"]/div[1]/@data-field').extract_first()
        try:
            lou_zhu_infos = demjson.decode(lou_zhu_infos)
        except Exception:
            self.logger.exception("demjson.decode error: %s" % details_response.url)
            return
        # OP's ID, name, and level
        lou_zhu_id = lou_zhu_infos['author'].get('user_id')
        lou_zhu_name = lou_zhu_infos['author'].get('user_name')
        lou_zhu_level = lou_zhu_infos['author'].get('level_id')
        # Thread title
        # title = details_response.xpath('//h1//text()').extract_first()
        title = details_response.xpath('//div[@id="j_core_title_wrap"]//h3/text()|//div[@id="j_core_title_wrap"]//h1/text()').extract_first()
        # ID of the title content and creation time of the thread
        title_id = lou_zhu_infos['content'].get('post_id')
        create_time = lou_zhu_infos['content'].get('date')
        # Total number of reply pages
        lou_zhu_page = details_response.xpath('//span[@class="red"]//text()').extract()
        lou_zhu_page = lou_zhu_page[-1]
        lou_zhu_url = details_response.url
        # Populate the OP item
        lou_zhu['lzhu_name'] = lou_zhu_name
        lou_zhu['lzhu_id'] = lou_zhu_id
        lou_zhu['lzhu_level'] = lou_zhu_level
        lou_zhu['title'] = title
        lou_zhu['title_id'] = title_id
        lou_zhu['lcreate_time'] = create_time
        lou_zhu['tieba_name'] = pre_tieba_name
        lou_zhu['pages'] = lou_zhu_page
        lou_zhu['tz_url'] = lou_zhu_url
        lou_zhu['update_time'] = time.strftime('%Y-%m-%d')
        yield lou_zhu
        # Replies (floors)
        reply_infos = details_response.xpath('//div[@class="l_post j_l_post l_post_bright "]')
        for each_cengzhu in reply_infos:
            # Each floor's metadata also lives in its data-field attribute
            ceng_zhu_infos = each_cengzhu.xpath('./@data-field').extract_first()
            ceng_zhu_infos = demjson.decode(ceng_zhu_infos)
            # Replier's name, ID, and level
            author = ceng_zhu_infos.get('author') or {}
            ceng_zhu_name = author.get('user_name', '')
            ceng_zhu_id = author.get('user_id', '')
            ceng_zhu_level = author.get('level_id')
            # Number of comments on this floor
            ceng_zhu_comment_num = ceng_zhu_infos['content'].get('comment_num')
            # Text content of the floor
            ceng_zhu_content = each_cengzhu.xpath('.//div[@class="d_post_content j_d_post_content clearfix"]//text()').extract_first('')
            ceng_zhu_content = ceng_zhu_content.strip()  # strip surrounding whitespace
            # Content ID and creation time of the floor
            content_id = ceng_zhu_infos['content'].get('post_id')
            ceng_zhu_time = ceng_zhu_infos['content'].get('date')
            # Address of the thread
            ceng_zhu_adress = details_response.url
            # A fresh item per floor, so one floor can't overwrite another
            ceng_zhu = CengzhuItem()
            ceng_zhu['tieba_name'] = pre_tieba_name
            ceng_zhu['czhu_name'] = ceng_zhu_name
            ceng_zhu['czhu_id'] = ceng_zhu_id
            ceng_zhu['czhu_level'] = ceng_zhu_level
            ceng_zhu['content'] = ceng_zhu_content
            ceng_zhu['content_id'] = content_id
            ceng_zhu['ccreate_time'] = ceng_zhu_time
            ceng_zhu['tiezi_num'] = ceng_zhu_comment_num
            ceng_zhu['url'] = ceng_zhu_adress
            ceng_zhu['update_time'] = time.strftime('%Y-%m-%d')
            yield ceng_zhu
That's the whole share. My Scrapy usage may still be a bit rough since it doesn't come up much in my day-to-day work, but I'll keep studying it.
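Finally, running it: from the project root, scrapy crawl tieba starts the spider. If a standalone launcher script is preferred, Scrapy's CrawlerProcess does the same thing; a minimal sketch, assuming it sits in the project root next to scrapy.cfg:

# run.py -- optional launcher, equivalent to `scrapy crawl tieba`
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl('tieba')  # spider name as defined in TiebaSpider.name
process.start()         # blocks until done or CLOSESPIDER_TIMEOUT fires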