使用Python爬取微博数据,使用的代理是蜻蜓代理的隧道代理,包天17元。实际请求用的是标准库的urllib.request模块(requests已被注释掉)。有些网页因为网络等原因打不开,总共获取到了76000+条数据,没有爬取评论。评论部分的代码单独调试没有问题,但在实际爬取时总是报错,推测是评论详情页本身的问题;可以再加一层异常处理来跳过,不过评论数据价值不大,就没有继续。
这次爬取给我最大的感受就是不可能爬到所有信息,之前没加异常处理总是爬到一半中断了,加了异常处理之后舍弃了那些打不开的网址,整个数据爬取就顺利得多了。
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 2 02:33:00 2020
@author: .....
"""
"""
每一条微博的div个数不同,还是要遍历各个div,至少有两个div
所有url的搜索关键词都是“肺炎”
"""
from lxml import etree
#import requests
import re
from datetime import datetime,timedelta
#import pandas as pd
from openpyxl import Workbook
from urllib import request
import time,random
#获取要爬取的日期列表
def gen_dates(b_date, days):
    """Yield `days` consecutive dates, starting at `b_date` (inclusive)."""
    one_day = timedelta(days=1)
    current = b_date
    for _ in range(days):
        yield current
        current = current + one_day
def get_date_list(start="xxxx-xx-xx", end="xxxx-xx-xx"):
    """Build the list of dates to crawl, formatted 'YYYYMMDD'.

    Previously the dates were hard-coded placeholders that had to be edited
    in place (and would raise ValueError until they were); they are now
    parameters, with the old placeholders kept as defaults for
    backward compatibility.

    :param start: first date, inclusive, 'YYYY-MM-DD'
    :param end: last date, EXCLUSIVE, 'YYYY-MM-DD' (matches the original
                behaviour: (end-start).days dates are produced)
    :return: list of 'YYYYMMDD' strings
    :raises ValueError: if either date string is not 'YYYY-MM-DD'
    """
    # .date() keeps only the calendar date, dropping the time part
    start_d = datetime.strptime(start, "%Y-%m-%d").date()
    end_d = datetime.strptime(end, "%Y-%m-%d").date()
    # str(date) is ISO 'YYYY-MM-DD'; strftime gives the dash-free form directly
    return [
        (start_d + timedelta(days=i)).strftime("%Y%m%d")
        for i in range((end_d - start_d).days)
    ]
#这个函数来得到指定首尾的中间字符串
def GetMiddleStr(content, startStr, endStr):
    """Return the shortest text between startStr and endStr in content.

    startStr/endStr are interpolated into the pattern unescaped, so they
    may themselves contain regex syntax. Matching is case-insensitive.
    Returns None when no match is found.
    """
    pattern = r'%s(.+?)%s' % (startStr, endStr)
    match = re.search(pattern, content, re.IGNORECASE)
    if match is None:
        return None
    return match.group(1)
#这个函数来从博文中得到点赞转发评论量以及时间
#Extract like/repost/comment counts from a post's flattened text
def count(strtext):
    """Parse engagement counts out of a weibo text blob.

    The mobile-web markup flattens to text like
    '...赞[12]\xa0转发[3]\xa0评论[4]\xa0...'; this pulls the three
    bracketed numbers out.

    Fixes: the original built patterns from plain strings containing '\\['
    (an invalid escape sequence, a SyntaxWarning on modern Python); raw
    strings are used now. The regex work is also inlined so the function
    no longer depends on GetMiddleStr.

    :param strtext: flattened post text containing the count markers
    :return: [likes, reposts, comments] as ints
    :raises AttributeError: if a marker is missing (callers wrap the whole
        per-page loop in try/except, matching the original failure mode)
    """
    def _grab(label):
        # non-greedy group between '<label>[' and ']\xa0' (NBSP separator)
        m = re.search(r'%s\[(.+?)\]\xa0' % label, strtext)
        return int(m.group(1))
    # Timestamp extraction (after '收藏\xa0') is still TODO, as before.
    return [_grab('赞'), _grab('转发'), _grab('评论')]
# ---- output workbook -------------------------------------------------
# Header row for the sheet that stores the crawled posts.
columns1 = ['id','博主','博文','原微博博主','原博文','原博文点赞量','原博文转发量','原博文评论量','转发理由','点赞量','转发量','评论量','发布时间','爬取时间']
# Header for the (disabled) per-day comment sheets:
#columns2 = ['微博id','评论id','评论内容','评论点赞量']
workbook = Workbook()
sheet = workbook.active  # the active sheet receives the post rows
sheet.title = "微博内容"
sheet.append(columns1)
# Progress counter, printed after each page so we can tell how far along the run is.
cishu = 1
# ---- tunnel proxy ----------------------------------------------------
# Qingting tunnel-proxy endpoint and credentials.
PROXY_HOST = "dyn.horocn.com"
PROXY_PORT = "50000"
PROXY_USER = "....."
PROXY_PASS = "....."
proxy_url = "http://" + PROXY_USER + ":" + PROXY_PASS + "@" + PROXY_HOST + ":" + PROXY_PORT
handler = request.ProxyHandler({
    "http": proxy_url,
    "https": proxy_url,
})
opener = request.build_opener(handler)
opener.addheaders = [('user-agent', '.....'),
                     ('cookie', '.....')]
# From here on every request.urlopen() goes through the proxy with these headers.
request.install_opener(opener)
# Day loop: crawl the hot-sorted search results for "肺炎" one day at a time.
for date in get_date_list():
    base_url = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%E8%82%BA%E7%82%8E&advancedfilter=1&starttime='+date+'&endtime='+date+'&sort=hot'
    try:
        # This first request only exists to read the total page count.
        response = request.urlopen(base_url,timeout = 10).read()
        htmlEle = etree.HTML(response)
        page_num = int(htmlEle.xpath("//div[last()-6]/form/div/input[1]/@value")[0])
    except Exception as err:
        # Narrowed from a bare `except:` so KeyboardInterrupt still stops
        # the run; log the cause instead of swallowing it silently.
        print("获取页面数时出现问题,页面数设置为默认数100", err)
        page_num = 100
    # Page loop within the current day.
    for page in range(1,page_num+1):
        url = base_url+'&page='+str(page)
        print(url)
        try:
            # Short random sleep (<1s) to reduce the risk of an IP ban.
            time.sleep(random.random())
            response1 = request.urlopen(url,timeout = 10).read()
            htmlEle1 = etree.HTML(response1)
            # Each post is one <div class="c">; the first five divs on the
            # page are page chrome, not posts.
            divss = htmlEle1.xpath("//div[position()>5 and @class = 'c']")
            # Post loop: every `divs` element is one weibo.
            for divs in divss:
                # The last two divs are pagination/footer; stop when reached.
                if(divs==divss[len(divss)-2]):
                    break
                else:
                    # Collect one post into a dict. Insertion order matches
                    # columns1, so list(weibo.values()) maps onto the header.
                    weibo = {}
                    div = divs.xpath("./div")
                    weibo['id'] = str(divs.xpath("@id")[0])
                    weibo['博主'] = str(divs.xpath("./div[1]/a[1]/text()")[0])
                    # The class of the first span of the first div tells us
                    # whether this is an original post or a repost.
                    spans = div[0].xpath(".//span")
                    # 'ctt' => original post (with or without pictures).
                    if(spans[0].xpath("@class = 'ctt'")):
                        # The first div always holds the post text. xpath
                        # returns a list of text fragments; emoji alt-text
                        # lives in an <img> under span.ctt and is invisible
                        # to //text(), so it is appended separately.
                        weibo['博文'] = div[0].xpath(".//text()")
                        if(len(div[0].xpath("./span[@class = 'ctt']/img"))!=0):
                            weibo['博文'].append(div[0].xpath("./span[@class = 'ctt']/img/@alt")[0])
                        weibo['博文'] = (''.join([str(i) for i in weibo['博文']])).replace(' ','')
                        # Repost-only columns are blanked for original posts.
                        weibo['原微博博主'] = "无"
                        weibo['原博文'] = "无"
                        weibo['原博文点赞量'] = "无"
                        weibo['原博文转发量'] = "无"
                        weibo['原博文评论量'] = "无"
                        weibo['转发理由'] = "无"
                        if(len(div)==2):
                            # Post with pictures: there are two divs and the
                            # like/repost/comment counts sit in the second.
                            weibo['点赞量'] = int((div[1].xpath(".//a[3]/text()")[0])[2:-1])
                            weibo['转发量'] = int((div[1].xpath(".//a[4]/text()")[0])[3:-1])
                            weibo['评论量'] = int((div[1].xpath(".//a[5]/text()")[0])[3:-1])
                            weibo['发布时间'] = str(div[1].xpath("./span/text()")[0])
                        else:
                            # No pictures: everything is in the single div;
                            # parse the counts once instead of three times.
                            stats = count(weibo['博文'])
                            weibo['点赞量'], weibo['转发量'], weibo['评论量'] = stats
                            weibo['发布时间'] = "在博文中"
                    # Otherwise this is a repost.
                    else:
                        # The repost comment doubles as the "post text";
                        # blank the own-text column.
                        weibo['博文'] = "无"
                        # The first div describes the reposted original post.
                        weibo['原微博博主'] = str(div[0].xpath("./span[1]/a[1]/text()")[0])
                        weibo['原博文'] = div[0].xpath("./span[2]//text()")
                        if(len(div[0].xpath("./span[@class = 'ctt']/img"))!=0):
                            weibo['原博文'].append(div[0].xpath("./span[@class = 'ctt']/img/@alt")[0])
                        weibo['原博文'] = (''.join([str(i) for i in weibo['原博文']])).replace(' ','')
                        if(len(div)==3):
                            # Three divs => the repost added both text and a
                            # picture; the original post's counts are in the
                            # second div.
                            weibo['原博文点赞量'] = int((div[1].xpath("./span[1]/text()")[0])[2:-1])
                            weibo['原博文转发量'] = int((div[1].xpath("./span[2]/text()")[0])[5:-1])
                            weibo['原博文评论量'] = int((div[1].xpath("./a[@class='cc']/text()")[0])[5:-1])
                        else:
                            # Otherwise they are in the first div.
                            # NOTE(review): the int() conversion used above
                            # reportedly fails here, so the raw slices are
                            # kept as strings — confirm before "fixing".
                            weibo['原博文点赞量'] = (div[0].xpath("./span[3]/text()")[0])[2:-1]
                            weibo['原博文转发量'] = (div[0].xpath("./span[4]/text()")[0])[5:-1]
                            weibo['原博文评论量'] = (div[0].xpath(".//a[@class='cc']/text()")[0])[5:-1]
                        # The last div holds the repost comment plus this
                        # repost's own counts and timestamp; emoji handling
                        # mirrors the original-post branch above.
                        weibo['转发理由'] = div[len(div)-1].xpath(".//text()")
                        if(len(div[len(div)-1].xpath("./span[@class = 'ctt']/img"))!=0):
                            weibo['转发理由'].append(div[len(div)-1].xpath("./span[@class = 'ctt']/img/@alt")[0])
                        weibo['转发理由'] = (''.join([str(i) for i in weibo['转发理由']])).replace(' ','')
                        stats = count(weibo['转发理由'])
                        weibo['点赞量'], weibo['转发量'], weibo['评论量'] = stats
                        weibo['发布时间'] = "在转发理由中"
                    weibo['爬取时间'] = datetime.now()
                    # One spreadsheet row per post.
                    sheet.append(list(weibo.values()))
            # Comment scraping (dead, commented-out code) was removed: the
            # detail pages failed too often and the comments were judged not
            # worth keeping — see the note at the top of the file.
            print(cishu)
            cishu+=1
        except Exception as err:
            # Narrowed from a bare `except:`: skip pages that fail to load,
            # but report why instead of swallowing everything silently.
            print("链接出现问题", err)
workbook.save('微博爬虫.xlsx')