[Python Sina Weibo Crawler] Scraping Sina Weibo Hot Topics with Python

During the pandemic, a senior classmate needed Weibo hot-topic data for her graduation project, so I took the task on as practice. My regular-expression skills were shaky, so I referred to the blog post "python 新浪微博爬虫"; since that post is quite old, I added some ideas and changes of my own.

1. Requirements Analysis
Simulate a login to Sina Weibo and scrape, from the hot-topics section, the top 100 topics of the past 24 hours: each topic's name, reading count, discussion count, and fans count, the topic host, and each host's follow count, fans count, and weibo (post) count.

2. Development Language
Python 3.6

3. Modules to Import
re, sys, time, and json are standard library; the rest are third-party packages installable with pip (the Excel export below additionally needs the xlsxwriter package):
import requests
import re
import sys
import time
from pyquery import PyQuery as pq
from lxml import etree
import json
import pandas as pd

4. Scraping Flow
Send a request to fetch the page source, then parse the fields out of it with regular expressions; a minimal sketch of this flow follows.
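The sketch below shows the request-then-regex flow in isolation, assuming a valid logged-in Cookie; the pattern here is only an illustration, not one of the patterns used in the full script:

import re
import requests

def fetch(url, cookie):
	# Request one page and return its source, or None on failure
	headers = {
		'User-Agent': 'Mozilla/5.0',
		'Cookie': cookie,  # must be copied from a logged-in Weibo session
	}
	resp = requests.get(url, headers=headers)
	if resp.status_code == 200:
		resp.encoding = 'UTF-8'
		return resp.text
	return None

html = fetch('https://d.weibo.com/231650', 'YOUR_COOKIE_HERE')  # placeholder cookie
if html:
	# Illustrative pattern: grab whatever sits inside <span class="number"> tags
	print(re.findall(r'<span class="number">(.*?)</span>', html, re.S))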

5. Field Reference

Topic name: top_name
Rank: top_rank
Subtitle: top_subtitle
Reading count: top_reading
Host name: host_name
Host's follow count: host_follow
Host's fans count: host_fans
Host's weibo count: host_weibo

(The requirements in section 1 also mention a discussion count and a topic fans count; the script below declares top_fans but extracts neither field.)
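For reference, one scraped record corresponds to one entry in each of the parallel lists the script keeps; with made-up values, a single row looks like:

# Hypothetical example of one row (all values invented for illustration):
row = {
	'top_name': '#example topic#',
	'top_rank': '1',
	'top_subtitle': 'an example subtitle',
	'top_reading': '1.2亿',
	'host_name': 'example_host',
	'host_follow': 321,
	'host_fans': 45678,
	'host_weibo': 999,
}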

Full code:

import requests
import re
import sys
import time
from pyquery import PyQuery as pq
from lxml import etree
import json
import pandas as pd

# Silence the InsecureRequestWarning triggered by verify=False below
requests.packages.urllib3.disable_warnings()

top_name = []      # topic name
top_reading = []   # reading count
top_rank = []      # rank
top_subtitle = []  # subtitle
top_fans = []      # topic fans count (declared but not populated below)
host_name = []     # host (topic founder) name
host_follow = []   # host's follow count
host_fans = []     # host's fans count
host_weibo = []    # host's weibo (post) count

def get_one_page(url):
	headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
		# The Cookie must come from a logged-in Weibo session, or the page returns a login wall
		'Cookie':'UOR=www.xueshanlinghu.com,widget.weibo.com,www.xueshanlinghu.com; SUB=_2AkMpIlvAf8PxqwJRmPoRz2_lbY9yywvEieKffqobJRMxHRl-yT92qnU6tRB6AqJ1Ja0OS_Z4Sle1i9PePn9Y2j3r002F; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WhMbmKIeS44Ywv2JBYP3dlp; login_sid_t=20e3745b135ada171ade3f91a392cf1f; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=2694303126421.087.1585370372866; SINAGLOBAL=2694303126421.087.1585370372866; ULV=1585370372875:1:1:1:2694303126421.087.1585370372866:; YF-Page-G0=b1c63e15d8892cdaefd40245204f0e21|1585372515|1585372320'
	}
	response = requests.get(url, headers=headers, verify=False)
	if response.status_code == 200:
		response.encoding = 'UTF-8'
		return response.text
	return None
	
def analysis(topic):
	# Rank badge: red/plain/orange styles share the same span structure
	topicrank = re.search(r'<span class="(?:DSC_topicon_red|DSC_topicon|DSC_topicon_orange)">(.*?)</span>', topic, re.S)
	top_rank.append(topicrank.group(1) if topicrank else '')

	# Topic name sits in the alt attribute of the cover image
	topicname = re.search(r'alt="(.*?)" class="pic">', topic, re.S)
	top_name.append(topicname.group(1) if topicname else '')

	# Subtitle line under the topic name
	subtitle = re.search(r'class="subtitle">(.*?)</div>', topic, re.S)
	top_subtitle.append(subtitle.group(1) if subtitle else '')

	# Reading count
	readingcount = re.search(r'<span class="number">(.*?) </span>', topic, re.S)
	top_reading.append(readingcount.group(1) if readingcount else '')

	ppname = re.search(r'class="tlink S_txt1"[\s]+>(.*?)</a></div>', topic, re.S)
	if ppname is None:
		# No host on this topic: pad every host field so the lists stay aligned
		host_name.append('')
		host_follow.append('')
		host_fans.append('')
		host_weibo.append('')
		return

	host_name.append(ppname.group(1))

	# The host's uid is the number in the profile link after "主持人:"
	aboutzcr = re.search(r'主持人:<span><a target="_blank" href="[^0-9]+(.*?)\?', topic, re.S)
	if aboutzcr is not None:
		# Query the mobile-site container API for the host's profile counters
		pp1 = "http://m.weibo.cn/api/container/getIndex?type=uid&value=" + str(aboutzcr.group(1))
		r = requests.get(pp1)
		if r.status_code == 200:
			userInfo = r.json()['data']['userInfo']  # the response body is JSON
			host_follow.append(userInfo['follow_count'])
			host_fans.append(userInfo['followers_count'])
			host_weibo.append(userInfo['statuses_count'])
			return
	# No uid found or the request failed: pad with blanks to keep columns aligned
	host_follow.append('')
	host_fans.append('')
	host_weibo.append('')
def savetoexcel():
	# All eight lists must be the same length or the DataFrame construction fails
	print(len(top_name), len(top_rank), len(top_subtitle), len(top_reading),
	      len(host_name), len(host_follow), len(host_fans), len(host_weibo))

	count = len(top_name)
	print(count)
	dfl = pd.DataFrame(data={'top_name': top_name[0:count], 'top_rank': top_rank[0:count],
	                         'top_subtitle': top_subtitle[0:count], 'top_reading': top_reading[0:count],
	                         'host_name': host_name[0:count], 'host_follow': host_follow[0:count],
	                         'host_fans': host_fans[0:count], 'host_weibo': host_weibo[0:count]})

	# strings_to_urls=False stops xlsxwriter from converting URL-like strings into hyperlink cells
	writer = pd.ExcelWriter(r'D:\sina_weibo_topic50024.xlsx', engine='xlsxwriter',
	                        options={'strings_to_urls': False})
	dfl.to_excel(writer, columns=['top_name', 'top_rank', 'top_subtitle', 'top_reading',
	                              'host_name', 'host_follow', 'host_fans', 'host_weibo'], index=False)
	writer.close()

	
def main():
	for i in range(1, 8):
		print("Fetching page " + str(i))
		url = "https://d.weibo.com/231650?cfs=920&Pl_Discover_Pt6Rank__3_filter=&Pl_Discover_Pt6Rank__3_page=" + str(i) + "#Pl_Discover_Pt6Rank__3"
		html = get_one_page(url)
		if html is None:
			continue
		# The page embeds the list as escaped HTML: strip the escapes (and the '#'
		# marks around topic names), then split on the per-topic container class
		# so each chunk holds exactly one topic
		handlepage = html.replace('\\t', '').replace('\\n', '').replace('\\', '').replace('#', '')
		topic = handlepage.split("pt_li S_line2")
		topic.pop(0)  # the first chunk precedes the first topic
		for each in topic:
			analysis(each)
		time.sleep(0.5)
		savetoexcel()  # rewrite the workbook after every page so partial results survive a crash

main()
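The host statistics come from the mobile site's container API. As a standalone sketch, assuming the endpoint still returns the same JSON shape the script relies on (data → userInfo with follow_count, followers_count, statuses_count):

import requests

def host_counts(uid):
	# Fetch one user's (follow_count, followers_count, statuses_count)
	# via m.weibo.cn; returns None if the request fails
	url = "http://m.weibo.cn/api/container/getIndex?type=uid&value=" + str(uid)
	r = requests.get(url)
	if r.status_code != 200:
		return None
	info = r.json()['data']['userInfo']
	return info['follow_count'], info['followers_count'], info['statuses_count']

# Usage with a hypothetical uid:
# print(host_counts('1234567890'))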
	

Finally, a screenshot of the results:
[result screenshot]
