1.基本情况介绍
- 平台:PyCharm + Python3.9 + Windows
- 需求:未登录状态下对指定微博账号基本公开信息的搜集于存储
- 功能:
- 1.根据账号昵称返回用户uid;
- 2.获取账号基本公开信息;
- 3.获取用户所有原创微博;
- 4.获取用户所有原创图片
- 5.获取指定微博的评论
- 未解决的问题:
- 1.未登录状态下所获取的基本信息不全,不同的客户源的信息也有差别,如加入微博时间、教育经历等;
- 2.当用户博文是长文本时,有时会让登录,否则无法获取原文,且会抛出异常;
- 3.最新发布的长文本博客无法获取,需登录
- 4.m.weibo.cn只能爬取50页,这个应该是被官方开发人员写死了,目前在未登录状态下无法突破限制
2.代码实现
2.1所用到的库
import requests
from requests import exceptions
from pyquery import PyQuery as pq
import re
import json
import time
import os # 创建文件目录
2.2 功能函数
2.2.1 构建网页query参数格式
# Build query parameters for the mobile endpoint (m.weibo.cn).
# mode 0: blog list (page required); 1: comment detail (mid required);
# 2: profile info; 3: home page; 4: hot posts.
def build_param(value, mode, page='', mid=''):
    """Return the query-parameter dict for one m.weibo.cn request.

    value: blogger uid.
    mode:  0 blog list, 1 comment detail, 2 profile info, 3 home, 4 hot.
    page:  page number, used when mode == 0.
    mid:   blog id, used when mode == 1.
    Raises ValueError for an unknown mode (the original fell through and
    raised a confusing UnboundLocalError on `param` instead).
    """
    uid = str(value)
    if mode == 0:
        return {
            'type': 'uid',
            'value': uid,
            'containerid': '107603' + uid,
            'page': str(page),
        }
    if mode == 1:  # comment-detail page
        return {
            'id': str(mid),
            'mid': str(mid),
            'max_id_type': '0',
        }
    if mode == 2:  # "Weibo" profile-info page
        return {
            'type': 'uid',
            'value': uid,
            'containerid': '100505' + uid,
        }
    if mode == 3:  # "home" page
        return {
            'type': 'uid',
            'value': uid,
            'containerid': '230283' + uid,
        }
    if mode == 4:  # "hot" posts
        return {'containerid': '231002' + uid + '_-_HOTMBLOG'}
    raise ValueError('unknown mode: ' + repr(mode))
2.2.2 构建请求头
# Build HTTP request headers.
# mode 0: XHR request; 1: normal page request.
def build_headers(mode, value):
    """Return the header dict for an m.weibo.cn request for uid `value`.

    mode 0 marks the request as XHR; any other mode yields plain headers.
    """
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36')
    headers = {
        'user-agent': user_agent,
        'referer': 'https://m.weibo.cn/u/' + str(value),
    }
    if mode == 0:
        headers['X-Requested-With'] = 'XMLHttpRequest'
    return headers
2.2.3 获取博主uid
# Look up a blogger's uid by display name via the s.weibo.com search page.
# Returns two values: uid (str) and the blogger's display name.
def get_userid():
    """Search s.weibo.com for a user name typed by the operator, list the
    matches, and return (uid, name) for the match the operator selects.
    """
    username = input("请输入要查询微博用户的完整名字:")
    url = 'https://s.weibo.com/user?q=' + username
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}
    # pyquery only attaches headers when they are passed as a keyword
    # argument; the original positional call `pq(url, headers)` silently
    # dropped them from the HTTP request.
    doc = pq(url=url, headers=headers)
    # Materialize the result list once and reuse it instead of re-querying.
    names = list(doc('.info .name').items())
    for index, item in enumerate(names, start=1):
        print('搜索结果:' + str(index), item.text())
    select_num = int(input('请输入指定的搜索序号:'))
    target_name = names[select_num - 1].text()
    target_uid = list(doc('.info .s-btn-c').items())[select_num - 1].attr(
        'uid')
    return target_uid, target_name
效果:
2.2.4 创建文件目录
# Create the output directory tree for one blogger.
# Returns the chosen directory path and the folder name.
def build_dirs():
    """Ask the operator for a folder name and create it (plus the image and
    blog sub-folders) under the fixed output root.

    Returns a tuple (full_path, folder_name).
    """
    root_name = input('请输入文件名:')
    filename = 'F://UsualTasks/微博/' + root_name
    if os.path.exists(filename):
        print("The file was already existed.")
    # exist_ok=True makes each call idempotent. The original skipped the
    # sub-folder creation entirely when the root already existed, so an
    # interrupted earlier run could leave '图片'/'原创博文' missing and
    # break the later file writes.
    os.makedirs(filename, exist_ok=True)
    os.makedirs(filename + '/图片', exist_ok=True)
    os.makedirs(filename + '/原创博文', exist_ok=True)
    return filename, root_name
2.2.5 获取博主基本公开信息
# Fetch the blogger's public profile and append it as JSON to a text file
# inside the target folder.
# value: blogger uid; filename: (dir_path, root_name) from build_dirs().
# Three requests are made: profile info, hot posts, and home (location).
def get_basic_info(value, filename):
    try:
        base_url = 'https://m.weibo.cn/api/container/getIndex?'
        blogger_Info = {}
        headers = build_headers(mode=0, value=value)
        response = requests.get(base_url, params=build_param(value, mode=2),
                                headers=headers)
        res_hot = requests.get(base_url, params=build_param(value, mode=4),
                               headers=headers)
        res_location = requests.get(base_url,
                                    params=build_param(value, mode=3),
                                    headers=headers)
        response.encoding = response.apparent_encoding
        res_hot.encoding = res_hot.apparent_encoding
        res_location.encoding = res_location.apparent_encoding
        if response.status_code == 200:
            items = response.json().get('data').get('userInfo')
            blogger_Info['博主'] = items.get('screen_name')
            blogger_Info['id'] = items.get('id')
            blogger_Info['性别'] = items.get('gender')
            # 'description' may be missing or None, not just empty; the
            # original called len() on it and crashed on None.
            blogger_Info['个人描述'] = items.get('description') or '无'
            blogger_Info['关注人数'] = items.get('follow_count')
            blogger_Info['粉丝数'] = items.get('followers_count')
            blogger_Info['微博总数'] = items.get('statuses_count')
            # 'verified' is a JSON boolean; test it directly instead of
            # comparing its str() form to the literal 'True'.
            if items.get('verified'):
                blogger_Info['认证'] = items.get('verified_reason')
            else:
                blogger_Info['认证'] = '普通用户'
        if res_location.status_code == 200:
            json_location = res_location.json()
            blogger_Info['所在地'] = json_location.get('data').get('cards')[
                0].get('card_group')[0].get('item_content')
        if res_hot.status_code == 200:
            json_hot = res_hot.json()
            # Hoist the shared card-group lookup that the original repeated
            # on every line.
            hot_cards = json_hot.get('data').get('cards')[0].get('card_group')
            # desc2 looks like '<7-char prefix><count>'; keep the count part.
            blogger_Info['热门内容条数'] = hot_cards[0].get('desc2')[7:]
            if blogger_Info['热门内容条数'] != '0':
                group = hot_cards[1].get('group')
                blogger_Info['被转发总数'] = group[0].get('item_title')
                blogger_Info['被评论总数'] = group[1].get('item_title')
                blogger_Info['被赞总数'] = group[2].get('item_title')
        with open(filename[0] + '/' + filename[1] + '-微博公开信息.txt', 'a',
                  encoding='utf-8') as f:
            f.write(json.dumps(blogger_Info, ensure_ascii=False))
    except exceptions.RequestException as e:
        print(e)
2.2.6 获取网页源代码
# Fetch one feed page of the blogger as JSON (m.weibo.cn container API).
# params: query dict from build_param(mode=0); value: blogger uid.
# Returns (json_text, page). On the last page the API no longer supplies a
# 'page' value, so the string 'Null' is returned to end the caller's loop.
# Returns None on a request error or a non-200 response.
def get_paper(params, value):
    try:
        base_url = 'https://m.weibo.cn/api/container/getIndex?'
        headers = build_headers(0, value=value)
        response = requests.get(base_url, params=params, headers=headers,
                                timeout=4)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            json_text = response.json()
            pages = json_text.get('data').get('cardlistInfo').get('page')
            # A falsy/absent 'page' means the last page: report 'Null' so
            # main()'s `!= 'Null'` condition terminates. The original fell
            # through and returned None here, which crashed main() on
            # `json_list[1]`.
            return json_text, pages if pages else 'Null'
    except exceptions.RequestException as e:
        print(e)
2.2.7 获取博主原创博文的图片
# Download the pictures of every original (non-retweet) blog on this feed
# page into '<dir>/图片/'. Works for the newest posts too.
# json_text: one feed page from get_paper(); filename: (dir_path, root_name).
def get_original_img(json_text, filename):
    if json_text:
        cards = json_text.get('data').get('cards')
        for card in cards:
            # Use distinct names: the original rebound `item` in the inner
            # loop, shadowing the blog dict it was still logically using.
            blog = card.get('mblog')
            if ('pics' in blog) and ('retweeted_status' not in blog):
                img_time = blog.get('created_at')
                # Number each picture: the original derived one
                # timestamp-only name for every picture of a post, so later
                # downloads overwrote earlier ones.
                for pic_no, pic in enumerate(blog.get('pics'), start=1):
                    img_url = pic.get('url')
                    response = requests.get(img_url)
                    img_name = (img_time[4:10] + img_time[-5:] +
                                '-' + str(pic_no) + '.jpg')
                    with open(filename[0] + '/图片/' + img_name, 'wb') as f:
                        f.write(response.content)
                # Small delay between posts to stay polite to the server.
                time.sleep(0.1)
2.2.8 获取长文本类型博文
# Fetch the full text of a long-form post from its detail page.
# Posts older than ~8 hours usually work; newer ones can demand a login.
# Called from get_original().
def get_longtext(long_id, value):
    """Return the post's full text, or None when it cannot be extracted
    (e.g. the detail page redirects to a login prompt)."""
    try:
        url = 'https://m.weibo.cn/detail/' + str(long_id)
        response = requests.get(
            url, headers=build_headers(
                mode=1, value=value))
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            # The detail page embeds the post JSON in
            # `var $render_data = [...][0] || {};` — capture the array part.
            pattern = re.compile(r'var \$render_data = (.*)[\s\S]{7}{};',
                                 re.MULTILINE | re.DOTALL)
            matches = pattern.findall(response.text)
            # No match means the render data is absent (login wall); the
            # original indexed [0] unconditionally and raised an uncaught
            # IndexError here.
            if not matches:
                return None
            return json.loads(matches[0])[0]['status']['text']
    except exceptions.RequestException as e:
        print(e)
2.2.9 获取博主所有原创博文
# Save every original (non-retweet) blog on this feed page as a .txt file
# under '<dir>/原创博文/'.
# json_text: one feed page; value: blogger uid (for long-text lookups);
# filename: (dir_path, root_name) from build_dirs().
def get_original(json_text, value, filename):
    if json_text:
        cards = json_text.get('data').get('cards')
        # Counter lives outside the loop: the original reset it to 0 on
        # every iteration, so every file on a page was named "第0篇".
        blog_num = 0
        for card in cards:
            blog = card.get('mblog')
            if 'retweeted_status' not in blog:
                blog_num += 1
                original_blog = {}
                original_blog['发布时间'] = blog.get('created_at')
                original_blog['发布工具'] = blog.get('source')
                original_blog['是否是长文本'] = str(blog.get('isLongText'))
                if original_blog['是否是长文本'] == 'True':
                    # Long posts only carry a teaser here; fetch the full
                    # text from the detail page.
                    original_blog['长文本mid'] = blog.get('mid')
                    original_blog['发布内容'] = get_longtext(blog.get('mid'),
                                                             value)
                else:
                    # Strip the HTML markup from the blog body.
                    original_blog['发布内容'] = pq(blog.get('text')).text()
                original_blog['点赞数'] = blog.get('attitudes_count')
                original_blog['评论数目'] = blog.get('comments_count')
                original_blog['转发数目'] = blog.get('reposts_count')
                # Plain base name: the original appended '.jpg' here and
                # then '.txt' below, producing '...篇.jpg.txt' files.
                blog_name = (original_blog['发布时间'][4:16].replace(':', '') +
                             original_blog['发布时间'][-5:] +
                             '第' + str(blog_num) + '篇')
                with open(
                        filename[0] + '/原创博文/' + blog_name + '.txt', 'w',
                        encoding='utf-8') as f:
                    f.write(json.dumps(original_blog, ensure_ascii=False))
2.2.10 获取指定博文评论
# Yield the hot comments of one blog as dicts.
# Without login only the first page (roughly the hot comments) is reachable.
# value: blogger uid; mid: the blog's detail id.
# Returns a generator of comment-info dicts.
def get_targeted_comment(value, mid):
    try:
        index = 0
        params = build_param(value, mode=1, mid=mid)
        url = 'https://m.weibo.cn/comments/hotflow?'
        headers = build_headers(mode=0, value=value)
        response = requests.get(url, params=params, headers=headers, timeout=4)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            json_text = response.json()
            if json_text:
                items = json_text.get('data').get('data')
                for item in items:
                    comments_info = {}
                    index += 1
                    comments_info['序号'] = index
                    user = item.get('user')
                    comments_info['评论人'] = user.get('screen_name')
                    comments_info['id'] = user.get('id')
                    # 'verified' is a JSON boolean; the original compared it
                    # to the string 'False', which never matches, so every
                    # commenter looked verified.
                    if user.get('verified'):
                        comments_info['认证'] = user.get('verified_reason')
                    else:
                        comments_info['认证'] = '普通用户'
                    comments_info['内容'] = item.get('text')
                    comments_info['赞'] = item.get('like_count')
                    comments_info['被回复数'] = item.get('total_number')
                    if comments_info['被回复数'] != 0:
                        # 'comments' is False (not a list) when the replies
                        # were deleted or are otherwise unavailable.
                        replies = item.get('comments')
                        if replies:
                            comments_info['被回复文本1'] = replies[0].get('text')
                            if comments_info['被回复数'] >= 2:
                                comments_info['被回复文本2'] = replies[1].get('text')
                        else:
                            comments_info['被回复文本作废'] = '无实际内容,可能已删除'
                    yield comments_info
    except exceptions.HTTPError as e:
        print(e)
2.2.11 在微博搜索中找人
“所见即能获取”,要获取第一个用户的信息
# Search Weibo for a user and append the first result's visible profile
# lines to 'info_data.txt' as one JSON record.
def get_info():
    url = 'https://s.weibo.com/user?q=' + input('请输入完整确切的博主名称')
    doc = pq(url)
    first_card = list(doc('.info').items())[0]
    info = {'博主': list(doc('.info .name').items())[0].text()}
    paragraph_count = first_card('p').length
    # The first <p> is the name line; walk the remaining ones by position
    # and store them under numeric keys.
    for pos in range(2, paragraph_count + 2):
        selector = '.info > p:nth-child(' + str(pos) + ')'
        info[str(pos - 1)] = first_card(selector).text()
    with open('info_data.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(info, ensure_ascii=False) + '\n')
2.3 在main()函数中调用、整合
# Orchestrate the crawl: uid lookup, folder creation, profile dump, then
# page-by-page collection of images and original blogs.
def main():
    json_list = (1, 1)  # (page JSON, next-page parameter) — seed values
    value = get_userid()  # (uid, display name)
    filename = build_dirs()  # output folder for this blogger
    get_basic_info(value[0], filename)
    while json_list[1] != 'Null':  # last page reports page == 'Null'
        params = build_param(value[0], mode=0, page=json_list[1])
        json_list = get_paper(params, value[0])
        # get_paper() returns None on request errors / non-200 responses;
        # stop cleanly instead of crashing on the next json_list[1].
        if not json_list:
            break
        get_original_img(json_list[0], filename=filename)
        get_original(json_list[0], value[0], filename)
        time.sleep(0.1)
2.4 设置程序入口
# Script entry point: run the full crawl pipeline.
if __name__ == '__main__':
    main()