1. Crawling all posts of a given Weibo blogger
import datetime
import json
import os
import re
import sys
import traceback
import random
from time import sleep
import get_ck
import requests
from lxml import etree
class Weibo():
    def __init__(self, user_id, filter=0):
        self.user_id = user_id  # numeric user id to crawl, e.g. 1669879400 for the account "Dear-迪丽热巴"
        self.filter = filter  # 0 (default) crawls all of the user's posts, 1 crawls original posts only
        self.username = ''  # screen name, e.g. "Dear-迪丽热巴"
        self.weibo_num = 0  # total number of posts the user has published
        self.weibo_num2 = 0  # number of posts actually crawled
        self.following = 0  # number of accounts the user follows
        self.followers = 0  # number of followers
        self.weibo_content = []  # text of each post
        self.publish_time = []  # publish time of each post
        self.publish_place = []  # post source (the "来自" field) of each post
        self.up_num = []  # like count of each post
        self.retweet_num = []  # repost count of each post
        self.comment_num = []  # comment count of each post
    # Read the cookie file and assemble the Cookie header value
    def get_cookie(self):
        cookies = ''  # accumulated "name=value;" pairs
        with open('cookies.txt', 'r') as f:
            cookies_list = json.load(f)
        for cookie in cookies_list:
            cookies += cookie['name'] + "=" + cookie['value'] + ";"
        self.cookie = {
            "Cookie": cookies
        }
        print(self.cookie)
    # Get the blogger's screen name
    def get_username(self):
        try:
            url = "https://weibo.cn/%d/info" % self.user_id  # profile page, e.g. https://weibo.cn/3084826290/info
            html = requests.get(url, cookies=self.cookie).content  # raw HTML of the profile page
            selector = etree.HTML(html)  # parse the HTML into an element tree
            # XPath is used to locate nodes; here we take the <title> text, which reads "XX的微博"
            username = selector.xpath("//title/text()")[0]
            self.username = username
            print(u"用户名: " + self.username)
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()  # print the full traceback
    # Get the blogger's post count, following count, and follower count
    def get_user_info(self):
        try:
            url = "https://weibo.cn/u/%d?filter=%d&page=1" % (self.user_id, self.filter)
            html = requests.get(url, cookies=self.cookie).content  # raw HTML of the first list page
            selector = etree.HTML(html)
            pattern = r"\d+\.?\d*"  # matches an integer or decimal, e.g. '1', '34', '9999'
            # Post count: matches <div class="tip2"><span class="tc">微博[15]</span>
            str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")[0]
            guid = re.findall(pattern, str_wb, re.S | re.M)
            self.weibo_num = int(guid[0])
            print(u"微博数: " + str(self.weibo_num))
            # Following count
            str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
            guid = re.findall(pattern, str_gz, re.M)
            self.following = int(guid[0])
            print(u"关注数: " + str(self.following))
            # Follower count
            str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
            guid = re.findall(pattern, str_fs, re.M)
            self.followers = int(guid[0])
            print(u"粉丝数: " + str(self.followers))
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
    # Get post text and, for each post, its publish time, source, like count, repost count, and comment count
    def get_weibo_info(self):
        try:
            url = "https://weibo.cn/u/%d?filter=%d&page=1" % (self.user_id, self.filter)
            html = requests.get(url, cookies=self.cookie).content
            selector = etree.HTML(html)
            # Work out how many list pages there are: no <input name='mp'> means there is only one page
            if selector.xpath("//input[@name='mp']") == []:
                page_num = 1
            else:
                page_num = int(selector.xpath("//input[@name='mp']")[0].attrib["value"])
            pattern = r"\d+\.?\d*"
            for page in range(1, page_num + 1):
                print(u"正在爬取第" + str(page) + "页,请耐心等待..........")
                if page % 10 == 0:  # every 10 pages, pause for a random 6-12 seconds to ease rate limiting
                    t = random.randint(6, 12)
                    print("睡眠" + str(t) + "秒,请稍等.....")
                    sleep(t)
                url2 = "https://weibo.cn/u/%d?filter=%d&page=%d" % (self.user_id, self.filter, page)
                html2 = requests.get(url2, cookies=self.cookie).content
                selector2 = etree.HTML(html2)
                info = selector2.xpath("//div[@class='c']")
                if len(info) > 3:  # the trailing div.c blocks are footer/navigation elements, not posts
                    for i in range(0, len(info) - 2):  # skip the trailing non-post blocks
                        # Post text; drop any characters the console encoding cannot represent
                        str_t = info[i].xpath("div/span[@class='ctt']")
                        weibo_content = str_t[0].xpath("string(.)").encode(sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
                        self.weibo_content.append(weibo_content)
                        # print(u"微博内容:" + weibo_content)
                        str_time = info[i].xpath("div/span[@class='ct']")
                        str_time = str_time[0].xpath("string(.)").encode(sys.stdout.encoding, "ignore").decode(
                            sys.stdout.encoding)
                        # Post source: the "来自 ..." part of the timestamp line
                        if len(str_time.split(u'来自')) > 1:
                            publish_place = str_time.split(u'来自')[1]
                        else:
                            publish_place = u'无'
                        self.publish_place.append(publish_place)
                        # print(u"微博发布工具:" + publish_place)
                        # Publish time: normalize "刚刚" / "x分钟" / "今天" / "x月x日" to "YYYY-MM-DD HH:MM"
                        publish_time = str_time.split(u'来自')[0]
                        if u"刚刚" in publish_time:
                            publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
                        elif u"分钟" in publish_time:
                            minute = publish_time[:publish_time.find(u"分钟")]
                            minute = datetime.timedelta(minutes=int(minute))
                            publish_time = (datetime.datetime.now() - minute).strftime("%Y-%m-%d %H:%M")
                        elif u"今天" in publish_time:
                            today = datetime.datetime.now().strftime("%Y-%m-%d")
                            time = publish_time[3:]
                            publish_time = today + " " + time
                        elif u"月" in publish_time:
                            year = datetime.datetime.now().strftime("%Y")
                            month = publish_time[0:2]
                            day = publish_time[3:5]
                            time = publish_time[7:12]
                            publish_time = year + "-" + month + "-" + day + " " + time
                        else:
                            publish_time = publish_time[:16]
                        self.publish_time.append(publish_time)
                        # print(u"微博发布时间:" + publish_time)
                        # Like count
                        str_zan = info[i].xpath("div/a/text()")[-4]
                        guid = re.findall(pattern, str_zan, re.M)
                        up_num = int(guid[0])
                        self.up_num.append(up_num)
                        # print(u"点赞数: " + str(up_num))
                        # Repost count
                        retweet = info[i].xpath("div/a/text()")[-3]
                        guid = re.findall(pattern, retweet, re.M)
                        retweet_num = int(guid[0])
                        self.retweet_num.append(retweet_num)
                        # print(u"转发数: " + str(retweet_num))
                        # Comment count
                        comment = info[i].xpath("div/a/text()")[-2]
                        guid = re.findall(pattern, comment, re.M)
                        comment_num = int(guid[0])
                        self.comment_num.append(comment_num)
                        # print(u"评论数: " + str(comment_num))
                        self.weibo_num2 += 1  # count of posts actually crawled
            if not self.filter:  # filter == 0: everything on the pages was crawled
                print(u"共" + str(self.weibo_num2) + u"条微博")
            else:
                print(u"共" + str(self.weibo_num) + u"条微博,其中" + str(self.weibo_num2) + u"条为原创微博")
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
    # Write the crawled information to a text file
    def write_txt(self):
        try:
            if self.filter:  # filter == 1: only original posts were crawled
                result_header = u"\n\n原创微博内容:\n"
            else:
                result_header = u"\n\n微博内容:\n"
            result = (u"用户信息\n用户昵称:" + self.username +
                      u"\n用户id:" + str(self.user_id) +
                      u"\n微博数:" + str(self.weibo_num) +
                      u"\n关注数:" + str(self.following) +
                      u"\n粉丝数:" + str(self.followers) +
                      result_header)
            for i in range(1, self.weibo_num2 + 1):
                text = (str(i) + "\n" +
                        u"微博内容:" + self.weibo_content[i - 1] + "\n" +
                        u"发布时间:" + str(self.publish_time[i - 1]) + "\n" +
                        u"来自:" + str(self.publish_place[i - 1]) + "\n" +
                        u"点赞数:" + str(self.up_num[i - 1]) + "\n" +
                        u"转发数:" + str(self.retweet_num[i - 1]) + "\n" +
                        u"评论数:" + str(self.comment_num[i - 1]) + "\n\n"
                        )
                result = result + text
            # os.path.split() splits a path into (directory, filename); save under a "weibo" folder next to this script
            file_dir = os.path.split(os.path.realpath(__file__))[0] + os.sep + "weibo"
            if not os.path.isdir(file_dir):
                os.mkdir(file_dir)
            file_path = file_dir + os.sep + "北京生态环境1.txt"
            # "wb" opens the file for binary writing: it is created if missing and truncated if it already exists
            f = open(file_path, "wb")
            f.write(result.encode(sys.stdout.encoding))
            f.close()
            print(u"微博写入文件完毕,保存路径:" + file_path)
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
    # Run the crawler
    def start(self):
        try:
            self.get_cookie()
            self.get_username()
            self.get_user_info()
            print("正在爬取数据请耐心等待............")
            self.get_weibo_info()
            self.write_txt()
            print(u"信息抓取完毕")
        except Exception as e:
            print("Error: ", e)
def main():
    try:
        user_id = 1669879400  # replace with any valid numeric user id (other than the id of the crawling account itself); 1669879400 is the example id mentioned above
        filter = 0  # 0 crawls all posts (original + reposts), 1 crawls original posts only
        wb = Weibo(user_id, filter)  # create a Weibo instance
        wb.start()  # crawl the blogger's posts
        print(u"用户名:" + wb.username)
        print(u"全部微博数:" + str(wb.weibo_num))
        print(u"关注数:" + str(wb.following))
        print(u"粉丝数:" + str(wb.followers))
    except Exception as e:
        print("Error: ", e)
        traceback.print_exc()
if __name__ == "__main__":
    get_ck.get_cookies()
    sleep(10)
    main()
2. Scan a QR code to log in and save the cookie values (get_ck.py)
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

browser_options = Options()
browser = webdriver.Chrome(options=browser_options)
headers = {'user-agent': 'your user agent'}
print("浏览器已成功创建。")

def get_cookies(url='https://passport.weibo.com/sso/signin?entry=wapsso&source=wapsso&url=https%3A%2F%2Fm.weibo.cn%2F'):
    browser.get(url)
    print('请在25秒内,使用微博APP扫码登录你的账号...')
    time.sleep(25)
    # Save the cookies of the logged-in session to cookies.txt
    with open('cookies.txt', 'w') as f:
        f.write(json.dumps(browser.get_cookies()))
    print('已成功保存cookie信息。')
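For reference, cookies.txt ends up holding the JSON list returned by Selenium's browser.get_cookies(), which is exactly what Weibo.get_cookie() in part 1 expects: it reads only the name and value keys and joins them into a single Cookie header. A sketch of that structure, with placeholder names and values:

# Illustrative contents of cookies.txt (placeholder names/values, abridged fields);
# real Selenium cookie dicts also carry keys such as "secure" that get_cookie() ignores.
example_cookies = [
    {"domain": ".weibo.cn", "name": "example_name_1", "value": "example_value_1", "path": "/"},
    {"domain": ".weibo.cn", "name": "example_name_2", "value": "example_value_2", "path": "/"},
]
# get_cookie() joins these into: "example_name_1=example_value_1;example_name_2=example_value_2;"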
3. Note: if a single run does not fetch all the data, that is Weibo's anti-crawling mechanism cutting the crawl short. The workaround is to crawl in several runs: record the page number where the previous run stopped, and start the next run from that page, adjusting the page range and the output file name in the code. A minimal sketch of this resume logic is shown below.
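The original script always starts at page 1, so resuming has to be wired in by hand. The sketch below shows one way to do it, assuming a small checkpoint file named last_page.txt; the helper names and the file name are illustrative additions, not part of the original code.

# Hypothetical resume helpers: remember the last page crawled in a checkpoint file
# so the next run can continue where the previous one stopped.
import os

CHECKPOINT = "last_page.txt"  # illustrative file name

def load_start_page():
    # Resume from the page after the one recorded in the checkpoint, or from page 1.
    if os.path.isfile(CHECKPOINT):
        with open(CHECKPOINT, "r") as f:
            return int(f.read().strip()) + 1
    return 1

def save_last_page(page):
    # Record the last page that was fetched successfully.
    with open(CHECKPOINT, "w") as f:
        f.write(str(page))

# Inside get_weibo_info(), the page loop could then read, for example:
#     for page in range(load_start_page(), page_num + 1):
#         ...  # parse the page as before
#         save_last_page(page)
# and write_txt() could include the page range in the output file name so that
# successive runs do not overwrite each other, e.g. "北京生态环境_11-20.txt".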