准备
从浏览器中获取自己微博账号的Cookie、User-Agent、uid:在自己微博(weibo.cn)主页上按F12,点进Network面板,从任一请求头中复制。
在自己py文件同文件夹中创建node.json,内容写入 {"weibojiedian": {"3897000000": "M_HmUe44444"}}(必须使用英文半角双引号,否则不是合法JSON)。代码启动时会打开此文件,没有的话会报错。
此json文件是为了记录此次下载到哪个文章,以防重复下载。
代码将图片链接保存到桌面TXT文件中,可以用下载软件如:IDM等自行下载
本代码有3处需自行输入:fw方法中的User-Agent、Cookie,以及uidget方法中关注列表URL里的自己的uid。
# #python3.7 windows
import requests
from bs4 import BeautifulSoup as bs
import re
import time
from tkinter import *
import winreg,os,time
import json
def get_desktop():
    """Return the current user's Desktop directory path.

    Reads the per-user ``Shell Folders`` registry key so redirected or
    localized Desktop locations are honored. Windows-only (``winreg``).

    Returns:
        str: absolute path of the Desktop folder.
    """
    # Use the key as a context manager so the registry handle is closed;
    # the original opened it and leaked the handle.
    with winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders') as key:
        return winreg.QueryValueEx(key, "Desktop")[0]
class Wb():
    """Scrape image URLs from the weibo.cn mobile pages of every account
    the logged-in user follows, appending the URLs to a text file.

    Workflow: ``run()`` -> ``uidget()`` (collect followed uids) ->
    ``chuli_a()`` (walk one account's post pages up to a checkpoint) ->
    ``chuli_er()`` (extract image URLs from one post) -> ``xr()``
    (append URLs to the output file). Progress per uid is checkpointed
    in ``node.json`` via the module-level global ``bj``.
    """

    def __init__(self, b):
        # b: path of the output text file that collects image URLs.
        self.chucu = []      # URLs whose request raised an exception
        self.chuc = []       # URLs that came back with a non-200 status
        self.uidlist = []    # followed uids, in discovery order
        self.uidset = set()  # same uids, for O(1) de-duplication
        self.path1 = r'node.json'  # checkpoint file; must already exist
        self.path2 = b             # output txt file for image URLs

    def fw(self, url):
        """GET *url* with the user's weibo Cookie and return the decoded
        body, or the sentinel string ``'chuc'`` on any failure (the URL
        is recorded in ``self.chuc`` / ``self.chucu``)."""
        headers = {
            # NOTE: fill in your own browser User-Agent and weibo Cookie.
            'User-Agent': '需自己输入',
            'Cookie': '需自己输入'}
        try:
            r = requests.get(url, headers=headers, timeout=5)
            time.sleep(2)  # throttle between requests to avoid bans
            if r.status_code == 200:
                return r.content.decode('utf-8')
            else:
                self.chuc.append(url)
                return ('chuc')
        except:  # any network/timeout/decode error is reported as 'chuc'
            self.chucu.append(url)
            return ('chuc')

    def uidget(self):
        """Walk the paginated 'follow' list and collect every followed
        uid into ``self.uidlist`` / ``self.uidset``."""
        i = 1
        while True:
            # NOTE: replace 需自己输入 with your own uid.
            url = 'https://weibo.cn/需自己输入/follow?page=' + str(i)
            res = self.fw(url)
            i += 1
            # Stop on fetch failure, or once a page no longer contains the
            # "unfollow" marker (i.e. we are past the last follow page).
            if res == 'chuc' or '取消关注' not in res:
                break
            else:
                # Each followed account shows up as remark.uid=<digits>.
                uide = re.findall('remark.uid=\d+', res)
                for line in uide:
                    uid = re.findall('\d\d\d+', line)[0]
                    if uid not in self.uidset:
                        self.uidlist.append(uid)
                        self.uidset.add(uid)

    def chuli_a(self, uid, bi):
        """Walk the post pages of one account (*uid* is the account's
        weibo.cn URL) until the checkpoint post id *bi* is reached.

        Side effect: sets the module-level global ``bj`` to the newest
        post id seen, which ``run()`` writes back to node.json as the
        next checkpoint.
        """
        i = 1
        a = 1  # counts processed posts; a == 1 marks the newest post
        global bj
        bj = bi
        while True:
            page_list = uid + '?page=' + str(i)
            i += 1
            # self.stop is (re)set to [] by run() per uid; chuli_a pushes
            # '1' into it once the checkpoint post is found.
            if '1' in self.stop:
                break
            else:
                pass
            r = self.fw(page_list)
            if r == 'chuc':
                print('出错:' + page_list)
            # '转发' ("repost") appears on every post; its absence means
            # an empty/last page (or a failed fetch).
            if '转发' not in r:
                break
            else:
                # Each post sits in its own <div class="c" id="M_...">.
                page_div_list = r.split('<div class="c" id="')
                for pic, pic_page in enumerate(page_div_list):
                    # Post ids look like M_ followed by 9 characters.
                    find = re.findall('M_.........', pic_page)
                    if pic == 0:
                        pass  # split head before the first post: skip
                    elif bi in find:
                        # Reached the previously saved checkpoint: flag
                        # the outer loop to stop for this account.
                        self.stop.append('1')
                        break
                    else:
                        if a == 1:
                            # Newest post id becomes the new checkpoint.
                            bj = find[0]
                        self.chuli_er(pic_page, uid)
                        a += 1

    def chuli_er(self, pic_page, uid):
        """Extract the large-size image URLs from one post's HTML
        fragment *pic_page* and hand them to ``xr()``."""
        if '组图共' in pic_page:
            # Multi-image post: follow the "album" (picAll) page and pull
            # every thumbnail, upgrading thumb180 -> large resolution.
            find = re.findall('<a href="https://weibo.cn/mblog/picAll/\S+.rl=1">组图共\d张</a>', pic_page)
            pic_page_list = re.findall("https://weibo.cn/mblog/picAll/\S+rl=1", find[0])
            req = self.fw(pic_page_list[0])
            soup_a = bs(req, 'html.parser')
            pic_url = soup_a.find_all('img', alt="图片加载中...")
            pic_url_list = [img.attrs['src'].replace('thumb180', 'large') for img in pic_url]
            self.xr(pic_url_list, uid)
        elif '原图' in pic_page:
            # Single-image post: take the inline wap180 thumbnail URL and
            # upgrade it to the large resolution.
            find = re.findall(
                '<img src="http://..\d.sinaimg.cn/wap180/\S+(?:.jpg|.gif|.bmp|.png|.jpeg)" alt="图片" class="ib"',
                pic_page)
            find_a = find[0]
            pic_url_list_a = re.findall('http://..\d.sinaimg.cn/wap180/\S+(?:.jpg|.gif|.bmp|.png|.jpeg)', find_a)
            pic_url_list = [pic_url_list_a[0].replace('wap180', 'large')]
            self.xr(pic_url_list, uid)
        elif 'sinaimg.cn' not in pic_page or 'sinaimg.cn/upload' in pic_page:
            pass  # no image, or an emoji/upload asset: nothing to save
        else:
            # Unrecognized image markup: dump the fragment for inspection.
            print(pic_page)

    def xr(self, list, uid):
        """Append each URL in *list* to the output file, one per line.

        NOTE(review): the parameter name shadows the builtin ``list``,
        and ``fi`` (digits extracted from *uid*) is computed but unused —
        kept as-is to preserve behavior.
        """
        fi = re.findall('\d+', uid)[0]
        with open(self.path2, 'a') as f:
            for line in list:
                f.write(line + '\n')

    def run(self):
        """Entry point: gather followed uids, scrape each one up to its
        stored checkpoint, and persist the new checkpoints to node.json
        after every account."""
        global bj
        self.uidget()
        with open(self.path1, 'r') as f:
            load_dict = json.load(f)
        # dic aliases load_dict['weibojiedian'], so updating dic also
        # updates load_dict before it is dumped back to disk.
        dic = load_dict['weibojiedian']
        for uid in self.uidlist:
            print(uid)
            self.stop = []  # per-uid stop flag consumed by chuli_a()
            # "NA" means no checkpoint yet -> scrape the whole account.
            bi = dic.get(uid, "NA")
            print(bi)
            uid_li = 'https://weibo.cn/u/' + str(uid)
            self.chuli_a(uid_li, bi)
            # chuli_a left the newest post id in global bj.
            dic[uid] = bj
            dic_w = str(dic)  # NOTE(review): unused; dead code
            # Re-dump after each uid so progress survives a crash.
            with open(self.path1, 'w+') as f:
                json.dump(load_dict, f)
        print('page finish')
        print(self.chuc)   # non-200 URLs
        print(self.chucu)  # URLs that raised exceptions
if __name__ == '__main__':
    # Output file lives on the Desktop, named with today's month-day
    # so each run's harvest is kept separate.
    desktop_dir = get_desktop()
    stamp = time.strftime("%m-%d", time.localtime())
    out_path = os.path.join(desktop_dir, '微博图片爬取' + stamp + '.txt')
    Wb(out_path).run()