- Weibo requires you to log in first; after logging in, copy the value of the cookie out of the request Headers.
- You also need the uid of the target user, i.e. the string of digits that usually appears in the user's profile link or in the URLs of images they have posted (see the short sketch after this list).
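If you only have the profile link, the uid is simply the trailing digits. Below is a minimal sketch for pulling it out, assuming the link uses the numeric https://weibo.cn/u/&lt;uid&gt; form (vanity-name links do not contain the uid); the extract_uid helper is only an illustration and is not part of the crawler that follows.

import re

def extract_uid(profile_url):
    # Profile links of the form https://weibo.cn/u/<digits> carry the uid directly.
    m = re.search(r'/u/(\d+)', profile_url)
    return m.group(1) if m else None

print(extract_uid("https://weibo.cn/u/1721030997"))  # -> 1721030997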
import os
import re
from time import sleep

import requests
from bs4 import BeautifulSoup
import urllib3

# Requests below use verify=False, so silence the InsecureRequestWarning
# that urllib3 would otherwise print for every call.
urllib3.disable_warnings()
class WebManager(object):
    def __init__(self):
        # Default request headers; the User-Agent mimics a desktop Chrome browser.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        }
        self.session = requests.Session()
        self.session.headers = self.headers
        # Optional HTTPS proxy; replace or drop it if this address is unreachable.
        self.proxy = {
            "https": '123.207.237.42:3128'
        }
        # Both templates point at the mobile site's "original posts only" list;
        # one is used to read the page count, the other to walk every page.
        self.get_page_num_url = 'https://weibo.cn/u/{}?filter=1&page={}'
        self.photo_get_url = 'https://weibo.cn/u/{}?filter=1&page={}'
    def build_cookie(self, cookie_str):
        # Turn the raw "k1=v1; k2=v2; ..." Cookie header value into a dict
        # and attach it to the session's cookie jar.
        cookie_dict = {}
        for item in cookie_str.split(';'):
            cookie_double_item = item.strip().split("=", 1)
            if len(cookie_double_item) != 2:
                print("Error parsing the cookie!")
                return
            cookie_dict[cookie_double_item[0]] = cookie_double_item[1]
        requests.utils.add_dict_to_cookiejar(self.session.cookies, cookie_dict)
    def set_user_uid(self, uid):
        # Bake the uid into both URL templates; the photo template keeps a
        # "{}" placeholder so the page number can be filled in later.
        self.get_page_num_url = self.get_page_num_url.format(uid, 1)
        self.photo_get_url = self.photo_get_url.format(uid, "{}")
    def get_photo_real_url(self):
        # Read the hidden "mp" input on the first page, which holds the total
        # number of pages, then yield the URL of every list page in turn.
        html = self.session.get(self.get_page_num_url, proxies=self.proxy, verify=False)
        soup = BeautifulSoup(html.content.decode('utf-8'), 'lxml')
        result = soup.find(name='input', attrs={"name": "mp", "type": "hidden"})
        page_num = 1
        if result is not None and 'value' in result.attrs:
            page_num = int(result.attrs.get('value'))
        for i in range(1, page_num + 1):
            yield self.photo_get_url.format(i)
            sleep(5)  # pause between pages to avoid hammering weibo.cn
    def get_yt(self, url):
        # Scan one list page for "原图" ("original image") links and yield the
        # real image URL that each of them redirects to.
        print("start-->{}".format(url))
        html = self.session.get(url, proxies=self.proxy, verify=False)
        result_list = re.compile(pattern='<a href=".+?">.*?</a>').findall(html.content.decode('utf-8'))
        for i in result_list:
            if '原图' in i:
                href = BeautifulSoup(i, 'lxml').a.attrs.get('href')
                # The link answers with a 302 whose Location header is the full-size image.
                redirect_url = self.session.get(href, proxies=self.proxy, verify=False,
                                                allow_redirects=False).headers["Location"]
                yield redirect_url
    def download_url(self, url, file_name):
        resp = self.session.get(url, proxies=self.proxy, verify=False)
        os.makedirs("./img", exist_ok=True)  # make sure the target folder exists
        with open("./img/{}.jpg".format(file_name), "wb") as f:
            f.write(resp.content)
if __name__ == '__main__':
    w = WebManager()
    w.build_cookie(
        "the value of the Cookie key in the request headers after logging in to Weibo on a computer"
    )
    w.set_user_uid("uid of the user you want to crawl, e.g. NetEase Cloud Music's Weibo uid is 1721030997")
    c = 0
    for i in w.get_photo_real_url():
        for j in w.get_yt(i):
            c = c + 1
            w.download_url(j, c)
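One practical note: the proxy address hard-coded in __init__ is only whatever happened to be available when this was written; if it is no longer reachable, every request will fail. A simple workaround (just a usage sketch, not part of the script above) is to connect directly:

w = WebManager()
w.proxy = None  # requests treats proxies=None as "no proxy", so the session connects directly
w.build_cookie("your Cookie header value here")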