#!/usr/bin/env python
# encoding: utf-8
'''
@author: JHC
@license: None
@contact: JHC000abc@gmail.com
@file: start.py
@time: 2022/07/06/ 17:06
@desc: since_id is sometimes missing from the response; re-running the script a few times usually works
'''
import requests
import re
from threading import Thread
# Cookies sent with every Weibo page request.
# NOTE(review): "SUB" is presumably the logged-in session token — it is a
# hard-coded credential; confirm it should live in source control.
cookies = {
    "SUB": "_2A25Pz5bbDeRhGeFJ7VEV9i_FzjWIHXVsvI8TrDV8PUNbmtAKLW2hkW9Nf2FO4AI2fqiHmH8XvBCKkwDRmTSZsiW3",
}

# Extra HTTP headers; only used by get_weibo_pic() for the paged album requests.
headers = {
    "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
}
def down_load_pic(url):
    """Download one picture and save it under ``./pic_src``.

    The file name is the last ``/``-separated segment of ``url``.

    Args:
        url: Absolute URL of the picture to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    import os  # local import so this block does not depend on file-level changes

    # A timeout keeps a stalled connection from blocking the worker thread forever.
    res = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as a picture.
    res.raise_for_status()

    name = url.split("/")[-1]
    # open() does not create missing directories; make sure the target exists.
    os.makedirs("./pic_src", exist_ok=True)
    with open("./pic_src/{}".format(name), mode="wb") as fp:
        fp.write(res.content)
def get_weibo_pic(since_id):
    """Crawl the album pages after the first one and download their pictures.

    Starting from ``since_id``, each page is requested from the album loading
    endpoint, every ``photo_pict`` image found is downloaded on its own
    thread, and the ``since_id`` found in the response is used to request the
    next page. Paging stops when a response carries no further ``since_id``.

    The pages are walked in a loop rather than by recursion, so a long album
    cannot exhaust Python's recursion limit.

    Args:
        since_id: Paging cursor of the first page to request (without the
            ``-1`` suffix, which is appended here).

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    # Cursors already requested; protects the loop against a response that
    # keeps pointing back to a page we have already fetched.
    seen = set()
    while True:
        seen.add(since_id)
        params = {
            'page_id': '1004061646239802',
            'ajax_call': '1',
            "since_id": "{}-1".format(since_id),
        }
        response = requests.get(
            'https://weibo.com/p/aj/album/loading',
            params=params,
            cookies=cookies,
            headers=headers,
            timeout=30,
        )
        # Backslashes are stripped once so both patterns below match the
        # markup (presumably the HTML arrives escaped inside a JSON payload).
        page = response.text.replace("\\", "")

        for path in re.findall('<img class="photo_pict" src="(.*?)"/></a>', page):
            # The matched src has no scheme, so "https:" is prepended.
            Thread(target=down_load_pic, args=("https:" + path,)).start()

        next_ids = re.findall('&since_id=(.*?)-1', page)
        print("since_id = ", next_ids)
        if not next_ids or next_ids[0] in seen:
            break
        since_id = next_ids[0]

    print("已经爬取完成!!!")
def get_first_msg(url, retries=3):
    """Crawl the first (profile) page and return the cursor for the next page.

    The page is requested with the module-level ``cookies``; every
    ``photo_pict`` image found in it is downloaded on its own thread.

    The response sometimes carries no ``since_id``, so the request is
    repeated up to ``retries`` times until one is found. Pictures are
    downloaded from the last response only, so retries do not cause
    duplicate downloads.

    Args:
        url: URL of the profile page to crawl.
        retries: Maximum number of attempts to obtain a page that contains a
            ``since_id``. Values below 1 are treated as 1.

    Returns:
        The first ``since_id`` found in the page, as a string.

    Raises:
        IndexError: If no ``since_id`` was found after all attempts.
        requests.RequestException: If a request fails or times out.
    """
    attempts = max(1, retries)
    page = ""
    next_ids = []
    for _ in range(attempts):
        # NOTE(review): unlike get_weibo_pic(), no custom headers are sent
        # here — confirm whether that is intentional.
        response = requests.get(url, cookies=cookies, timeout=30)
        # Strip backslashes once so both patterns below match the markup.
        page = response.text.replace("\\", "")
        next_ids = re.findall('&since_id=(.*?)-1', page)
        if next_ids:
            break

    for path in re.findall('<img class="photo_pict" src="(.*?)"/></a>', page):
        # The matched src has no scheme, so "https:" is prepended.
        Thread(target=down_load_pic, args=("https:" + path,)).start()

    if not next_ids:
        # Same exception type as the original bare since_id[0], but with a
        # message that says what actually went wrong.
        raise IndexError(
            "no since_id found in the first page after {} attempt(s)".format(attempts)
        )
    return next_ids[0]
# Script entry: ask for the profile page URL on stdin.
url = input("请输入要爬取的主页链接:")
# Crawl the first page; it yields the paging cursor for the remaining pages.
since_id = get_first_msg(url=url)
# Follow the cursor through the remaining album pages.
get_weibo_pic(since_id=since_id)
# Weibo profile-page picture crawler
# First published on 2022-07-11 18:21:29