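# Requirement: crawl a WeChat official-account article, download the images it contains to local disk, upload them to Aliyun OSS, and rewrite the image URLs in the original article.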
import requests
import re
import os
import json
import oss2
from lxml import etree
# Browser-like request headers; used for the article request and the
# video-metadata request below.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64'
    ),
}

# Aliyun OSS settings ('XXX' values are placeholders to be filled in).
oss2_ak = 'XXX'        # first argument of oss2.Auth
oss2_sk = 'XXX'        # second argument of oss2.Auth
oss2_endpoint = 'XXX'  # endpoint passed to oss2.Bucket; also part of the returned URL
oss2_bucket = 'XXX'    # bucket name passed to oss2.Bucket; also part of the returned URL
def zhuaqu(url=None, timeout=30):
    """Fetch a WeChat article page and return its HTML as text.

    Args:
        url: Article URL to fetch. When omitted, the article that used to be
            hard-coded in this function is fetched, so existing callers that
            pass no argument behave as before.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: On connection failure or timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    if url is None:
        url = 'https://mp.weixin.qq.com/s?__biz=MzUzMzQ0NjQ2OA==&mid=2247505777&idx=1&sn=1fc52faec6399ce72692ba28cc16920b&chksm=faa17d5acdd6f44c8e8c9a2ad456f8adf1aa029f0588c5e9cbdd3efcbb1e81a77aa7e26ed0ca&cur_album_id=1862926854316163084&scene=189#rd'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    content = response.content.decode('utf8')
    return content
def save_html(res):
    """Write the HTML text ``res`` to ``test.html`` in the working directory.

    The file is opened in write mode with UTF-8 encoding, so any previous
    content is replaced.

    Args:
        res: The HTML text to store.
    """
    with open("test.html", mode="w", encoding="utf-8") as out:
        out.write(res)
def open_html():
    """Collect image URLs and video ids from the saved ``./test.html``.

    Returns:
        A ``(html_pic_list, html_video_list)`` tuple.

        * ``html_pic_list``: every ``data-src="..."`` attribute value that
          contains the substring ``'yiaush'``, in document order (duplicates
          are kept). NOTE(review): 'yiaush' is presumably a marker of the
          images that should be mirrored -- confirm.
        * ``html_video_list``: the distinct matches of ``wxv_`` followed by
          19 characters, in first-seen order.
    """
    # A context manager closes the file even if reading fails.
    with open('./test.html', 'r', encoding="utf-8") as htmlf:
        htmldetail = htmlf.read()

    html_pic_list = [
        item
        for item in re.findall(r'data-src="(.*?)"', htmldetail)
        if 'yiaush' in item
    ]

    # dict.fromkeys de-duplicates while keeping a stable, first-seen order
    # (a plain set would make the order vary between runs).
    html_video_list = list(dict.fromkeys(re.findall(r"wxv_.{19}", htmldetail)))
    return html_pic_list, html_video_list
def save_pic(pic_url):
    """Download one image into the ``D:\\pic\\`` directory unless already there.

    The local file name is ``<id>.<type>``, both parts taken from the URL:
    ``<type>`` is the text between ``mmbiz_`` and the next ``/``, and ``<id>``
    is the path segment that follows the first ``g/`` or ``f/``.

    Args:
        pic_url: Image URL as found in the article HTML.

    Returns:
        ``(new_path, img_type)`` -- the id and type strings described above.
        They are returned whether or not the download succeeded; a failure
        is only reported on stdout.

    Raises:
        IndexError: If ``pic_url`` does not contain the expected patterns.
    """
    d = 'D:\\pic\\'
    # The pattern has two alternative groups, so findall yields a 2-tuple in
    # which at most one entry is non-empty; pick that one as a plain string.
    groups = re.findall(r'g/(.*?)/|f/(.*?)/', pic_url)[0]
    new_path = groups[0] or groups[1]
    img_type = re.findall(r'mmbiz_(.*?)/', pic_url)[0]
    path = d + str(new_path) + '.' + img_type
    try:
        os.makedirs(d, exist_ok=True)
        if os.path.exists(path):
            print("图片已存在")
        else:
            r = requests.get(pic_url, timeout=30)
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)
            print("图片保存成功")
    except (requests.RequestException, OSError):
        # Only network and filesystem errors are treated as a failed
        # download; programming errors and Ctrl-C still propagate.
        print("图片获取失败")
    return new_path, img_type
def save_video(wxv):
    """Download the video identified by ``wxv`` to ``D:\\video\\<wxv>.mp4``.

    If the target file already exists nothing is requested. Otherwise the
    play-URL endpoint is queried, the ``url`` of the first ``url_info`` entry
    in its JSON reply is downloaded, and the bytes are written to disk.

    Args:
        wxv: Video id string (appended to the endpoint's ``vid=`` parameter).

    Returns:
        ``wxv`` unchanged, whether or not the download succeeded; a failure
        is only reported on stdout.
    """
    # NOTE(review): __biz, mid and idx are fixed values here, independent of
    # the article being processed -- confirm the endpoint does not need them
    # to match.
    video_url_temp = "https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&preview=0&__biz=MzI4MzkzMTc3OA==&mid=2247488495&idx=4&vid=" + wxv
    d = 'D:\\video\\'
    video_path = d + str(wxv) + '.mp4'
    try:
        os.makedirs(d, exist_ok=True)
        if os.path.exists(video_path):
            # Checked first so an existing file costs no network round trip.
            print("视频已存在")
            return wxv

        response = requests.get(video_url_temp, headers=headers, timeout=30)
        content = json.loads(response.content.decode())
        url_info = content.get("url_info") or []
        if not url_info:
            # The reply carried no playable URL; report it like any other
            # failed download instead of crashing on url_info[0].
            print("视频获取失败")
            return wxv
        video_url2 = url_info[0].get("url")

        html = requests.get(video_url2, timeout=60)
        html.raise_for_status()
        with open(video_path, 'wb') as f:
            f.write(html.content)
        print("视频保存成功")
    except (requests.RequestException, OSError, ValueError):
        # ValueError covers undecodable or non-JSON metadata replies.
        print("视频获取失败")
    return wxv
def upload_file_to_oss2(oss2_file_name, local_file_name):
    """Upload a local file to the configured Aliyun OSS bucket.

    Args:
        oss2_file_name: Object name in the bucket (including its path).
        local_file_name: Local file to upload (absolute path).

    Returns:
        ``https://<bucket>.<endpoint>/<oss2_file_name>`` when the put
        response reports status 200, otherwise ``None``.
    """
    credentials = oss2.Auth(oss2_ak, oss2_sk)
    target_bucket = oss2.Bucket(credentials, oss2_endpoint, oss2_bucket)
    result = target_bucket.put_object_from_file(oss2_file_name, local_file_name)
    if result.status != 200:
        return None
    return ''.join(['https://', oss2_bucket, '.', oss2_endpoint, '/', oss2_file_name])
def main():
    """Fetch the article, mirror its images/videos to OSS and rewrite the HTML.

    Steps:
        1. Download the article and save it as ``test.html``.
        2. Extract the image URLs and video ids from that file.
        3. For each image: download it, upload it under ``images/`` and
           replace its ``data-src="<original url>"`` attribute with
           ``src="<oss url>"``.
        4. For each video: download it, upload it under ``images/`` and add a
           ``src`` (plus width/height) right after its ``;vid=<id>"`` marker.
        5. Save the rewritten HTML back to ``test.html`` after each phase.

    Media whose download or upload failed is left untouched in the HTML.
    """
    # Fetch the page and keep a local copy.
    content = zhuaqu()
    save_html(content)

    # Extract the media references from the saved copy.
    html_pic_list, html_video_list = open_html()

    if html_pic_list:
        with open('./test.html', 'r', encoding="utf-8") as htmlf:
            htmldetail = htmlf.read()
        for item in html_pic_list:
            new_path, img_type = save_pic(item)
            local_path = 'D:\\pic\\' + str(new_path) + '.' + img_type
            if not os.path.exists(local_path):
                # Download failed (already reported by save_pic).
                continue
            oss2_file_name = 'images/' + new_path + '.' + img_type
            url = upload_file_to_oss2(oss2_file_name, local_path)
            if not url:
                print("图片上传失败")
                continue
            # `item` is the exact text captured from the data-src attribute,
            # so it can be matched verbatim instead of being rebuilt from
            # its parts; the new value is quoted to stay a valid attribute.
            htmldetail = htmldetail.replace(
                'data-src="{}"'.format(item),
                'src="{}"'.format(url),
            )
        save_html(htmldetail)

    if html_video_list:
        # Re-read the file so the image rewrites saved above are included.
        with open('./test.html', 'r', encoding="utf-8") as htmlf:
            htmldetail = htmlf.read()
        for item in html_video_list:
            new_path = save_video(item)
            local_path = 'D:\\video\\' + str(new_path) + '.mp4'
            if not os.path.exists(local_path):
                # Download failed (already reported by save_video).
                continue
            oss2_file_name = 'images/' + new_path + '.mp4'
            url = upload_file_to_oss2(oss2_file_name, local_path)
            if not url:
                print("视频上传失败")
                continue
            # Match on the video id itself and point src at the file that
            # was just uploaded.
            marker = ';vid={}"'.format(item)
            htmldetail = htmldetail.replace(
                marker,
                '{} src="{}" width=100% height="300px"'.format(marker, url),
            )
        save_html(htmldetail)
    # TODO: save the result to a database (not implemented here).
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger network requests or file writes.
if __name__ == "__main__":
    main()