通过微信公众号文章链接爬取内容，并将爬取到的图片上传到阿里云 OSS
import re,os
import pymysql
import requests
import datetime
import random
import time
import oss2
# Browser-like User-Agent header; get_html() sends it with the page request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
# Object-storage connection settings. The '***' values are placeholders in this
# listing -- presumably the OSS endpoint, the access key id/secret and the
# bucket name; TODO confirm against the oss2 documentation and fill in real
# values before running.
endpoint = '***'
auth = oss2.Auth('***', '***')
# Module-level bucket handle shared by put_oss() and echo_oss().
bucket = oss2.Bucket(auth, endpoint, '***')
# Fetch the HTML of a page
def get_html(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The request carries the module-level ``headers`` (a browser-like
    User-Agent). The HTTP status code is not checked; whatever body the
    server returns is handed back.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout on its own, so without this a stalled
            connection would block the caller indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failures or
            when the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
# Extract the article title
def get_Title(url):
    """Fetch ``url`` and return the article title.

    The title is read from the ``content`` attribute of the page's
    ``og:title`` meta tag.

    Args:
        url: Address of the article page.

    Returns:
        The text captured from the first matching meta tag.

    Raises:
        IndexError: If the page contains no matching ``og:title`` tag.
    """
    page = get_html(url)
    og_title_pattern = '<meta property="og:title" content="(.*?)" />'
    matches = re.findall(og_title_pattern, page, re.DOTALL)
    # Only the first occurrence is used; indexing fails when nothing matched.
    return matches[0]
# Extract the article body
def get_Content(url):
    """Fetch ``url`` and return the inner HTML of the article body.

    The body is whatever sits between the opening ``js_content`` div and
    the first ``</div>`` that follows it (the capture is non-greedy), so
    content after a nested ``</div>`` is not included.

    The returned markup has all newlines and tabs removed and every
    ``data-src`` rewritten to ``src``.

    Args:
        url: Address of the article page.

    Returns:
        The cleaned body HTML as a single-line string.

    Raises:
        IndexError: If the page contains no matching content div.
    """
    page = get_html(url)
    body_pattern = '<div class="rich_media_content " id="js_content".*?">(.*?)</div>'
    # Disabled alternative that also captured the trailing <script> block:
    # resContent = '<div class="rich_media_content " id="js_content".*?">(.*?)</div>.*?(<script.*?</script>)'
    body = re.findall(body_pattern, page, re.DOTALL)[0]
    # Collapse the markup onto one line.
    body = body.replace('\n', '').replace('\t', '')
    # Turn every data-src attribute into a plain src attribute.
    return body.replace('data-src', 'src')
# Collect the image addresses
def get_Content_urls(content):
    """Return the ``(src, data-type)`` pair of every image in ``content``.

    Each ``<img ...>`` tag is inspected on its own, so the two attributes
    may appear in either order and a ``src`` is never paired with the
    ``data-type`` of a different tag (a single pattern run over the whole
    document would do exactly that whenever a tag lacked ``data-type``).

    Tags missing either attribute are skipped, because the type is needed
    downstream as the file extension.

    Args:
        content: HTML fragment to scan.

    Returns:
        A list of ``(url, file_type)`` tuples in document order; empty when
        no image carries both attributes.
    """
    img_tag = re.compile(r'<img\b[^>]*>', re.IGNORECASE)
    src_attr = re.compile(r'src="(.*?)"', re.DOTALL)
    type_attr = re.compile(r'data-type="(.*?)"', re.DOTALL)

    urls = []
    for tag in img_tag.findall(content):
        src_match = src_attr.search(tag)
        type_match = type_attr.search(tag)
        if src_match and type_match:
            urls.append((src_match.group(1), type_match.group(1)))
    return urls
# Download a remote file to local disk
def upload_wj(imgurl, timeout=10):
    """Download an image into the current working directory.

    Despite its name this function downloads; the upload is done by the
    caller.

    Args:
        imgurl: ``(url, file_type)`` pair; ``file_type`` becomes the file
            extension.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The name of the file that was written, ``<YYYYMMDD><uuid>.<ext>``.

    Raises:
        requests.exceptions.RequestException: On connection failures,
            timeouts, or a 4xx/5xx response.
    """
    import uuid  # function-scope import: uuid is not imported at file level

    response = requests.get(imgurl[0], timeout=timeout)
    # Fail loudly instead of saving an HTTP error page as if it were the image.
    response.raise_for_status()

    # The type string is scraped from HTML; keep only alphanumerics so it
    # cannot smuggle path separators into the file name.
    extension = re.sub(r'[^0-9A-Za-z]', '', str(imgurl[1]))
    date_prefix = datetime.datetime.now().strftime("%Y%m%d")
    # A uuid keeps names unique; a 4-digit random suffix can collide within a
    # day and silently overwrite an earlier file under the same name.
    path = date_prefix + uuid.uuid4().hex + '.' + extension
    with open(path, "wb") as fp:
        fp.write(response.content)
    return path
# Upload to OSS and return the resulting URL
def put_oss(imgurl):
    """Mirror one remote image into the OSS bucket.

    The image is downloaded to a local file, uploaded under a key equal to
    that file name, and a signed GET URL for the object is returned. The
    local file is removed afterwards, even when the upload or signing
    fails.

    Args:
        imgurl: ``(url, file_type)`` pair, passed straight to ``upload_wj``.

    Returns:
        The signed URL produced by ``bucket.sign_url``; the expiry argument
        is 5 * 365 days expressed in seconds.
    """
    # Download the remote file; the returned local path doubles as the key.
    local_path = upload_wj(imgurl)
    try:
        # Upload the file to OSS.
        bucket.put_object_from_file(local_path, local_path)
        # Build the URL for the uploaded object.
        return bucket.sign_url('GET', local_path, 60 * 60 * 24 * 365 * 5)
    finally:
        # Always delete the local copy so failed uploads leave nothing behind.
        os.remove(local_path)
# Replace a file address inside the article
def replace_src(yurl, ossurl, content):
    """Replace every occurrence of ``yurl`` in ``content`` with ``ossurl``.

    Both URLs are treated as literal text. Interpreting ``yurl`` as a
    regular expression breaks on ordinary URLs (``?`` and ``.`` are
    metacharacters), and a regex replacement string would have its
    backslash escapes processed.

    Args:
        yurl: Original address to look for.
        ossurl: Address to put in its place.
        content: Text to rewrite.

    Returns:
        The rewritten text; ``content`` unchanged when ``yurl`` is empty.
    """
    if not yurl:
        # An empty search string would insert ossurl between every character.
        return content
    return content.replace(yurl, ossurl)
# Check whether the bucket exists
def does_bucket_exist(bucket):
    """Report whether ``bucket`` exists.

    Existence is probed by calling ``bucket.get_bucket_info()``.

    Args:
        bucket: Object exposing ``get_bucket_info()``.

    Returns:
        ``False`` when the call raises ``oss2.exceptions.NoSuchBucket``,
        ``True`` when it succeeds. Any other exception propagates to the
        caller.
    """
    try:
        bucket.get_bucket_info()
    except oss2.exceptions.NoSuchBucket:
        return False
    else:
        return True
# List the contents of the bucket
def echo_oss():
    """Print the key of every object in the module-level ``bucket``."""
    # The result is not used; the call is kept so that any error it raises
    # still surfaces before the listing starts.
    bucket.get_bucket_info()
    object_keys = (entry.key for entry in oss2.ObjectIterator(bucket))
    for key in object_keys:
        print(key)
def conn_mysql():
    """Open and return a new MySQL connection.

    The '***' values are placeholders for the real host, credentials and
    database name.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    host = '***'
    username = '***'
    password = '***'
    dbname = '***'
    # Keyword arguments: PyMySQL 1.0+ no longer accepts these positionally,
    # and the keyword form also works on older releases.
    db = pymysql.connect(host=host, user=username, password=password, database=dbname)
    return db
def run(url):
    """Scrape one article and re-host its images on OSS.

    Fetches the title and body of the article at ``url``, uploads every
    image found in the body via ``put_oss`` and rewrites the body so the
    images point at the uploaded copies. Progress and the final body are
    printed.

    Args:
        url: Address of the article page.

    Returns:
        The string 'url为空' when ``url`` is empty or ``None``; otherwise
        ``None``.
    """
    # db = conn_mysql()
    if not url:
        return 'url为空'
    # The title is only consumed by the disabled persistence step below.
    title = get_Title(url)
    content = get_Content(url)
    images = get_Content_urls(content)
    print(images)
    # Each distinct address is uploaded once: the first replacement already
    # rewrites every occurrence, so a repeat upload would only leave an
    # unreferenced object in the bucket.
    handled = set()
    for image in images:
        yurl = image[0]
        if yurl in handled:
            continue
        handled.add(yurl)
        print(yurl)
        print('图片存入oss')
        ossurl = put_oss(image)
        print('存入成功')
        # Point the article at the uploaded copy.
        content = content.replace(yurl, ossurl)
        print('替换成功')
    print(content)
    print('运行结束')
    # Persisting to the database (currently disabled):
    # sql = 'INSERT INTO ***(title,content,create_time) VALUES (%s,%s,%s)'
    # nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    # data = [title, content, str(nowtime)]
    # db.cursor().execute(sql, data)
    # db.commit()
    # db.close()
然后通过Flask调用写成接口形式即可。