import os.path
import re
from io import BytesIO
from urllib.parse import urljoin

import requests
from PIL import Image
# Site root; thread links scraped from the forum listing are relative to it.
BASE_URL = 'http://www.kankan2008.com/'
# Forum listing page whose threads are crawled.
FORUM_URL = BASE_URL + 'forum-126-1.html'
# Directory the converted images are written to.
OUTPUT_DIR = 'girls'
# Seconds to wait on any single HTTP request before giving up, so one
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
}
# Captures the relative thread link inside each `<th class="new">` cell of
# the forum listing; DOTALL because the cell markup spans several lines.
THREAD_LINK_RE = re.compile(
    r'<th class="new">.*? <a href="(thread.*?html).*?</th>', re.S
)
# Captures the `src` of each `class="zoom"` image tag on a thread page.
IMAGE_SRC_RE = re.compile(r'class="zoom" src="(.*?)" onmouseover')
# Pillow modes its JPEG writer accepts without conversion.
JPEG_MODES = ('RGB', 'L', 'CMYK')


def extract_thread_links(html):
    """Return the relative thread links found in a forum listing page."""
    return THREAD_LINK_RE.findall(html)


def extract_image_urls(html):
    """Return the image `src` values found in a thread page."""
    return IMAGE_SRC_RE.findall(html)


def jpeg_file_name(img_url):
    """Return the local file name for *img_url*.

    The name is the last `/`-separated segment of the URL. Only a trailing
    `.webp` extension is rewritten to `.jpg`; "webp" appearing elsewhere in
    the name is left untouched.
    """
    name = img_url.split('/')[-1]
    root, ext = os.path.splitext(name)
    if ext.lower() == '.webp':
        return root + '.jpg'
    return name


def fetch(url):
    """GET *url* with the browser headers and return the response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as content.
    response.raise_for_status()
    return response


def save_as_jpeg(img_data, path):
    """Decode the raw image bytes *img_data* and write them to *path* as JPEG.

    Raises:
        OSError: if the bytes are not a decodable image or the file cannot
            be written.
    """
    with Image.open(BytesIO(img_data)) as im:
        # JPEG has no alpha channel or palette; convert such images first,
        # otherwise Pillow raises "cannot write mode ... as JPEG".
        if im.mode not in JPEG_MODES:
            im = im.convert('RGB')
        im.save(path, 'JPEG')


def main():
    """Download every image from the threads on the forum's first page."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    forum_html = fetch(FORUM_URL).text
    for thread_path in extract_thread_links(forum_html):
        thread_url = BASE_URL + thread_path
        try:
            thread_html = fetch(thread_url).text
        except requests.RequestException as err:
            # One unreachable thread must not abort the remaining ones.
            print(thread_url, "下载失败:", err)
            continue

        for img_src in extract_image_urls(thread_html):
            # Resolve relative `src` values; absolute URLs pass through
            # unchanged.
            img_url = urljoin(thread_url, img_src)
            file_name = jpeg_file_name(img_url)
            try:
                img_data = fetch(img_url).content
                save_as_jpeg(img_data, os.path.join(OUTPUT_DIR, file_name))
            except (requests.RequestException, OSError) as err:
                # Skip broken or undecodable images and keep going.
                print(file_name, "下载失败:", err)
                continue
            print(file_name, "下载完成")


if __name__ == "__main__":
    main()