# 本文只供学习,不做他用!
import re
import threading
import requests
import os
import time
import openpyxl
# Crawl the cover images of every listing page and store them.
# Browser-like User-Agent header sent with each HTTP request in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.108 Safari/537.36'
    ),
}
# Module-level list; none of the code visible here appends to this global.
lst_name = []
# Patterns for the three pieces of data scraped from a listing page.  They are
# raw strings so the regex escapes (\s, \(, \[) are not read as invalid string
# escapes, which newer Python versions warn about.  The compiled patterns are
# identical to the ones the non-raw literals produced.
_COVER_RE = re.compile(
    r'<div class="img_book"[.\s]*style="background:url\((.*?)\)'
)
_ENTRY_RE = re.compile(
    r"<a href='(?P<url>.*?)'>(?P<title>.*?)</a> <br /> \[(?P<author>.*?)\] <br />"
)
_SCORE_RE = re.compile(r'<p style=".*?"><b>(.*?)</b></p>')


def get_url(n, save_covers=False):
    """Fetch listing page *n* and write its entries to the Excel export.

    The page HTML is scanned with three regular expressions: cover image
    URLs, ``(detail URL, title, author)`` triples, and score strings.  The
    triples and scores are handed to ``save_xlsx``.

    Args:
        n: Page number substituted into the listing URL.
        save_covers: When true, additionally download every cover image
            through ``save_page``, using the scraped titles as names.
            Defaults to False, which matches the previous behaviour where
            that call was commented out.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = f'https://mox.moe/l/all,all,all,sortpoint,all,all/{n}.htm'
    # A timeout keeps a stalled connection from blocking the calling thread
    # forever; the status check avoids parsing (and exporting) an error page.
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    html = resp.text

    pic_url = _COVER_RE.findall(html)      # cover thumbnail URLs
    pic_name = _ENTRY_RE.findall(html)     # (url, title, author) tuples
    pic_score = _SCORE_RE.findall(html)    # score strings

    if save_covers:
        titles = [entry[1] for entry in pic_name]
        save_page(pic_url, titles)
    save_xlsx(pic_name, pic_score)
def save_page(url, name):
    """Download cover images and store them as ``.jpg`` files in 动漫封面.

    Args:
        url: Iterable of image URLs.
        name: Iterable of titles paired with ``url`` by position; each title
            becomes the file name.  Pairing stops at the shorter iterable.

    A download that fails (network error, timeout, HTTP error status) is
    reported on stdout and skipped so the remaining covers are still saved.
    """
    folder = '动漫封面'
    # Create the folder here as well so the function works when called on
    # its own, not only after the script's start-up code has run.
    os.makedirs(folder, exist_ok=True)
    # Path separators and characters Windows forbids in file names would make
    # open() fail or write outside the folder; replace them with underscores.
    unsafe = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    for url1, name1 in zip(url, name):
        print(url1, name1)
        file = os.path.join(folder, name1.translate(unsafe))
        try:
            resp = requests.get(url1, headers=headers, timeout=30)
            # Without this an HTML error page would be saved as a .jpg.
            resp.raise_for_status()
        except requests.RequestException as exc:
            print(f'{name1}下载失败: {exc}')
            continue
        with open(file + '.jpg', 'wb') as f:
            f.write(resp.content)
        print(name1 + '已保存')
# Rows gathered by every save_xlsx() call made in this process.  The script
# calls save_xlsx() from several threads; the lock serialises those calls so
# they neither interleave their writes nor overwrite each other's pages.
_XLSX_LOCK = threading.Lock()
_XLSX_ROWS = []


def save_xlsx(name, score):
    """Add one page of entries to the export and rewrite 动漫信息.xlsx.

    Each call used to build a fresh workbook and save it to the same path,
    so when several pages were exported only one page's rows survived (and
    concurrent saves could clash).  Rows are now accumulated for the
    lifetime of the process and the whole workbook is rewritten on every
    call; a single call still produces exactly the file it did before.

    Args:
        name: Iterable of ``(detail URL, title, author)`` tuples.
        score: Sequence of scores matched to ``name`` by position.  If it is
            shorter than ``name`` the missing score cells are left empty
            instead of raising ``IndexError``.

    The first column is a 0-based running number over all accumulated rows.
    Rows appear in the order the calls complete, which is not necessarily
    page order when the calls come from concurrent threads.
    """
    page_rows = []
    for position, entry in enumerate(name):
        rating = score[position] if position < len(score) else None
        # Column order: title, author, detail page URL, score.
        page_rows.append((entry[1], entry[2], entry[0], rating))

    with _XLSX_LOCK:
        _XLSX_ROWS.extend(page_rows)
        wb = openpyxl.Workbook()
        sheet = wb.active
        sheet.append(['序号', '名称', '作者', '详情页', '评分'])
        for serial, row in enumerate(_XLSX_ROWS):
            sheet.append([serial, *row])
        wb.save('动漫信息.xlsx')
if __name__ == '__main__':
    # Make sure the cover folder exists before any worker runs.
    if os.path.exists('动漫封面'):
        print('文件夹已建立')
    else:
        os.mkdir('动漫封面')

    # At most five pages are fetched at the same time.  The semaphore used to
    # be created but never acquired, so it limited nothing.
    limiter = threading.BoundedSemaphore(5)

    def _crawl(page):
        """Run get_url for one page while holding a semaphore slot."""
        with limiter:
            get_url(page)

    # Pages 1-9; widen the range to crawl more pages.
    workers = [
        threading.Thread(target=_crawl, args=(n,))
        for n in range(1, 10)
    ]
    for worker in workers:
        worker.start()
    # Wait for every worker.  The earlier active_count() check ran right
    # after the threads were started, so the completion message was
    # effectively never printed.
    for worker in workers:
        worker.join()
    print('全部保存完毕')