Python自动爬取知乎图片
一级目录
使用Python自动爬取知乎某一问题的所有回答中的全部图片
代码部分
主函数部分
#用于下载知乎某一问题下的回答中的图片
import requests
from bs4 import BeautifulSoup
import json
import datetime
import os
import re
import ImgLoad
import fileCount
#找到网站的答案的真实地址,并下载其data,然后从data中找到图片的下载地址,并将其下载到本地。
# #其中offset表示从第几条回答开始请求,后面会用来循环。sort表示回答的排列顺序
# Fetch the answers JSON for a Zhihu question, extract every image URL from the
# answer HTML, and write the de-duplicated name list to <name>/<name>.txt.
# offset: index of the first answer to request; sort: answer ordering. Both are
# currently unused because the caller bakes them into question_url directly
# (kept in the signature for backward compatibility).
def download(offset, sort, name, question_url):
    offset = str(offset)  # kept for compatibility; not used below
    img_name_gather = set()  # collected file names, so duplicates are downloaded once
    url = question_url
    html = requests.get(url=url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'}).text
    res = json.loads(html)
    # Create the output folder on first run.
    if not os.path.exists(name):
        os.mkdir(name)
        print('文件夹创建成功')
    # FIX: compile once outside the loops (was recompiled per image) and use a
    # raw string so '\w' is not an invalid escape sequence in the literal.
    name_compile = re.compile(r'\w+-\w+')
    for i, item in enumerate(res['data']):
        # The full-resolution image URLs live in the <noscript><img> fallback markup.
        content = BeautifulSoup(item['content'], 'lxml')
        imgurls = content.select('noscript img')
        for imgurl in imgurls:
            # Prefer the original-size URL; fall back to the inline src.
            try:
                src = imgurl['data-original']
            except KeyError:
                src = imgurl['src']
            # Extension such as '.jpg'; strip any trailing query string.
            img = src[src.rfind('.'):]
            img_format = img.split("?", 1)[0]
            # FIX: guard against URLs without a recognizable name instead of
            # letting findall(...)[0] raise IndexError.
            matches = name_compile.findall(src)
            if not matches:
                continue
            img_fullName = matches[0] + img_format
            img_name_gather.add(img_fullName)
    # Rewrite the URL list from scratch on every run.
    if os.path.exists(name + '/' + name + '.txt'):
        os.remove(name + '/' + name + '.txt')
    with open(name + '/' + name + '.txt', 'a+') as tf:
        for img_name in img_name_gather:
            tf.write('https://pic1.zhimg.com/50/' + img_name + '\n')
    print('待下载图片数量:', len(img_name_gather))
# Entry point: ask the user for a folder name and the answers-API URL, write
# the image-URL list, download the images, then compare counts.
# (The Zhihu API pages answers with offset steps of 5; a paging loop existed
# here as commented-out dead code and has been removed.)
if __name__ == '__main__':
    file_name = input("请输入待创建的文件夹名称:")
    url = input("请输入url:")
    print('开始爬取:', datetime.datetime.now())
    # Write <file_name>/<file_name>.txt with one image URL per line.
    download(offset=1, sort='default', name=file_name, question_url=url)
    # Download every image listed in the txt file into the folder.
    ImgLoad.img_downLoad(file_name + '/', file_name + '.txt')
    print('图片下载完毕', datetime.datetime.now())
    # Report files on disk versus URLs expected.
    fileCount.file_count(file_name, file_name + '.txt')
    input('输入任意键退出...')
ImgLoad.py部分
from sys import path
import requests
from bs4 import BeautifulSoup
import json
import time
import uuid
import datetime
import os
import imghdr
import re
# Shared request headers: a desktop Edge/Chrome User-Agent so Zhihu serves
# normal desktop API responses.
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'}
# Sample image URLs — nothing in this file references them (presumably kept
# for manual testing). NOTE(review): this name shadows the `img_url`
# parameter of get_img below — confirm before renaming either.
img_url={
'p1':'https://pic1.zhimg.com/50/v2-b796808429375e5e23e1c977862fb327_720w.jpg',
'p2':'https://pic1.zhimg.com/v2-75667c7c2d7b254ce47cc83dc126d637_r.jpg',
'p3':'https://pic3.zhimg.com/50/v2-6beceafb2b427725541e565207ad1bd1_720w.jpg',
'p4':'https://pic1.zhimg.com/50/v2-2c2331f94445ce425983ca0b4b5b179e_720w.jpg',
'p5':'https://pic1.zhimg.com/50/v2-e8c6a6a603ac8f3041cad107b3caa02d_720w.jpg'
}
# Sample answers-API URL for question 289081284 (limit=1, sort_by=default);
# also unreferenced in this file.
url={
'f1':'https://www.zhihu.com/api/v4/questions/289081284/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cvip_info%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset=&limit=1&sort_by=default&platform=desktop',
}
# Download a single image to disk and report success or failure.
# path: folder path (with trailing separator); img_url: remote image URL;
# img_name: file name to save as, extension included.
def get_img(path, img_url, img_name):
    img = requests.get(url=img_url, headers=header)
    with open(path + img_name, 'wb') as f:
        f.write(img.content)
    # FIX: the original "retry" was `while 1` around this check with a counter,
    # but nothing inside the loop could change the outcome (open('wb') above
    # always creates the file), so it was a disguised busy-loop. A single
    # existence check is behaviorally equivalent.
    if os.path.exists(path + img_name):
        print("下载成功:", img_name)
    else:
        print('下载失败:', img_name)
def get_imgUrl(url):
    """Fetch one answers-API page and return the extension (e.g. '.jpg',
    query string stripped) of the first image found, or None when the page
    contains no images."""
    html = requests.get(url=url, headers=header)
    # FIX: json.loads() needs the response body text, not the Response object
    # itself (the original raised TypeError at runtime).
    res = json.loads(html.text)
    for i, item in enumerate(res['data']):
        # Image URLs live in the <noscript><img> fallback markup.
        content = BeautifulSoup(item['content'], 'lxml')
        imgurls = content.select('noscript img')
        for imgurl in imgurls:
            src = imgurl['src']
            img = src[src.rfind('.'):]
            img = img.split("?", 1)[0]
            # Returns on the very first image found (original behavior).
            return img
# Validity probe for a downloaded file.
def img_check(img_name):
    """Return 1 when imghdr recognizes *img_name* as an image, else 0
    (a corrupt/non-image file yields 0)."""
    return 1 if imghdr.what(img_name) else 0
# Pull the image URLs out of the txt listing.
def extract_url(path, file_name):
    """Read path+file_name and return its whitespace-separated tokens,
    one image URL per token."""
    with open(path + file_name) as handle:
        contents = handle.read()
    return contents.split()
def extract_imgName(path, file_name):
    """Return every image file name (e.g. 'v2-abc123.jpg') found in the
    text file at path+file_name."""
    with open(path + file_name) as f:
        text = f.read()
    # FIX: raw string (avoids the invalid-escape warning for '\w') and an
    # escaped dot — the original '.' matched ANY character, not only the
    # literal extension separator.
    name_compile = re.compile(r'\w+-\w+\.\w+')
    return name_compile.findall(text)
# Delete from *path* every image whose name appears in the given txt file.
def img_remove(path, file_name):
    for doomed in extract_imgName(path, file_name):
        try:
            os.remove(path + doomed)
        except OSError:
            print('删除错误', doomed)
        else:
            print('删除成功:', doomed)
# Download every image listed in the txt file into *path*, printing a
# progress ratio every 10 files.
def img_downLoad(path, file_name):
    urls = extract_url(path, file_name)
    url_count = len(urls)
    names = extract_imgName(path, file_name)
    # enumerate(..., 1) replaces the original hand-maintained counter; the
    # rewrite also un-shadows the list that was reused as the loop variable.
    for finished, (one_url, one_name) in enumerate(zip(urls, names), start=1):
        get_img(path, one_url, one_name)
        if finished % 10 == 0:
            print("下载进度:", finished / url_count)
# Redo a download pass: wipe the listed images, then fetch them again.
def img_reload(path, file_name):
    """Remove every image named in the txt file, then download the whole
    list again."""
    img_remove(path, file_name)
    img_downLoad(path, file_name)
FileCount.py部分
import os
def file_count(path_fileDir, file_txt_name):
    """Print how many files actually landed in the folder versus how many
    URLs the txt listing says there should be."""
    downloaded = len(os.listdir(path_fileDir))
    # The folder also contains the url-list txt itself, hence the -1.
    print('实际下载图片数量:', downloaded - 1)
    with open(path_fileDir + '/' + file_txt_name) as listing:
        expected = listing.read().split()
    print("应下载图片数量:", len(expected))