# 用于技术交流和代码保存 (for technical exchange and code preservation)
import requests,os
import re,random
import json,time
class DoutuCrawl:
    """Crawler that searches doutula.com for meme images matching a keyword,
    downloads each image to ./Enrichment/, and prints a metadata list.

    NOTE(review): relies on doutula.com's current HTML layout and a small
    hard-coded proxy list — both may be stale; verify before production use.
    """

    def __init__(self, keyword=None, type='photo', more=1, page=2):
        # `type`, `more`, `page` are kept for backward compatibility with
        # existing callers, but the search URL always uses type=photo&more=1.
        self.locationLink = 'https://www.doutula.com/'
        self.keyword = keyword

    def concatLink(self):
        """Fetch page 1 to learn the page count, build one search URL per
        result page, and hand the full list to GetImage."""
        self.Htmlurl = (self.locationLink + 'search?type=photo&more=1&keyword='
                        + self.keyword + '&page=1')
        result = self.GetHtml(self.Htmlurl)
        if result is None:
            # GetHtml already reported the failure; nothing to crawl.
            return
        ResponeHmtl, Maxpage = result
        # Result pages are 1-based. The original used range(Maxpage), which
        # requested the invalid page 0 and never requested the last page.
        infoList = [self.locationLink + 'search?type=photo&more=1&keyword='
                    + self.keyword + '&page=' + str(page)
                    for page in range(1, Maxpage + 1)]
        self.GetImage(infoList)

    def GetImage(self, infoList):
        """Scrape image URLs and hidden titles from each search-result page,
        download every image, and print the collected metadata records."""
        imglist = []
        titleList = []
        timelist = []
        tmp_list = []
        # Hoist the regexes out of the loop; they are loop-invariant.
        img_re = re.compile('data-original="(.*?)"')
        title_re = re.compile('<p style="display: none">(.*?)</p>')
        # Iterate the URLs directly. The original indexed infoList[i+1],
        # which skipped the first URL and raised IndexError on the last
        # iteration (silently swallowed by a bare except).
        for imageurl in infoList:
            page = self.GetHtml(imageurl)
            if page is None:
                # Fetch/parse failed for this page; skip it and continue.
                print("error")
                continue
            html = page[0]
            for img_url in img_re.findall(html):
                imglist.append(img_url)
                try:
                    self.savaimg(img_url)
                except (requests.RequestException, OSError) as exc:
                    # One bad image should not abort the whole crawl.
                    print("error", exc)
            for title_text in title_re.findall(html):
                titleList.append(title_text)
                timelist.append(time.strftime("%Y-%m-%d ", time.localtime()))
        # zip truncates to the shortest list, so mismatched counts between
        # images and titles on a page cannot raise here.
        for each, title_text, _stamp in zip(imglist, titleList, timelist):
            data = {}
            data['coverimg'] = each
            data['title'] = title_text
            data['updata'] = time.strftime("%Y-%m-%d ", time.localtime())
            tmp_list.append(data)
        print(tmp_list)

    def savaimg(self, imgurl):
        """Download one image into ./Enrichment/, named by its URL basename."""
        # os.path.join is portable; the original hard-coded "\\" separators.
        general = os.path.join(os.getcwd(), "Enrichment")
        os.makedirs(general, exist_ok=True)
        target = os.path.join(general, imgurl.split("/")[-1])
        response = requests.get(imgurl, timeout=30)
        # The with-statement closes the file; the original's explicit
        # f.close() inside the with-block was redundant.
        with open(target, 'wb') as f:
            f.write(response.content)

    def GetHtml(self, Htmlurl):
        """GET a search-result page through a random HTTP proxy.

        Returns (html_text, max_page) on success, or None on any
        network/parse failure (after printing a diagnostic).
        """
        headers = {
            'Host': 'www.doutula.com',
            "Upgrade-Insecure-Requests": '1',
            # The original dict listed 'User-Agent' twice; only the last
            # value survived, so the Chrome/73 UA is the effective one.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }
        # NOTE(review): free proxies like these rot quickly — verify.
        proxies = ["115.218.222.64:9000", "120.194.18.90:81", "123.160.74.11:9999"]
        try:
            session = requests.session()
            response = session.get(Htmlurl, timeout=30,
                                   proxies={'http': random.choice(proxies)},
                                   headers=headers)
            response.encoding = 'utf-8'
            ResponeHmtl = response.text
            # Last pagination <li> holds the max page number, e.g. ">12<".
            page_re = re.compile(r'<li class="page-item"><a class="page-link(.*?)</a></li>')
            last_item = page_re.findall(ResponeHmtl)[-1]
            Maxpage = int(re.search(r'>(\d+)', last_item).group(1))
            return ResponeHmtl, Maxpage
        except (requests.RequestException, IndexError, AttributeError, ValueError) as exc:
            # Narrowed from a bare except; still returns None like the
            # original (which returned the result of print(), i.e. None).
            print("Exceptional error occurred", exc)
            return None
if __name__ == "__main__":
    # Guarded entry point: the original ran the crawl at import time as a
    # module-level side effect; now it only runs when executed as a script.
    keyword = '傻逼'
    Doutu = DoutuCrawl(keyword)
    Doutu.concatLink()