网站:‘https://www.doutula.com/’
目的:指定要从第几页开始到第几页结束爬取最新表情包图片
代码:
import requests,time,random,re
from bs4 import BeautifulSoup
from lxml import etree
from urllib import request
import ssl
# Disable HTTPS certificate verification for every urllib request made by
# this process (this is what lets urlretrieve fetch from hosts whose
# certificate would otherwise be rejected).
ssl._create_default_https_context = ssl._create_unverified_context

# Directory the downloaded images are written into (must end with '/').
file = '/Users/qq/desktop/表情包/'

# Pool of User-Agent strings to choose a request identity from.
user_agent = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# NOTE: the User-Agent is drawn once, when the module is loaded, so every
# request in a single run carries the same value.
headers = {"User-Agent": random.choice(user_agent)}
class biaoqingbao(object):
    """Download the images listed on a range of doutula.com list pages.

    The class relies on names defined at module level elsewhere in this
    file: ``requests``, ``etree`` (lxml), ``request`` (urllib), ``headers``
    and ``file`` (the default save directory).
    """

    # Listing URL; the page number is appended to it.
    url = 'http://www.doutula.com/photo/list/?page='
    # Maximum number of title characters kept in a file name.
    max_title_len = 20
    # Seconds to wait for a listing page before giving up.
    timeout = 10

    def __init__(self, star_page, end_page, save_dir=None):
        """Remember the inclusive page range to crawl.

        Args:
            star_page: First page number to fetch.
            end_page: Last page number to fetch (inclusive).
            save_dir: Directory to save images into. When ``None`` the
                module-level ``file`` path is used.
        """
        self.star_page = star_page
        self.end_page = end_page
        self.save_dir = save_dir

    def get_one_url(self, page):
        """Fetch listing page *page* and return its HTML text."""
        now_url = self.url + str(page)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(
            now_url, headers=headers, verify=False, timeout=self.timeout
        )
        return response.text

    def get_one_data(self, contins):
        """Extract ``(image_url, title)`` pairs from listing HTML *contins*.

        Image URLs are read from the ``data-original`` attribute and titles
        from the ``<p>`` text of the same anchors; the two lists are paired
        positionally. Returns a list of tuples (empty when there is nothing
        to parse).
        """
        if not contins or not contins.strip():
            return []
        xml = etree.HTML(contins)
        if xml is None:
            return []
        img_url = xml.xpath('//ul/li/div/div/a/img[@referrerpolicy="no-referrer"]/@data-original')
        title_list = xml.xpath('//ul/li/div/div/a/p/text()')
        return list(zip(img_url, title_list))

    def _file_name(self, img, title):
        """Build a file-system-safe file name for image URL *img*."""
        # Function-scope imports: these names are not imported at file level.
        import os
        from urllib.parse import urlsplit

        # Take the extension from the URL path so that '.jpeg'/'.webp' and
        # URLs with a query string work; fall back to '.jpg' if there is none.
        suffix = os.path.splitext(urlsplit(img).path)[1] or '.jpg'
        # Path separators and other reserved characters would break the path.
        unsafe = '\\/:*?"<>|'
        cleaned = ''.join(
            '_' if ch in unsafe or ord(ch) < 32 else ch for ch in title
        ).strip()
        # Keep the FIRST max_title_len characters (the old slice kept the tail).
        return (cleaned[:self.max_title_len] or 'untitled') + suffix

    def down_imgs(self, data):
        """Download every ``(image_url, title)`` pair in *data*.

        A failed download is reported and skipped so that the remaining
        images are still fetched.
        """
        import os

        target_dir = file if self.save_dir is None else self.save_dir
        os.makedirs(target_dir, exist_ok=True)
        for img, title in data:
            print('正在下载:{}'.format(title))
            target = os.path.join(target_dir, self._file_name(img, title))
            try:
                request.urlretrieve(img, target)
            except (OSError, ValueError) as err:
                # OSError covers URLError/HTTPError and local write errors;
                # ValueError is raised for a malformed URL.
                print('下载失败:{} ({})'.format(title, err))

    def run(self):
        """Crawl every page from ``star_page`` to ``end_page`` inclusive."""
        for page in range(self.star_page, self.end_page + 1):
            contins = self.get_one_url(page)
            print('=' * 50)
            data = self.get_one_data(contins)
            self.down_imgs(data)
def main():
    """Prompt for the first and last page numbers, then run the crawler."""
    first_page = int(input('请输入开始页码:'))
    last_page = int(input('请输入结束页码:'))
    biaoqingbao(first_page, last_page).run()
# Start the crawler only when the file is executed as a script.
if __name__ == "__main__":
    main()
下面是对代码的一次优化:这次增加了让用户自定义保存文件夹名称的功能,用到的模块是 Python 标准库 os。
import requests,random,os
from lxml import etree
from urllib import request
import ssl
# Turn off SSL certificate verification for the whole program by default.
ssl._create_default_https_context = ssl._create_unverified_context

# The save directory is no longer a module constant; it is built from the
# user-supplied folder name in biaoqingbao.new_flie.

# Pool of User-Agent strings to choose a request identity from.
user_agent = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# NOTE: the User-Agent is drawn once, when the module is loaded, so every
# request in a single run carries the same value.
headers = {"User-Agent": random.choice(user_agent)}
class biaoqingbao(object):
    """Download the images listed on a range of doutula.com list pages.

    Images are saved into a user-named folder created under ``base_dir``.
    The class relies on names defined at module level elsewhere in this
    file: ``requests``, ``etree`` (lxml), ``request`` (urllib), ``os`` and
    ``headers``.
    """

    # Listing URL; the page number is appended to it.
    url = 'http://www.doutula.com/photo/list/?page='
    # Maximum number of title characters kept in a file name.
    max_title_len = 20
    # Seconds to wait for a listing page before giving up.
    timeout = 10

    def __init__(self, star_page, end_page, flie_name,
                 base_dir='/Users/qq/desktop'):
        """Remember the inclusive page range and the target folder.

        Args:
            star_page: First page number to fetch.
            end_page: Last page number to fetch (inclusive).
            flie_name: Name of the folder the images are saved into.
            base_dir: Directory under which that folder is created.
        """
        self.star_page = star_page
        self.end_page = end_page
        self.flie_name = flie_name
        self.base_dir = base_dir

    def get_one_url(self, page):
        """Fetch listing page *page* and return its HTML text."""
        now_url = self.url + str(page)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(
            now_url, headers=headers, verify=False, timeout=self.timeout
        )
        return response.text

    def get_one_data(self, contins):
        """Extract ``(image_url, title)`` pairs from listing HTML *contins*.

        Image URLs are read from the ``data-original`` attribute and titles
        from the ``<p>`` text of the same anchors; the two lists are paired
        positionally. Returns a list of tuples (empty when there is nothing
        to parse).
        """
        if not contins or not contins.strip():
            return []
        xml = etree.HTML(contins)
        if xml is None:
            return []
        img_url = xml.xpath('//ul/li/div/div/a/img[@referrerpolicy="no-referrer"]/@data-original')
        title_list = xml.xpath('//ul/li/div/div/a/p/text()')
        return list(zip(img_url, title_list))

    def new_flie(self, name):
        """Return the save folder for *name*, creating it when missing.

        This is what lets the user choose the name of the folder the images
        are saved into. The returned path always ends with ``/``.
        """
        flie = '{}/{}/'.format(self.base_dir, name)
        if os.path.isdir(flie):
            # The folder already exists: just report it.
            print('文件夹存在')
        else:
            # The folder does not exist yet: create it, parents included.
            print('新建文件夹')
            os.makedirs(flie, exist_ok=True)
        return flie

    def _file_name(self, img, title):
        """Build a file-system-safe file name for image URL *img*."""
        # Function-scope import: urlsplit is not imported at file level.
        from urllib.parse import urlsplit

        # Take the extension from the URL path so that '.jpeg'/'.webp' and
        # URLs with a query string work; fall back to '.jpg' if there is none.
        suffix = os.path.splitext(urlsplit(img).path)[1] or '.jpg'
        # Path separators and other reserved characters would break the path.
        unsafe = '\\/:*?"<>|'
        cleaned = ''.join(
            '_' if ch in unsafe or ord(ch) < 32 else ch for ch in title
        ).strip()
        # Keep the FIRST max_title_len characters (the old slice kept the tail).
        return (cleaned[:self.max_title_len] or 'untitled') + suffix

    def down_imgs(self, data, name):
        """Download every ``(image_url, title)`` pair in *data* into *name*.

        A failed download is reported and skipped so that the remaining
        images are still fetched.
        """
        flie = self.new_flie(name)
        for img, title in data:
            print('正在下载:{}'.format(title))
            target = os.path.join(flie, self._file_name(img, title))
            try:
                request.urlretrieve(img, target)
            except (OSError, ValueError) as err:
                # OSError covers URLError/HTTPError and local write errors;
                # ValueError is raised for a malformed URL.
                print('下载失败:{} ({})'.format(title, err))

    def run(self):
        """Crawl every page from ``star_page`` to ``end_page`` inclusive."""
        for page in range(self.star_page, self.end_page + 1):
            contins = self.get_one_url(page)
            print('=' * 50)
            data = self.get_one_data(contins)
            self.down_imgs(data, self.flie_name)
def main():
    """Prompt for the folder name and page range, then run the crawler."""
    folder = input('请输入要保存的文件夹:')
    first_page = int(input('请输入开始页码:'))
    last_page = int(input('请输入结束页码:'))
    biaoqingbao(first_page, last_page, folder).run()
# Start the crawler only when the file is executed as a script.
if __name__ == "__main__":
    main()