import requests
import re
import os
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import random
import urllib.request
def getHtml(url):
    """Fetch *url* with urllib and return the raw response body as bytes.

    A random desktop User-Agent is chosen from a pool so requests are less
    likely to be rejected as bot traffic (the original intent of the
    "disguise the request header" comment).

    :param url: absolute URL to fetch.
    :return: response body as ``bytes`` (callers pass it to BeautifulSoup,
        which accepts bytes and sniffs the encoding).
    :raises urllib.error.URLError: on network/HTTP failure.
    """
    # Pool of real-browser User-Agent strings; one is picked per request.
    ua_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    ]
    # BUG FIX: the original built myheaders from an undefined name
    # `user_agent` (NameError) with the non-standard key 'user_agent',
    # leaving ua_list unused.  Pick a random UA and send it under the
    # proper 'User-Agent' header.
    myheaders = {'User-Agent': random.choice(ua_list)}
    req = urllib.request.Request(url, headers=myheaders)
    response = urllib.request.urlopen(req)
    html = response.read()
    return html
def getnew_url(html):
    """Extract the href of every ``<a class="pic">`` anchor in *html*.

    :param html: HTML document (str or bytes) as returned by ``getHtml``.
    :return: list of href values (relative wallpaper-detail paths).
    """
    soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.find_all('a', {'class': 'pic'})
    return [anchor.get('href') for anchor in anchors]
def get_page_index(url):
    """GET *url* and return the page text, or ``None`` on any failure.

    :param url: absolute URL of a wallpaper detail page.
    :return: response body as ``str`` for HTTP 200; ``None`` for any other
        status code or on a network error (which is reported to stdout).
    """
    try:
        resp = requests.get(url)
    except RequestException:
        print('请求索引页出错!')
        return None
    if resp.status_code != 200:
        return None
    return resp.text
def getImg(html, start=0):
    """Download every image carrying an ``srch`` attribute in *html*.

    Files are saved to the current working directory as ``<n>.jpg``,
    numbering from *start*.

    :param html: HTML document (str/bytes), or ``None`` (e.g. when
        ``get_page_index`` failed) — then nothing is downloaded.
    :param start: first file number to use.  BUG FIX: the original always
        restarted at 0, so successive calls (as in ``__main__``) silently
        overwrote earlier downloads; pass the previous return value to
        keep numbering contiguous.
    :return: the next unused file number (== *start* + images saved).
    """
    # Guard: get_page_index returns None on failure; BeautifulSoup(None)
    # would raise, so treat it as "no images".
    if not html:
        return start
    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): 'srch' is an unusual attribute name and may be a typo
    # for 'src' — verify against the live page markup before changing it.
    imglist = soup.findAll('img', attrs={'srch': True})
    x = start
    for tag in imglist:
        imgurl = tag.attrs['srch']
        # urlretrieve saves the downloaded data next to the script.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    return x
if __name__ == '__main__':
    # Scrape the wallpaper portal: images on the home page first, then
    # every linked detail page.
    home_html = getHtml('http://desk.zol.com.cn/')
    print(getImg(home_html))
    for link in getnew_url(home_html):
        page_html = get_page_index('http://desk.zol.com.cn' + link)
        print(getImg(page_html))