import requests
import re
import os
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import random
import urllib.request
def getHtml(url):
    """Fetch *url* with urllib and return the raw response body as bytes.

    A random desktop User-Agent is chosen from a pool so requests are less
    likely to be rejected as bot traffic (the original intent of the
    "disguise the request header" comment).

    :param url: absolute URL to fetch.
    :return: response body as ``bytes`` (callers pass it to BeautifulSoup,
        which accepts bytes and sniffs the encoding).
    :raises urllib.error.URLError: on network/HTTP failure.
    """
    # Pool of real-browser User-Agent strings; one is picked per request.
    ua_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    ]
    # BUG FIX: the original built myheaders from an undefined name
    # `user_agent` (NameError) with the non-standard key 'user_agent',
    # leaving ua_list unused.  Pick a random UA and send it under the
    # proper 'User-Agent' header.
    myheaders = {'User-Agent': random.choice(ua_list)}
    req = urllib.request.Request(url, headers=myheaders)
    response = urllib.request.urlopen(req)
    html = response.read()
    return html
def getnew_url(html):
    """Extract the href of every ``<a class="pic">`` anchor in *html*.

    :param html: HTML document (str or bytes) as returned by ``getHtml``.
    :return: list of href values (relative wallpaper-detail paths).
    """
    soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.find_all('a', {'class': 'pic'})
    return [anchor.get('href') for anchor in anchors]
def get_page_index(url):
    """GET *url* and return the page text, or ``None`` on any failure.

    :param url: absolute URL of a wallpaper detail page.
    :return: response body as ``str`` for HTTP 200; ``None`` for any other
        status code or on a network error (which is reported to stdout).
    """
    try:
        resp = requests.get(url)
    except RequestException:
        print('请求索引页出错!')
        return None
    if resp.status_code != 200:
        return None
    return resp.text
def getImg(html, start=0):
    """Download every image carrying an ``srch`` attribute in *html*.

    Files are saved to the current working directory as ``<n>.jpg``,
    numbering from *start*.

    :param html: HTML document (str/bytes), or ``None`` (e.g. when
        ``get_page_index`` failed) — then nothing is downloaded.
    :param start: first file number to use.  BUG FIX: the original always
        restarted at 0, so successive calls (as in ``__main__``) silently
        overwrote earlier downloads; pass the previous return value to
        keep numbering contiguous.
    :return: the next unused file number (== *start* + images saved).
    """
    # Guard: get_page_index returns None on failure; BeautifulSoup(None)
    # would raise, so treat it as "no images".
    if not html:
        return start
    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): 'srch' is an unusual attribute name and may be a typo
    # for 'src' — verify against the live page markup before changing it.
    imglist = soup.findAll('img', attrs={'srch': True})
    x = start
    for tag in imglist:
        imgurl = tag.attrs['srch']
        # urlretrieve saves the downloaded data next to the script.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    return x
if __name__ == '__main__':
    # Scrape the wallpaper portal: images on the home page first, then
    # every linked detail page.
    home_html = getHtml('http://desk.zol.com.cn/')
    print(getImg(home_html))
    for link in getnew_url(home_html):
        page_html = get_page_index('http://desk.zol.com.cn' + link)
        print(getImg(page_html))