1.查找网页的接口
1.使用谷歌浏览器,右击检查,点击“Network”,查找资源路径
第一页的数据:http://sc.chinaz.com/tag_tupian/OuMeiMeiNv.html
第二页的数据:http://sc.chinaz.com/tag_tupian/OuMeiMeiNv_2.html
第三页的数据:http://sc.chinaz.com/tag_tupian/OuMeiMeiNv_3.html
2.找出资源路径的规律
import urllib.request
def create_request(page):
    """Build the HTTP request for one page of the image listing.

    The first page lives at ``<base>.html``; every other page number is
    appended to the base as ``<base>_<page>.html``.

    Args:
        page: Page number. It is compared against 1 and otherwise rendered
            with ``str()`` into the URL.

    Returns:
        A ``urllib.request.Request`` for the page URL carrying a custom
        ``User-Agent`` header.
    """
    # Page 1 has no numeric suffix; later pages are "_<n>".
    suffix = '.html' if page == 1 else '_' + str(page) + '.html'
    target = 'http://sc.chinaz.com/tag_tupian/OuMeiMeiNv' + suffix

    # Custom request header, used to get past the site's anti-scraping check.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    return urllib.request.Request(url=target, headers=browser_headers)
2.使用正则表达式,定位元素
右击“查看网页源代码”,定位图片路径的位置
import re
def down_load(content):
    """Download every image referenced in a listing page's HTML.

    Each ``<div class="box picblock col3">`` block is scanned for an
    ``<img>`` tag; its ``src2`` attribute is used as the image URL and its
    ``alt`` attribute as the file name. Files are saved as
    ``./meinv/<alt>.jpg``.

    Args:
        content: HTML text of one listing page.

    Returns:
        None. The images are written to disk as a side effect.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for an unreachable
        URL, and ``OSError`` if the target directory cannot be created.
    """
    import os  # local import so the function works without touching file-level imports

    target_dir = './meinv/'
    # The target file is opened for writing inside urlretrieve, which fails
    # when the folder is missing; create it up front instead of requiring a
    # manual setup step.
    os.makedirs(target_dir, exist_ok=True)

    # Capture the image URL (src2) and the image name (alt) of each entry.
    pattern = re.compile('<div class="box picblock col3".*?<img src2="(.*?)" alt="(.*?)">', re.S)
    for url, title in pattern.findall(content):
        # The alt text comes from the remote page: neutralise path separators
        # so it cannot leave target_dir or name a non-existent subdirectory.
        safe_title = re.sub(r'[\\/]', '_', title)
        filename = target_dir + safe_title + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
3.完整的代码
(先创建一个“meinv”的文件夹,用来存储图片)
# http://sc.chinaz.com/tag_tupian/OuMeiMeiNv.html
# http://sc.chinaz.com/tag_tupian/OuMeiMeiNv_2.html
# http://sc.chinaz.com/tag_tupian/OuMeiMeiNv_3.html
import urllib.request
def create_request(page):
    """Build the HTTP request for one page of the image listing.

    The first page lives at ``<base>.html``; every other page number is
    appended to the base as ``<base>_<page>.html``.

    Args:
        page: Page number. It is compared against 1 and otherwise rendered
            with ``str()`` into the URL.

    Returns:
        A ``urllib.request.Request`` for the page URL carrying a custom
        browser-like ``User-Agent`` header.
    """
    # Page 1 has no numeric suffix; later pages are "_<n>".
    suffix = '.html' if page == 1 else '_' + str(page) + '.html'
    target = 'http://sc.chinaz.com/tag_tupian/OuMeiMeiNv' + suffix

    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    return urllib.request.Request(url=target, headers=browser_headers)
def get_content(request):
    """Fetch a request and return the response body as text.

    Args:
        request: A ``urllib.request.Request`` (or URL string) accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even if reading or decoding fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
import re
def down_load(content):
    """Download every image referenced in a listing page's HTML.

    Each ``<div class="box picblock col3">`` block is scanned for an
    ``<img>`` tag; its ``src2`` attribute is used as the image URL and its
    ``alt`` attribute as the file name. Files are saved as
    ``./meinv/<alt>.jpg``.

    Args:
        content: HTML text of one listing page.

    Returns:
        None. The images are written to disk as a side effect.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for an unreachable
        URL, and ``OSError`` if the target directory cannot be created.
    """
    import os  # local import so the function works without touching file-level imports

    target_dir = './meinv/'
    # The target file is opened for writing inside urlretrieve, which fails
    # when the folder is missing; create it up front instead of requiring a
    # manual setup step.
    os.makedirs(target_dir, exist_ok=True)

    pattern = re.compile('<div class="box picblock col3".*?<img src2="(.*?)" alt="(.*?)">', re.S)
    for url, title in pattern.findall(content):
        # The alt text comes from the remote page: neutralise path separators
        # so it cannot leave target_dir or name a non-existent subdirectory.
        safe_title = re.sub(r'[\\/]', '_', title)
        filename = target_dir + safe_title + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
if __name__ == '__main__':
    # Ask the user for the inclusive range of listing pages to crawl.
    first_page = int(input('请输入起始页码'))
    last_page = int(input('请输入结束页码'))

    # For each page in order: build the request, fetch the HTML, then
    # download the images it references.
    for page_no in range(first_page, last_page + 1):
        down_load(get_content(create_request(page_no)))