python3爬虫实战:requests库+正则表达式爬取头像
网站url:https://www.woyaogexing.com/touxiang/qinglv/new/
浏览网页:可以发现每个图片都链接到了另一个网页
我们需要获取主目录中的每个图片对应的另一个html页面的url,再从这些url中提取图片
获得要爬取的网页的html
import requests

# Download the listing page and dump its HTML so the markup can be inspected.
index_url = 'https://www.woyaogexing.com/touxiang/qinglv/new/'
response = requests.get(index_url)
# Set the encoding explicitly; .text is decoded with it.
response.encoding = 'utf-8'
print(response.text)
我们需要的url在html中的位置如下:
用正则表达式筛选出需要的url
import re
import requests

# Fetch the listing page and print the relative URL of every detail page.
response = requests.get('https://www.woyaogexing.com/touxiang/qinglv/new/')
response.encoding = 'utf-8'
html = response.text
# Raw string: '\d' and '\.' are invalid escapes in a plain string literal and
# trigger a DeprecationWarning/SyntaxWarning on current Python versions.
pattern = re.compile(r'href="(/touxiang/qinglv/20\d+/\d+\.html)"', re.S)
urls = re.findall(pattern, html)
for url in urls:
    print(url)
对其中的每个url再进行一次提取html操作:
import requests
import re

# Fetch a single detail page and print its HTML.
url = '/touxiang/qinglv/2021/1142841.html'
# url already starts with '/', so the host is written without a trailing
# slash; otherwise the request path would begin with '//'.
response = requests.get('https://www.woyaogexing.com' + url)
response.encoding = 'utf-8'
html = response.text
print(html)
我们在这里就可以看见图片的url了
正则表达式筛选:
import requests
import re

# Fetch a single detail page and print every .jpeg picture URL found in it.
url = '/touxiang/qinglv/2021/1142841.html'
# url already starts with '/', so the host is written without a trailing
# slash; otherwise the request path would begin with '//'.
response = requests.get('https://www.woyaogexing.com' + url)
response.encoding = 'utf-8'
html = response.text
# Raw string avoids invalid-escape warnings. [^"]*? keeps the match inside one
# href value; '.*?' with re.S could run on into later attributes or tags.
pattern = re.compile(r'href="(//img\d\.woyaogexing\.com/20\d\d[^"]*?\.jpeg)"', re.S)
pic_urls = re.findall(pattern, html)
for pic_url in pic_urls:
    print(pic_url)
将图片保存至本地即可
完整代码:
import re
import os
import requests
# Running count of saved pictures; save_pic() uses it to build each file name.
# (A `global` statement is a no-op at module level, so none is needed here.)
i = 0
def get_one_page(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout, a stalled connection would block forever.

    Returns:
        The page HTML as a string.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Set the encoding explicitly; .text is decoded with it.
    response.encoding = 'utf-8'
    return response.text
def get_urls(html: str) -> list:
    """Return the relative detail-page URLs found in a listing page.

    Args:
        html: HTML text of the listing page.

    Returns:
        A list of strings such as '/touxiang/qinglv/2021/1142841.html', in
        document order. Empty if nothing matches.
    """
    # Raw string: '\d' and '\.' are invalid escapes in a plain string literal
    # and trigger a DeprecationWarning/SyntaxWarning on current Python versions.
    pattern = re.compile(r'href="(/touxiang/qinglv/20\d+/\d+\.html)"', re.S)
    return pattern.findall(html)
def get_pic_url(html: str) -> list:
    """Return the protocol-relative .jpeg picture URLs found in a detail page.

    Args:
        html: HTML text of a detail page.

    Returns:
        A list of strings such as '//img2.woyaogexing.com/2021/x.jpeg', in
        document order. Empty if nothing matches.
    """
    # [^"]*? keeps each match inside a single href value. The previous '.*?'
    # combined with re.S could start at a non-jpeg href and run across tags
    # and newlines until some later '.jpeg"', returning a chunk of markup.
    pattern = re.compile(
        r'href="(//img\d\.woyaogexing\.com/20\d\d[^"]*?\.jpeg)"', re.S
    )
    return pattern.findall(html)
def save_pic(url, pic_path):
    """Download the picture at *url* and save it as '<i>.jpg' in *pic_path*.

    The file name comes from the module-level counter ``i``, which is
    incremented after each successful save.

    Args:
        url: Absolute URL of the picture.
        pic_path: Directory to save into; created if it does not exist.

    Raises:
        requests.exceptions.RequestException: if the download fails.
        OSError: if the directory or file cannot be created.
    """
    global i
    # makedirs also creates missing parent directories (os.mkdir fails on a
    # nested path), and exist_ok removes the separate existence check.
    os.makedirs(pic_path, exist_ok=True)
    # Download before opening the file so a failed request does not leave an
    # empty file behind; the timeout keeps a stalled server from hanging us.
    content = requests.get(url, timeout=10).content
    with open(os.path.join(pic_path, str(i) + '.jpg'), 'wb') as f:
        f.write(content)
    i += 1
def main(pic_path='D:\\test\\'):
    """Crawl the listing page and save every picture of every detail page.

    Args:
        pic_path: Directory the pictures are saved into. Defaults to the
            location that was previously hard-coded.
    """
    html = get_one_page('https://www.woyaogexing.com/touxiang/qinglv/new/')
    for url in get_urls(html):
        # Detail-page URLs are site-relative and start with '/'.
        sub_html = get_one_page('https://www.woyaogexing.com' + url)
        for pic_url in get_pic_url(sub_html):
            # Picture URLs are protocol-relative ('//img...'), so a scheme
            # has to be prepended before they can be requested.
            save_pic('http:' + pic_url, pic_path)


if __name__ == '__main__':
    main()
效果如下: