Preface
The previous article successfully crawled all model data for a given brand, but only the first page of data for each model was fetched. This article fixes that defect.
Program Output
URL Analysis
Open the site and go to the first page of BMW i8 images, then right-click the "Next Page" (下一页) button and inspect the element. It is an a tag whose attribute is a URL, href="/pic/series-t/2387-1-p2.html". Visiting that URL returns the second page, and its image layout is the same as the first page. Now inspect the Next Page button on the last page: there its attribute is href="javascript:void(0);". Checking other models shows the same pattern, so on the last page the Next Page button's href is always "javascript:void(0);". The conclusion: we can leave the existing code essentially unchanged, loop over the pages starting from each model's first image page, and store the URLs of every page of every model in an array.
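To make this pagination rule concrete, here is a minimal sketch that follows the Next Page link until its href becomes "javascript:void(0);". It relies on the .page and .page-item-next selectors used by the full code below, and the example first-page URL in the final comment is hypothetical, inferred from the -p2 link above.

from pyquery import PyQuery as pq

ORI_URL = 'https://car.autohome.com.cn'

def collect_page_urls(first_page_url):
    # Follow the Next Page link until its href becomes "javascript:void(0);"
    urls = [first_page_url]
    doc = pq(first_page_url)  # PyQuery fetches and parses the URL directly
    while True:
        next_href = doc('.page .page-item-next').attr('href')
        if not next_href or next_href == 'javascript:void(0);':
            break  # reached the last page (or there is no pagination at all)
        urls.append(ORI_URL + next_href)
        doc = pq(ORI_URL + next_href)
    return urls

# Example (hypothetical first-page URL inferred from the -p2 link above):
# collect_page_urls('https://car.autohome.com.cn/pic/series-t/2387-1.html')

This is essentially what the get_detail_arr method in the full program below does.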
Prerequisites
Python == 3.8.5
requests
BeautifulSoup
lxml
PyQuery
tqdm
Code Implementation
"""
修复2.0版本只爬取第一页的图片
"""
import requests
from bs4 import BeautifulSoup
import os, shutil
from pyquery import PyQuery as pq
from tqdm import tqdm
def make_dir(dir):
    # Create an empty directory, removing it first if it already exists
    if os.path.exists(dir):
        shutil.rmtree(dir, ignore_errors=True)
    os.makedirs(dir)
    # print('Folder successfully created', dir)
def save_img(img_urls, data_dir):
    # Download every image URL and save it as 0.jpg, 1.jpg, ... in data_dir
    i = 0
    for src in img_urls:
        img_name = '%d.jpg' % i
        src = 'http:' + src  # the img src values are protocol-relative ('//...'), so prepend the scheme
        content = requests.get(src).content
        with open(os.path.join(data_dir, img_name), 'wb') as f:
            f.write(content)
        # print('Successful preservation %s' % (os.path.join(data_dir, img_name)))
        i += 1
class Spider:
    def __init__(self, url, ori_url, name=''):
        self.ori_url = ori_url  # site root, used to build absolute URLs
        self.data_dir = name + os.path.split(url)[1]
        self.url = url
        self.soup = BeautifulSoup(requests.get(url=url).text, 'lxml')
        make_dir(self.data_dir)
        self.img_urls = []
        self.car_type_urls = []
        print('Name: %s, Original URL: %s, Fetch From %s, Save as: %s' % (name, self.ori_url, self.url, self.data_dir))
    def get_car_type_url(self):
        print('Start Get Car Type...')
        for div in tqdm(self.soup.find_all('div', {'class': 'uibox-con carpic-list02'})):
            # iterate over each model entry (li) inside the list
            for li in div.contents[0].contents:
                obj = {
                    'name': '',
                    'sum': '',
                    'url': '',
                    'detail_url': None
                }
                li = li.contents
                obj['url'] = self.ori_url + li[0].get('href')
                obj['name'] = li[1].contents[0].contents[0].get('title')
                self.car_type_urls.append(obj)
    def get_car_detail_url(self):
        print('Start Get Detail Information...')
        for car_type in tqdm(self.car_type_urls):
            for div in pq(url=car_type['url'])('.uibox').items():
                flag = False
                for a in div('.uibox-title a').items():
                    if a.text() == '车身外观':  # '车身外观' = exterior photo gallery
                        # Collect every page of the gallery, not just the first one
                        car_type['detail_url'] = None if a.attr('href') is None else self.get_detail_arr(self.ori_url + a.attr('href'))
                        car_type['sum'] = div('.uibox-title .uibox-title-font12').text()
                        flag = True
                        break
                if flag:
                    break
        print(self.car_type_urls, len(self.car_type_urls))
    def download_img(self):
        print('Start Download...')
        for car_obj in tqdm(self.car_type_urls):
            img_dir = os.path.join(self.data_dir, car_obj['name'] + car_obj['sum'])
            make_dir(img_dir)
            if car_obj['detail_url'] is None:
                continue
            img_urls = []
            for detail_url in car_obj['detail_url']:
                for img in BeautifulSoup(requests.get(url=detail_url).text, 'lxml').find_all('img'):
                    # Swap the thumbnail size segment for the full-size one
                    src = str(img.get('src')).replace('480x360_0_q95_c42_', '1024x0_1_q95_')
                    if src.find('1024x0_1_q95_') == -1:
                        continue
                    img_urls.append(src)
            save_img(img_urls, img_dir)
    def get_detail_arr(self, url):
        """
        Parse the first page of a photo gallery and return the URLs of all of its pages.
        :param url: URL of the first page of images
        :return: urls: URLs of every page
        """
        urls = []
        doc = pq(url)
        page = doc('.page')
        if page('a'):
            # Multiple pages: follow the Next Page link until it becomes javascript:void(0);
            urls.append(url)
            html = pq(url)
            while True:
                next_url = html('.page .page-item-next').attr('href')
                if next_url == 'javascript:void(0);':
                    break
                else:
                    urls.append(self.ori_url + next_url)
                    html = pq(self.ori_url + next_url)
        else:
            # Only one page
            urls.append(url)
        # print(urls)
        return urls
if __name__ == '__main__':
    ori_url = 'https://car.autohome.com.cn'
    url = 'https://car.autohome.com.cn/pic/brand-15.html'
    name = '宝马3.0_'
    s = Spider(url, ori_url, name)
    s.get_car_type_url()
    s.get_car_detail_url()
    s.download_img()