前言
最近比较关注各类汽车的情况,在车主指南(https://www.icauto.com.cn)看见七月份的汽车前550销量排行,于是便写了爬虫爬取该网页的表格(https://www.icauto.com.cn/rank),在爬取网页之后发现各种链接网址,并一并分网页爬取了各种汽车的情况。
原理步骤
1、抓取主网页,将主网页的表格内容提取出来。
2、提取各种汽车的数据网址。
3、分页抓取各类汽车型号等网页,并将所需要的数据提取出来。
4、将各类数据写入excel,并保存。
代码实现
导入需要的库
import requests
from lxml import etree
import xlwt
import urllib.parse
抓取首页并获取需要的数据
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
url = 'https://www.icauto.com.cn/rank/'#base URL of the main ranking page
#Fetch the ranking page, analyse it, and use xpath to extract the wanted fields plus each car's detail-page link
def page_analyze():
res = requests.get(url, headers=headers) #send the request and get the page content
page_text = res.text #response body as text
#print(type(res.text))
#print(res.text)
with open('./汽车销量排行.html', 'w', encoding='utf-8') as fp:
fp.write(page_text)
with open('./汽车销量排行.html', 'r', encoding='utf-8') as f:
c = f.read()
#NOTE(review): writing the page to disk and immediately reading it back is a redundant round-trip; etree.HTML(page_text) would do
selector = etree.HTML(c)
data =[]
chexing = selector.xpath('//table[@class="bordered"]/tr/td[2]/a/text()') #model name
#print(chexing)
pingpai = selector.xpath('//table[@class="bordered"]/tr/td[3]/a[1]/text()') #brand
jiage = selector.xpath('//table[@class="bordered"]/tr/td[3]/text()[2]') #guide price
xiaoliang1 = selector.xpath('//table[@class="bordered"]/tr/td[4]/text()') #monthly sales
xiaoliang2 = selector.xpath('//table[@class="bordered"]/tr/td[5]/text()') #yearly sales
paihang = selector.xpath('//table[@class="bordered"]/tr/td[1]/text()') #rank
herf = selector.xpath('//table[@class="bordered"]/tr/td[2]/a/@href') #links to each car's detail page
#columns extracted from the main ranking table; detail-page columns below
leixing =[]
baozhi =[]
guobei =[]
chechang = []
cheshen =[]
抓取各种汽车子网页并提取汽车类型数据
#NOTE(review): hard-coded 550 assumes the table always has exactly 550 rows; len(herf) would be safer
for m in range(0,550):
new_url = urllib.parse.urljoin(url,herf[m])
print(new_url)
#join the detail-page href with the base url to build the address to request
resp = requests.get(url=new_url, headers=headers)
page_te=resp.text
#request the detail page and keep its content
with open('./汽车销量排行.html', 'w', encoding='utf-8') as fp:
fp.write(page_te)
with open('./汽车销量排行.html', 'r', encoding='utf-8') as f:
h = f.read()
htm = etree.HTML(h)
guo = htm.xpath('//div[@class="carInfo"]/dd[2]/text()') #country of origin
guobei.append(guo)
lei = htm.xpath('//div[@class="carInfo"]/dd[3]/text()') #vehicle type
leixing.append(lei)
chang = htm.xpath('//div[@class="carInfo"]/dd[4]/text()') #manufacturer
chechang.append(chang)
bao = htm.xpath('//div[@class="carInfo"]/dd[5]/text()') #resale value
baozhi.append(bao)
shen = htm.xpath('//div[@class="carInfo"]/dd[6]/text()') #body structure
cheshen.append(shen)
#NOTE(review): xpath() returns a *list*; appending it whole means each cell value below is a list, which xlwt's write() rejects — should append guo[0] etc.
#collect the fields extracted from every detail page, grouped per column
data.append(paihang)
data.append(chexing)
data.append(pingpai)
data.append(jiage)
data.append(xiaoliang1)
data.append(xiaoliang2)
data.append(guobei)
data.append(baozhi)
data.append(cheshen)
data.append(chechang)
data.append(leixing)
将数据写入excel并保存
#NOTE(review): xlwt.Workbook()'s first argument is the *encoding*, not a path, and xlwt writes legacy .xls (BIFF), not .xlsx — both of these are bugs
workbook = xlwt.Workbook('./2022.06汽车销量排行.xlsx')
worksheet = workbook.add_sheet('2022.06汽车销量排行榜')
worksheet.write(0,0,'排名')
worksheet.write(0,1,'车型')
worksheet.write(0,2,'品牌')
worksheet.write(0,3,'指导价格')
worksheet.write(0,4,'月销量')
worksheet.write(0,5,'年销量')
worksheet.write(0,6,'国别')
worksheet.write(0,7,'保值率')
worksheet.write(0,8,'车身结构')
worksheet.write(0,9,'车厂')
worksheet.write(0,10,'类型')
for i in range(0, 11):
#print the column index while writing (debug)
dat = data[i]
for j in range(0, 550):
worksheet.write(j+ 1, i, dat[j])
workbook.save('./2022.06汽车销量排行.xlsx')
完整代码
#调用所需要的库
import requests
from lxml import etree
import xlwt
import urllib.parse
from time import sleep
#request headers for the icauto.com.cn ranking pages (browser-like User-Agent)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
url = 'https://www.icauto.com.cn/rank/'#base URL of the main ranking page
# Scrape the ranking page, follow every car's detail link, and save everything
# to an Excel workbook.
def page_analyze():
    """Scrape icauto.com.cn's car sales ranking into an Excel workbook.

    Steps:
      1. Fetch the main ranking page and keep a local HTML copy.
      2. XPath-extract the table columns (rank, model, brand, price, sales)
         plus the href of every car's detail page.
      3. Visit each detail page and collect country / type / factory /
         resale value / body structure.
      4. Write all columns to a .xls workbook.

    No parameters; no return value. Side effects: writes
    './汽车销量排行.html' and './2022.06汽车销量排行.xls', and prints each
    detail-page URL as it is visited.
    """
    def _first_text(nodes):
        # XPath text() queries return a list of text nodes; a spreadsheet
        # cell needs a plain string. (The original appended the raw list,
        # which xlwt's worksheet.write() rejects at save time.)
        return nodes[0].strip() if nodes else ''

    res = requests.get(url, headers=headers)
    res.encoding = res.apparent_encoding  # pages are Chinese; avoid mojibake
    page_text = res.text
    # Keep a local copy of the main page (same side effect as before).
    with open('./汽车销量排行.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    # Parse the response text directly -- the original wrote it to disk and
    # immediately read it back, a pointless round-trip (repeated per subpage).
    selector = etree.HTML(page_text)
    row = '//table[@class="bordered"]/tr/'
    paihang = selector.xpath(row + 'td[1]/text()')       # rank
    chexing = selector.xpath(row + 'td[2]/a/text()')     # model name
    pingpai = selector.xpath(row + 'td[3]/a[1]/text()')  # brand
    jiage = selector.xpath(row + 'td[3]/text()[2]')      # guide price
    xiaoliang1 = selector.xpath(row + 'td[4]/text()')    # monthly sales
    xiaoliang2 = selector.xpath(row + 'td[5]/text()')    # yearly sales
    herf = selector.xpath(row + 'td[2]/a/@href')         # detail-page links

    guobei, leixing, chechang, baozhi, cheshen = [], [], [], [], []
    # Iterate over however many links the table actually holds instead of a
    # hard-coded range(550), which raised IndexError on shorter tables.
    for link in herf:
        new_url = urllib.parse.urljoin(url, link)
        print(new_url)
        resp = requests.get(url=new_url, headers=headers)
        resp.encoding = resp.apparent_encoding
        htm = etree.HTML(resp.text)
        info = '//div[@class="carInfo"]/dd[%d]/text()'
        guobei.append(_first_text(htm.xpath(info % 2)))    # country of origin
        leixing.append(_first_text(htm.xpath(info % 3)))   # vehicle type
        chechang.append(_first_text(htm.xpath(info % 4)))  # manufacturer
        baozhi.append(_first_text(htm.xpath(info % 5)))    # resale value
        cheshen.append(_first_text(htm.xpath(info % 6)))   # body structure
        sleep(0.2)  # be polite between the ~550 requests (sleep was imported but unused)

    # Columns in the same order as the header row written below.
    data = [paihang, chexing, pingpai, jiage, xiaoliang1, xiaoliang2,
            guobei, baozhi, cheshen, chechang, leixing]
    # BUG FIX: xlwt.Workbook()'s first argument is the *encoding*, not a file
    # path -- the original passed the path there, making save() fail with a
    # LookupError once any non-ASCII cell value was encoded.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('2022.06汽车销量排行榜')
    titles = ['排名', '车型', '品牌', '指导价格', '月销量', '年销量',
              '国别', '保值率', '车身结构', '车厂', '类型']
    for col, title in enumerate(titles):
        worksheet.write(0, col, title)
    for col, column in enumerate(data):
        for rownum, value in enumerate(column):
            worksheet.write(rownum + 1, col, value)
    # BUG FIX: xlwt writes the legacy BIFF format, so the extension must be
    # .xls -- saving under an .xlsx name produces a file Excel rejects.
    workbook.save('./2022.06汽车销量排行.xls')
#Script entry point: run the scraper once and report completion.
if __name__ == '__main__':
page_analyze()
print('实践小项目圆满完成,再接再励!')
备注
因为博主比较懒,不喜欢码字,本博客就只写具体实现代码,至于具体实现情况就不再赘述,大家如果要学习,请参照代码自行研究。