由于上一章的网站有反爬机制,没有成果,这次换了个网站——汽车之家,下面我们来爬吧!!
上代码!!
# -*- coding: utf-8 -*-
import re
import requests
import openpyxl
from bs4 import BeautifulSoup
def askUrl(url):
    """Fetch *url* and return its body decoded as gb2312.

    Returns an empty string on any network/HTTP failure so callers
    can treat a failed page as "no data" instead of crashing.
    """
    try:
        # timeout prevents an unresponsive server from hanging the crawl forever
        req = requests.get(url, timeout=10)
        req.raise_for_status()  # treat 4xx/5xx as a failed fetch, not as page content
        req.encoding = 'gb2312'  # site serves legacy Chinese encoding
        return req.text
    except requests.exceptions.RequestException:
        # narrow catch: only network/HTTP errors, not KeyboardInterrupt etc.
        return ''
def getData(baseUrl):
    """Scrape 10 listing pages starting at *baseUrl* and return
    a list of [title_matches, score_matches] pairs, one per car entry.

    baseUrl is the listing URL prefix; page number + '.html' is appended.
    """
    # extracts the car name from the listing anchor (not the price)
    bold = re.compile(r'.*target="_self">(.*?)</a>')
    # extracts the rating score
    number = re.compile(r'<span class="score-number">(.*)</span>')
    datalist = []
    for i in range(10):
        url = baseUrl + str(i + 1) + '.html'
        print(url)
        html = askUrl(url)
        if not html:
            # fetch failed for this page; skip instead of parsing nothing
            continue
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_='list-cont'):
            item = str(item)  # regexes operate on the raw HTML string
            data = []
            title = re.findall(bold, item)
            data.append(title)
            score = re.findall(number, item)
            data.append(score)
            datalist.append(data)
    return datalist
def saveData(data, path):
    """Write scraped rows to an Excel workbook at *path*.

    data: list of rows, each row a list of two regex-match lists
          ([title_matches, score_matches]); an empty match list
          leaves that cell blank.
    """
    book = openpyxl.Workbook()
    sheet = book.create_sheet("cars")
    col = ('名称', '评分')
    sheet.append(col)  # header row
    # iterate the actual data instead of a hardcoded range(150):
    # avoids IndexError on short runs and dropped rows on long ones
    for i, row in enumerate(data):
        for j in range(2):
            if row[j]:
                # row + 2: row 1 is the header; first match is the value
                sheet.cell(row=i + 2, column=j + 1, value=row[j][0])
    book.save(path)
# entry point
def main():
    """Crawl the 80k–120k RMB car listings and save them to Excel."""
    print("开始爬取......")
    # listing URL for cars priced 80,000–120,000 RMB
    baseurl = 'https://car.autohome.com.cn/price/list-8_12-0-0-0-0-0-0-0-0-0-0-0-0-0-0-'
    datalist = getData(baseurl)
    # openpyxl writes the xlsx (zip) format; a .xls extension would
    # produce a file Excel refuses to open without warnings
    save_path = 'cars.xlsx'
    saveData(datalist, save_path)

if __name__ == '__main__':
    main()
    print("爬取完成")
展示成果!!