目标:
1、练习用Python抓取页面动态数据
2、掌握selenium.webdriver的find_element_by_xpath方法
3、入口地址https://k.autohome.com.cn/mpv1/
4、目标数据页面如下
效果图
工具
1、Python 3.7.0
2、PyCharm社区版 2019.1
venv环境
requirements.txt
beautifulsoup4==4.8.0
bs4==0.0.1
certifi==2019.6.16
chardet==3.0.4
EasyProcess==0.2.7
idna==2.8
itchat==1.3.10
Jinja2==2.10.1
MarkupSafe==1.1.1
phantomjs-binary==2.1.3
prettytable==0.7.2
pyecharts==1.4.0
pypng==0.0.20
PyQRCode==1.2.1
PyVirtualDisplay==0.2.4
requests==2.22.0
selenium==3.141.0
simplejson==3.16.0
soupsieve==1.9.2
urllib3==1.25.3
代码
getCarHotLevel.py
from selenium import webdriver
from selenium.webdriver.support.expected_conditions import _find_element
import re
import os
import time
from urllib.request import urlopen
import urllib
from bs4 import BeautifulSoup
import sys
# python getCarHotLevel.py https://k.autohome.com.cn/mpv1/ mpv
def writeFile(dirname, filename, content):
    """Append ``content`` to the file ``filename`` located in ``dirname``.

    The file is created when it does not exist yet; existing content is
    preserved because the file is opened in append mode, so repeated calls
    accumulate text. The full path is printed before writing.

    Args:
        dirname: Directory that holds the file (it must already exist).
        filename: Name of the file inside ``dirname``.
        content: Text to append.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    fullpathfilename = '{}{}{}'.format(dirname, os.sep, filename)
    print(fullpathfilename)
    # The encoding is pinned to UTF-8 so non-ASCII text is written the same
    # way regardless of the locale's default encoding; the ``with`` block
    # closes the handle even when ``write`` raises.
    with open(fullpathfilename, 'a', encoding='utf-8') as file_open:
        file_open.write(content)
def getHtml(url, timeout=30):
    """Fetch ``url`` and return the raw response body as bytes.

    A desktop Firefox ``User-Agent`` header is sent with the request.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the script forever.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        urllib.error.URLError: If the request fails or times out.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}
    request = urllib.request.Request(url, headers=headers)
    # The response is used as a context manager so the underlying
    # connection is released even if reading the body fails.
    with urllib.request.urlopen(request, timeout=timeout) as page:
        return page.read()
class TextMatch(object):
    """Callable that checks whether a located element's text matches a regex.

    An instance is called with a driver, looks up one element and searches
    its text. NOTE(review): the call signature mirrors Selenium's expected
    conditions, so this is presumably meant for ``WebDriverWait.until`` --
    nothing in this file uses it, confirm before relying on that.
    """

    def __init__(self, locator, regexp):
        """Store the element locator and the pattern to search for.

        Args:
            locator: ``(by, value)`` pair that is unpacked into
                ``driver.find_element``.
            regexp: Regular expression searched in the element's text.
        """
        self.locator = locator
        self.regexp = regexp

    def __call__(self, driver):
        """Return the ``re`` match object for the element's text, or None.

        Exceptions raised by ``driver.find_element`` propagate unchanged.
        """
        # The public ``find_element`` API is used instead of the private
        # ``_find_element`` helper from ``expected_conditions``.
        element_text = driver.find_element(*self.locator).text
        return re.search(self.regexp, element_text)
USAGE = 'usage: python getCarHotLevel.py <car-list-url> <log-name>'

# Per-car page; the placeholder receives the numeric car id.
CAR_PAGE_URL = 'https://www.autohome.com.cn/{}/#pvareaid=3454477'

# Elements clicked, in this order, before the chat counters are read.
CHAT_CLICK_XPATHS = (
    '//*[@id="hot-chat-room"]/section/section/div/div[1]/ul/li/div/span',
    '//*[@id="hot-chat-room"]/section/section/div/div[2]/p',
)
# Elements holding the current chat head-count and the car name.
TALK_NUM_XPATH = '//*[@id="hot-chat-room"]/section/div/div/div/span[1]/span[1]'
CAR_NAME_XPATH = '//*[@id="hot-chat-room"]/section/div/div/div/span[1]/span[2]'


def _collect_car_ids(html):
    """Return the distinct car ids linked from ``html``, in page order.

    Only ``<a>`` tags whose ``href`` is exactly ``/<digits>/`` are used;
    anchors without an ``href`` are ignored.
    """
    soup = BeautifulSoup(html, 'html.parser')
    car_ids = []
    for anchor in soup.find_all('a', href=True):
        matched = re.fullmatch(r'/(\d+)/', anchor['href'])
        if matched is not None and matched.group(1) not in car_ids:
            car_ids.append(matched.group(1))
    return car_ids


def _read_counters(driver):
    """Return ``(car_name, talk_num)`` text read from the loaded page."""
    talk_num = driver.find_element_by_xpath(TALK_NUM_XPATH)
    car_name = driver.find_element_by_xpath(CAR_NAME_XPATH)
    return car_name.text, talk_num.text


def _scrape_car(driver, url):
    """Load ``url`` and return ``(car_name, talk_num)``, or None on failure.

    The page is given five seconds to render, then the chat-room elements
    are clicked and the counters are read. If any of that fails, the
    counters are read once more without clicking; if that also fails, the
    error is reported on stderr and None is returned so the caller can
    continue with the next car.
    """
    # Local import: the file's import header is left untouched.
    from selenium.common.exceptions import WebDriverException

    driver.get(url)
    time.sleep(5)
    try:
        for xpath in CHAT_CLICK_XPATHS:
            driver.find_element_by_xpath(xpath).click()
        time.sleep(5)
        return _read_counters(driver)
    except WebDriverException:
        # NOTE(review): presumably the clickable elements are missing when
        # the counters are already visible -- confirm against the live page.
        time.sleep(5)
        try:
            return _read_counters(driver)
        except WebDriverException as err:
            print('skip {}: {}'.format(url, err), file=sys.stderr)
            return None


def main(argv):
    """Scrape the chat head-count of every car linked from a listing page.

    Args:
        argv: Command-line vector: ``argv[1]`` is the listing URL and
            ``argv[2]`` the log name; results are appended to
            ``/tmp/<argv[2]>.log``.

    Returns:
        Process exit status: 0 on success, 1 when arguments are missing.
    """
    # Validate before indexing so missing arguments produce a usage message
    # instead of an IndexError.
    if len(argv) < 3:
        print(USAGE, file=sys.stderr)
        return 1
    list_url, cartype = argv[1], argv[2]

    car_ids = _collect_car_ids(getHtml(list_url))

    option = webdriver.ChromeOptions()
    option.add_argument('--headless')  # run Chrome in the background
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-extensions')
    option.add_argument('--disable-gpu')  # no GPU / UI needed
    driver = webdriver.Chrome(options=option)
    try:
        driver.implicitly_wait(10)
        # NOTE(review): the original always visits car 2492 first;
        # presumably a warm-up request -- kept as is, confirm it is needed.
        driver.get(CAR_PAGE_URL.format('2492'))
        time.sleep(5)

        for car_id in car_ids:
            url = CAR_PAGE_URL.format(car_id)
            counters = _scrape_car(driver, url)
            if counters is None:
                continue
            car_name, talk_num = counters
            # Record layout: timestamp!car name!head-count!url
            content = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '!' + car_name + '!' + str(
                talk_num) + '!' + url + '\n'
            print(content)
            writeFile('/tmp', cartype + '.log', content)
    finally:
        # Always shut the browser down, even when scraping aborts.
        driver.quit()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
运行
运行之后会将结果写入到 /tmp/mpv.log文件中
tail -F /tmp/mpv.log
进行查看
另外
也可以获取其他车型的讨论热度
参考
python getCarHotLevel.py https://k.autohome.com.cn/mpv1/ mpv
现在要得到suv的热度,则运行
python getCarHotLevel.py https://k.autohome.com.cn/suvc1/ suvc1
同样的,会在/tmp路径下生成一个suvc1.log的文件
思考
Q:为什么要用selenium来抓数据,而不用urllib或者其他的?
A:因为selenium可以更好地控制抓取数据的时机,尤其是对于动态数据。本文的例子中,通过测试可以看到目标页面的目标数据大概要3至5秒才会展现出来;urllib只获取服务器返回的原始HTML,不会执行页面脚本,数据还没出来就进行抓取,会导致抓不到数据。其次,selenium提供了XPath等灵活的元素定位方式,还可以结合正则匹配,定制化程度可以更高。
Q:这个程序可以部署到其他Linux机器上吗?
A:可以的,经验证,即使是没有桌面环境的CentOS,安装好webdriver等软件包后,程序依然可以正常获取数据。
鸣谢
感谢汽车某家提供数据支持,本文内容仅供学习,请勿做任何商业用途。