# 虎牙直播爬虫项目 (Huya live-stream crawler project):
#导入selenium相关的工具
from selenium import webdriver
from lxml import etree
class Huya(object):
#初始化
def __init__(self):
    """Create the browser driver and reset the crawl statistics.

    ``webdriver.PhantomJS`` was deprecated in Selenium 3.8.1 and removed
    in Selenium 4, so it is used only when the installed Selenium still
    exposes it; otherwise a headless Chrome driver is started instead.
    """
    # Pages are loaded through a browser driver.
    phantomjs = getattr(webdriver, 'PhantomJS', None)
    if phantomjs is not None:
        # Older Selenium: keep the original PhantomJS behaviour.
        self.driver = phantomjs()
    else:
        # Selenium 4+: PhantomJS is gone, fall back to headless Chrome.
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)
    # Statistics to be collected.
    self.room_count = 0  # number of rooms
    self.audience_count = 0  # audience count of the rooms
#执行爬虫
def run(self):
#打开网页
self.driver.get('https://www.huya.com/l')
#爬取相关的数据
content = etree.HTML(self.driver.page_source) #获取并解析网页的源码
#获取房间信息
rooms = content.xpath('.//ul[@id="js-live-list"]/li')
for room in rooms:
#获取房间名称
roomname = ''
tmp = room.xpath('./a[contains(@class,"title")]/text()')
if len(tmp) > 0:
roomname = tmp[0]
#获取房间人气
audience = ''
tmp &#