Python使用Selenium模块模拟浏览器抓取斗鱼直播间信息示例

本文实例讲述了Python使用Selenium模块模拟浏览器抓取斗鱼直播间信息。分享给大家供大家参考,具体如下:

import time

from multiprocessing import Pool

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup

from pymongo import MongoClient

from pymongo.errors import PyMongoError

# MongoDB configuration
MONGO_HOST = "localhost"
MONGO_DATABASE = "douyu"
MONGO_TABLE = "zhibo"

client = MongoClient(host=MONGO_HOST)
db = client[MONGO_DATABASE]

# PhantomJS command-line options
# See http://phantomjs.org/api/command-line.html
SERVICE_ARGS = ['--disk-cache=true', '--load-images=false']

# driver = webdriver.Chrome() # headed browser (visible window)
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS) # headless browser

# Maximum number of seconds explicit waits poll before raising TimeoutException.
delay = 10
wait = WebDriverWait(driver, delay)

driver.maximize_window()

def get_total_pages():
    """Open the first page of the Douyu directory and return the total page count.

    Side effects: navigates the global ``driver`` to the directory URL,
    scrapes the first page's rooms and stores them in MongoDB.

    Returns:
        int: total number of pages reported by the pager widget.
    """
    url = 'https://www.douyu.com/directory/all'
    driver.get(url)
    # The pager item immediately after the "..." dot shows the last page number.
    pages = int(driver.find_element_by_css_selector(
        '.shark-pager-dot + .shark-pager-item').text)
    print("正在获取第1页数据")
    # get_rooms_by_beautifulsoup() is a generator; save_to_monogodb consumes it.
    room_list = get_rooms_by_beautifulsoup()
    save_to_monogodb(room_list)
    return pages

# 根据页码获取指定页数据,并将其保存到数据库中

def parse_page(page_num, retries_left=3):
    """Jump to page *page_num* of the directory, scrape it, and save to MongoDB.

    Types the page number into the pager's jump box, clicks "go", waits for
    the pager to highlight the requested page, then parses and persists the
    room list. On ``TimeoutException`` the page is retried.

    Args:
        page_num: 1-based page number to fetch.
        retries_left: how many more times to retry on timeout before giving
            up. The original code recursed unconditionally, which could
            recurse forever (and eventually overflow the stack) if the site
            kept timing out; this bounds the retries while remaining
            backward-compatible for existing callers.
    """
    print("正在获取第%d页数据" % page_num)
    try:
        page_num_box = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "input.jumptxt")))
        go_btn = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'a.shark-pager-submit')))
        page_num_box.clear()
        page_num_box.send_keys(page_num)
        go_btn.click()
        # Wait until the pager marks the requested page as current — that is
        # the signal that the new page's room list has actually loaded.
        # Note: By.CLASS_NAME cannot be used for this selector; compound
        # class names ("shark-pager-item current") are not permitted.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '.shark-pager-item.current'),
                str(page_num)))
        room_list = get_rooms_by_beautifulsoup()
        save_to_monogodb(room_list)
    except TimeoutException:
        print("请求第%d页失败" % page_num)
        if retries_left > 0:
            print("尝试重新获取第%d页" % page_num)
            return parse_page(page_num, retries_left - 1)

# 通过bs4解析数据

def get_rooms_by_beautifulsoup():
    """Parse the current page with BeautifulSoup and yield one dict per room.

    Extracts the room name, viewer count, category tag and host name from
    each ``<li>`` in the live-list container of ``driver.page_source``.

    Yields:
        dict: keys ``room_name``, ``view_count``, ``tag``, ``hostname``
        (all str). This is a generator — nothing is parsed until iterated.
    """
    # Make sure at least one room entry is present before grabbing the HTML.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "ul#live-list-contentbox > li")))
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')
    rooms = soup.select('ul#live-list-contentbox > li')
    for room in rooms:
        # strip=True trims the surrounding whitespace the page markup adds.
        room_name = room.find(
            'h3', attrs={'class': 'ellipsis'}).get_text(strip=True)
        view_count = room.find('span', class_='dy-num fr').text
        tag = room.find('span', class_='tag ellipsis').text
        hostname = room.find('span', class_='dy-name ellipsis fl').text
        yield {
            'room_name': room_name,
            'view_count': view_count,
            'tag': tag,
            'hostname': hostname,
        }

def save_to_monogodb(room_list):
    """Insert every room dict from *room_list* into the MongoDB collection.

    Each document is inserted individually so that one bad document does not
    abort the rest of the batch; failures are reported and skipped.

    Args:
        room_list: iterable of room dicts (e.g. the generator returned by
            ``get_rooms_by_beautifulsoup``).
    """
    for room in room_list:
        try:
            # Collection.insert() is deprecated (and removed in PyMongo 4);
            # insert_one() is the supported single-document API.
            db[MONGO_TABLE].insert_one(room)
            print("mongodb插入数据成功:", room)
        except PyMongoError as e:
            print("mongodb插入数据失败:", room, e)

if __name__ == '__main__':
    # Pre-initialize so the summary in `finally` cannot raise NameError
    # when get_total_pages() itself fails before assigning total_pages.
    total_pages = 0
    try:
        total_pages = get_total_pages()
        # Page 1 was already scraped inside get_total_pages().
        for page_num in range(2, total_pages + 1):
            parse_page(page_num)
    except Exception as e:
        print("出错了", e)
    finally:  # make sure the browser shuts down no matter what
        print("共有%d页" % total_pages)
        # quit() terminates the whole PhantomJS process; close() only closes
        # the current window and can leave the process running.
        driver.quit()

更多关于Python相关内容可查看本站的Python专题文章。

希望本文所述对大家Python程序设计有所帮助。

希望与广大网友互动!

点此进行留言吧!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值