# 学习爬虫有一段时间,第一次使用博客,写了个斗鱼爬虫发出来
# (Author's note: "Been learning web scraping for a while; first blog post — a Douyu scraper.")
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
class douyuspider:
    """Scrape live-room listings from Douyu (douyu.com) with Selenium.

    Workflow: collect every category URL from the directory page, then walk
    each category's paginated room grid, appending one JSON record per room
    to ``douyu.json``.
    """

    def driver_chrome(self, url):
        """Launch a Chrome WebDriver, navigate it to *url*, and return it.

        The caller owns the returned driver and must ``quit()`` it.
        """
        driver = webdriver.Chrome()
        driver.get(url)
        return driver

    def start_spider(self):
        """Return the unique category URLs found on the directory page."""
        url = "https://www.douyu.com/directory"
        driver = self.driver_chrome(url)
        try:
            time.sleep(1)  # crude wait for the JS-rendered category grid
            soup = BeautifulSoup(driver.page_source, 'lxml')
            url_list = []
            for card in soup.find_all(class_='layout-Classify-item'):
                link = card.find(class_='layout-Classify-card secondCateCard')
                # Skip cards missing the expected anchor instead of crashing.
                if link is None or not link.get("href"):
                    continue
                full_url = "https://www.douyu.com" + link.get("href")
                # De-duplicate in place while preserving discovery order
                # (replaces the original second pass over the list).
                if full_url not in url_list:
                    url_list.append(full_url)
            print(url_list)
            return url_list
        finally:
            # quit() ends the browser process; close() only closes the window.
            driver.quit()

    def spider(self, url):
        """Scrape every page of one category, saving each room as found."""
        driver = self.driver_chrome(url)
        try:
            while True:
                time.sleep(1)  # wait for the room grid to render
                # Re-parse AFTER each page change so we never scrape stale HTML.
                soup = BeautifulSoup(driver.page_source, 'lxml')
                for card in soup.find_all(class_='layout-Cover-item'):
                    # Fresh dict per room so records cannot bleed into each other.
                    item = {
                        "title": card.find(class_='DyListCover-intro').get_text(),
                        "room_name": card.find(class_='DyListCover-zone').get_text(),
                        "anchor": card.find(class_='DyListCover-user').get_text(),
                    }
                    self.save(item)
                # The "next page" button carries aria-disabled="false" while more
                # pages remain; a missing button means there is only one page.
                next_btn = soup.find(title='下一页')
                if next_btn is None or next_btn.get("aria-disabled") != "false":
                    break
                # Selenium 4 API; find_element_by_class_name was removed.
                driver.find_element(By.CLASS_NAME, 'dy-Pagination-next').click()
        finally:
            driver.quit()

    def save(self, item):
        """Append *item* as one JSON line to douyu.json and return it.

        Newline-delimited JSON keeps the file parseable line-by-line;
        appending indented objects back-to-back (the old behavior) produced
        a file no JSON parser could read.
        """
        # utf-8 is required because ensure_ascii=False writes raw Chinese text.
        with open("douyu.json", "a", encoding="utf-8") as f:
            json.dump(dict(item), f, ensure_ascii=False)
            f.write("\n")
        return item

    def run_spider(self):
        """Entry point: crawl every category discovered on the directory page."""
        for url in self.start_spider():
            self.spider(url)
if __name__ == '__main__':
    # Build the scraper and kick off the full crawl.
    douyu = douyuspider()
    douyu.run_spider()