Douyu Scraper

I've been learning web scraping for a while. This is my first blog post, so I'm sharing a Douyu scraper I wrote.

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import json


class DouyuSpider:

    # Launch a Chrome browser and open the given URL
    def driver_chrome(self, url):
        driver = webdriver.Chrome()
        driver.get(url)
        return driver

    # Start the crawl: collect the link of every category from the directory page
    def start_spider(self):
        url = "https://www.douyu.com/directory"
        driver = self.driver_chrome(url)
        time.sleep(1)
        url_list = []
        soup = BeautifulSoup(driver.page_source, 'lxml')
        for i in soup.find_all(class_='layout-Classify-item'):
            r_url = i.find(class_='layout-Classify-card secondCateCard').get("href")
            r_url = "https://www.douyu.com" + r_url
            url_list.append(r_url)
        # Remove duplicate links while preserving their order
        url_list2 = []
        for i in url_list:
            if i not in url_list2:
                url_list2.append(i)
        print(url_list2)
        driver.close()
        return url_list2

    # Scrape every room in one category, following the pagination
    def spider(self, url):
        driver = self.driver_chrome(url)
        time.sleep(1)
        last_page = False
        while not last_page:
            # Parse the page that is currently loaded in the browser
            soup = BeautifulSoup(driver.page_source, 'lxml')
            for i in soup.find_all(class_='layout-Cover-item'):
                item = {}
                item["title"] = i.find(class_='DyListCover-intro').get_text()
                item["room_name"] = i.find(class_='DyListCover-zone').get_text()
                item["anchor"] = i.find(class_='DyListCover-user').get_text()
                self.save(item)
            # The "next page" button carries an aria-disabled attribute:
            # "false" while there is a next page, "true" on the last page
            try:
                disabled = soup.find(title='下一页').get("aria-disabled")
                print(disabled)
            except AttributeError:
                # No pagination element at all: the category has a single page
                break
            if disabled == "false":
                driver.find_element(By.CLASS_NAME, 'dy-Pagination-next').click()
                time.sleep(1)
            else:
                last_page = True
        driver.close()

    # Save one record: append it to douyu.json as a single JSON line
    def save(self, item):
        with open("douyu.json", "a", encoding="utf-8") as f:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")
        return item

    # Entry point: crawl every category collected from the directory page
    def run_spider(self):
        url_list = self.start_spider()
        for url in url_list:
            self.spider(url)


if __name__ == '__main__':
    spider = DouyuSpider()
    spider.run_spider()
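
Two optional tweaks that go well with the script above. First, since save() appends one JSON object per line to douyu.json, the results can be read back as JSON Lines; a minimal sketch (the variable names are just illustrative):

import json

# Load every record that save() appended to douyu.json (one JSON object per line)
rooms = []
with open("douyu.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            rooms.append(json.loads(line))
print(f"scraped {len(rooms)} rooms")

Second, if you would rather not have a Chrome window pop up for every category, the browser can be started headless. A sketch of an alternative launcher, assuming a Selenium/Chrome setup that supports the --headless flag (the name driver_chrome_headless is just an illustration):

from selenium import webdriver

# Same as driver_chrome, but without a visible browser window
def driver_chrome_headless(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    return driver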
