# 学习爬虫有一段时间,第一次使用博客,写了个斗鱼爬虫发出来
# (Author's note: "Been learning web scraping for a while; first blog post — a Douyu scraper.")
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
class douyuspider:
    """Scrape live-room listings from Douyu (douyu.com) with Selenium.

    Workflow: collect every category URL from the directory page, then walk
    each category's paginated room grid, appending one JSON record per room
    to ``douyu.json``.
    """

    def driver_chrome(self, url):
        """Launch a Chrome WebDriver, navigate it to *url*, and return it.

        The caller owns the returned driver and must ``quit()`` it.
        """
        driver = webdriver.Chrome()
        driver.get(url)
        return driver

    def start_spider(self):
        """Return the unique category URLs found on the directory page."""
        url = "https://www.douyu.com/directory"
        driver = self.driver_chrome(url)
        try:
            time.sleep(1)  # crude wait for the JS-rendered category grid
            soup = BeautifulSoup(driver.page_source, 'lxml')
            url_list = []
            for card in soup.find_all(class_='layout-Classify-item'):
                link = card.find(class_='layout-Classify-card secondCateCard')
                # Skip cards missing the expected anchor instead of crashing.
                if link is None or not link.get("href"):
                    continue
                full_url = "https://www.douyu.com" + link.get("href")
                # De-duplicate in place while preserving discovery order
                # (replaces the original second pass over the list).
                if full_url not in url_list:
                    url_list.append(full_url)
            print(url_list)
            return url_list
        finally:
            # quit() ends the browser process; close() only closes the window.
            driver.quit()

    def spider(self, url):
        """Scrape every page of one category, saving each room as found."""
        driver = self.driver_chrome(url)
        try:
            while True:
                time.sleep(1)  # wait for the room grid to render
                # Re-parse AFTER each page change so we never scrape stale HTML.
                soup = BeautifulSoup(driver.page_source, 'lxml')
                for card in soup.find_all(class_='layout-Cover-item'):
                    # Fresh dict per room so records cannot bleed into each other.
                    item = {
                        "title": card.find(class_='DyListCover-intro').get_text(),
                        "room_name": card.find(class_='DyListCover-zone').get_text(),
                        "anchor": card.find(class_='DyListCover-user').get_text(),
                    }
                    self.save(item)
                # The "next page" button carries aria-disabled="false" while more
                # pages remain; a missing button means there is only one page.
                next_btn = soup.find(title='下一页')
                if next_btn is None or next_btn.get("aria-disabled") != "false":
                    break
                # Selenium 4 API; find_element_by_class_name was removed.
                driver.find_element(By.CLASS_NAME, 'dy-Pagination-next').click()
        finally:
            driver.quit()

    def save(self, item):
        """Append *item* as one JSON line to douyu.json and return it.

        Newline-delimited JSON keeps the file parseable line-by-line;
        appending indented objects back-to-back (the old behavior) produced
        a file no JSON parser could read.
        """
        # utf-8 is required because ensure_ascii=False writes raw Chinese text.
        with open("douyu.json", "a", encoding="utf-8") as f:
            json.dump(dict(item), f, ensure_ascii=False)
            f.write("\n")
        return item

    def run_spider(self):
        """Entry point: crawl every category discovered on the directory page."""
        for url in self.start_spider():
            self.spider(url)
if __name__ == '__main__':
    # Build the scraper and kick off the full crawl.
    douyu = douyuspider()
    douyu.run_spider()