"""
使用Selenium库实现的网页爬虫,用于爬取"黑马头条"相关的数据。
- 初始化一个Chrome浏览器实例。
- 打开指定的URL(http://jxd.itheima.net/#/login)。
- 登录网站。
- 进入"服务人员注册"页面。
- 获取表格标题和数据。
- 将获取到的数据保存到JSON文件中。
- 运行爬虫程序。
"""
from selenium import webdriver
import json
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class JiXinDa:
    """Crawler for the service-staff registration table on jxd.itheima.net.

    Logs in, filters the table by ``keyword``, then walks every page of
    results and appends each page's rows to ``jixinda.json``.
    """

    def __init__(self, keyword='黑马头条'):
        # keyword: text typed into both filter inputs on the search form.
        # Default preserves the original hard-coded behavior.
        self.url = "http://jxd.itheima.net/#/login"
        self.keyword = keyword
        self.d = webdriver.Chrome()
        # Explicit wait (was imported but unused); replaces blind sleeps
        # for element lookup so slow page loads don't raise NoSuchElement.
        self.wait = WebDriverWait(self.d, 10)

    # 查找元素
    def element(self, xpath):
        """Return the first element matching *xpath*, waiting up to 10 s
        for it to be present in the DOM."""
        return self.wait.until(
            EC.presence_of_element_located((By.XPATH, xpath)))

    # 等待1秒
    def t(self):
        """Fixed 1-second pause for UI animations the explicit wait
        cannot observe (menu expansion, table refresh)."""
        time.sleep(1)

    # 登录网站并进入服务人员注册页面
    def ltf(self):
        """Log in, open the "服务人员注册" page and submit the keyword filter."""
        self.d.get(self.url)
        self.d.maximize_window()
        dl = self.element('//*[@id="app"]/div/div/div[2]/div/button')
        dl.click()
        # Hover-then-click is required for the cascading side menu.
        dxfw = self.element('//*[@id="app"]/div/div[1]/ul/li[2]/div/i')
        self.t()
        webdriver.ActionChains(self.d).move_to_element(dxfw).click(dxfw).perform()
        fwrz = self.element('//*[@id="app"]/div/div[1]/ul/li[2]/ul/li/ul/li[5]/span')
        self.t()
        webdriver.ActionChains(self.d).move_to_element(fwrz).click(fwrz).perform()
        self.t()
        # Fill both filter inputs with the keyword, then submit the search.
        inp = self.element('//*[@id="app"]/div/div[2]/div[1]/div/div/div/div/div[3]/div[1]/form/div/div[1]/div/div/div/input')
        self.t()
        inp.click()
        inp.send_keys(self.keyword)
        self.t()
        inp2 = self.element('//*[@id="app"]/div/div[2]/div[1]/div/div/div/div/div[3]/div[1]/form/div/div[4]/div/div/div/input')
        inp2.click()
        inp2.send_keys(self.keyword)
        ss = self.element('//*[@id="app"]/div/div[2]/div[1]/div/div/div/div/div[3]/div[1]/form/div/div[5]/div/div/div/button[1]')
        ss.click()

    # 获取表格标题和数据
    def get_data(self):
        """Scrape the current table page.

        Returns a list of dicts, one per row, keyed by the 8 header titles.
        """
        title_li = []
        for i in range(1, 9):
            title = self.element(f'//*[@id="app"]/div/div[2]/div[1]/div/div/div/div/div[3]/div[2]/div[1]/div[2]/table/thead/tr/th[{i}]').text
            title_li.append(title)
        cline = []
        num = self.d.find_elements(By.XPATH, '//*[@id="app"]/div/div[2]/div[1]/div/div/div/div/div[3]/div[2]/div[1]/div[3]/table/tbody/tr')
        # XPath indices are 1-based, hence range(1, len+1).
        for i in range(1, len(num) + 1):
            c = self.element(f'//*[@id="app"]/div/div[2]/div[1]/div/div/div/div/div[3]/div[2]/div[1]/div[3]/table/tbody/tr[{i}]').text
            # Each cell renders on its own line of the row's .text.
            cline.append(dict(zip(title_li, c.splitlines())))
        return cline

    # 保存数据到JSON文件
    def save_data(self, data):
        """Append *data* to ``jixinda.json`` as one JSON document per line
        group.

        Returns True on success, False on failure.  A trailing newline
        separates consecutive dumps so appended documents do not fuse
        into invalid ``}{`` runs.
        """
        try:
            with open('jixinda.json', 'a', encoding='utf-8') as f:
                f.write(json.dumps(data, ensure_ascii=False, indent=2))
                f.write('\n')
        except OSError as e:
            print(e)
            return False
        return True

    # 运行爬虫程序
    def run(self):
        """Crawl every result page, saving each, then shut the browser down."""
        self.ltf()
        num = 1
        try:
            while True:
                # Scrape and persist the page we are currently on.
                data = self.get_data()
                print(f'Saving P{num}')
                self.save_data(data)
                print(data)
                # button[2] is the "next page" control; it is disabled
                # on the last page.
                button = self.element('//*[@id="app"]/div/div[2]/div[1]/div/div/div/div/div[3]/div[2]/div[2]/button[2]')
                if not button.is_enabled():
                    break
                button.click()
                self.t()
                num += 1
        finally:
            # quit() (not close()) ends the whole session and terminates
            # the chromedriver process even if scraping raised.
            self.d.quit()


if __name__ == '__main__':
    jxd = JiXinDa()
    jxd.run()