from selenium import webdriver
import csv
import threading
import time
from lxml import etree
from queue import Queue
class BaiduSpider(object):
    """Search Baidu for '注册页面' in a Chrome browser and harvest result URLs.

    A single shared driver (``self.driver``) walks the search-result pages,
    pushing every result link onto a queue; a consumer thread drains the
    queue.  Detail pages, when fetched, each get their own throwaway driver.
    """

    def __init__(self):
        # Landing page for the search.
        self.url = 'https://www.baidu.com/'
        # Shared driver used only for paging through result lists.
        self.driver = webdriver.Chrome()
        # Result URLs awaiting detail processing.
        self.q = Queue()

    def get_first_page(self):
        """Open Baidu, type the query into the search box and submit it."""
        self.driver.get(self.url)
        self.driver.find_element_by_name('wd').send_keys('注册页面')
        self.driver.find_element_by_id('su').click()
        time.sleep(2)  # crude wait for the first result page to render

    def parse_page(self):
        """Parse the current result page held by the shared driver.

        Returns:
            tuple: ``(flag, hrefs)`` where ``flag`` is the CSS class of the
            last pager link (``'n'`` while a next-page link exists) and
            ``hrefs`` is the list of result URLs found on the page.
        """
        html = etree.HTML(self.driver.page_source)
        hrefs = html.xpath('//div[@class="result c-container "]/h3[@class="t"]/a/@href')
        flag = html.xpath('//div[@id="page"]/a[last()]/@class')[0]
        return flag, hrefs

    def parse_detail_page(self):
        """Worker loop: fetch each queued URL in its own browser and save it.

        Fixes over the original: the bare ``except`` is narrowed to
        ``Exception``, and ``driver.quit()`` now runs in ``finally`` so the
        per-URL browser cannot leak when ``driver.get`` itself raises
        (``get`` previously ran outside the try block entirely).
        """
        while True:
            if self.q.qsize():
                # Take the next detail-page URL off the queue.
                url = self.q.get()
                driver = webdriver.Chrome()
                try:
                    driver.get(url)
                    driver.page_source  # force the page to be fully loaded
                except Exception:
                    pass  # fetch failed: drop this URL, keep the worker alive
                else:
                    # Persist the successfully fetched URL locally.
                    self.save_new_url(url)
                finally:
                    driver.quit()
            else:
                time.sleep(10)  # queue empty: back off before polling again

    def save_new_url(self, url):
        """Append *url* as one row of ``new_url.csv``.

        The original code called this method without ever defining it, which
        raised ``AttributeError`` at runtime.  This minimal implementation
        uses the module's otherwise-unused ``csv`` import.
        NOTE(review): the target filename is a guess — adjust if a different
        sink was intended.
        """
        with open('new_url.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([url])

    def get_page_html(self):
        """Producer loop: walk every result page, queueing detail URLs."""
        # Fetch the first result page.
        self.get_first_page()
        while True:
            # Decide whether to keep crawling and collect this page's URLs.
            flag, urls = self.parse_page()
            # On the final page the last pager link loses the 'n' class.
            if flag != 'n':
                print('已爬取百度全部数据')
                break
            # Queue the detail-page URLs for the consumer thread.
            for url in urls:
                self.q.put(url)
            # Jump to the next result page.
            time.sleep(4)
            self.driver.find_element_by_xpath('//div[@id="page"]/a[last()]').click()

    def get_dtail_html(self):
        """Consumer loop: report queued detail URLs as they arrive.

        NOTE(review): the name keeps the original's typo ('dtail') because
        ``run()`` — and possibly external callers — reference it.  It only
        prints; ``parse_detail_page`` holds the full fetch-and-save logic
        but is never started by ``run()``.
        """
        while True:
            if self.q.qsize() != 0:
                print(self.q.qsize())
                url = self.q.get()
                print(url)
            else:
                time.sleep(5)

    def run(self):
        """Start the producer and consumer threads.

        Threads are non-daemon, so the process stays alive until both
        loops are interrupted.
        """
        # Producer: walks the result pages.
        producer = threading.Thread(target=self.get_page_html)
        producer.start()
        # Consumer: drains the URL queue.
        consumer = threading.Thread(target=self.get_dtail_html)
        consumer.start()
if __name__ == '__main__':
    # Entry point: build the spider and launch its worker threads.
    BaiduSpider().run()
# 使用selenium获取百度搜索内容 (using Selenium to fetch Baidu search results)
# 最新推荐文章于 2024-03-07 15:23:12 发布 (scraped article footer; commented out so the file parses)