# Selenium ships with built-in support for "ghost" (headless) browsers.
import time
from multiprocessing import Pool

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait


class TaoBao(object):
    """Headless-Firefox crawler that searches Taobao and walks the result pages.

    Two browser sessions are kept: ``list_driver`` loads search-result pages
    and ``detail_driver`` loads product detail pages. Call :meth:`close`
    when finished so both Firefox processes are shut down.
    """

    # Options shared by both drivers. The '-headless' argument is used rather
    # than the ``headless`` property setter because the argument form is
    # accepted by both old and new Selenium releases.
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')

    def __init__(self):
        self.start_url = "https://www.taobao.com/"
        # ``options=`` replaces the deprecated ``firefox_options=`` keyword.
        self.list_driver = webdriver.Firefox(options=self.options)
        try:
            self.detail_driver = webdriver.Firefox(options=self.options)
        except Exception:
            # Do not leak the first browser if the second one fails to start.
            self.list_driver.quit()
            raise

    def start(self, url):
        """Open *url*, submit the search form, then walk the result pages.

        Args:
            url: Address of the page that contains the search box.
        """
        self.list_driver.get(url)
        search_input = WebDriverWait(self.list_driver, 15).until(
            lambda driver: driver.find_element(By.ID, 'q'))
        search_input.send_keys('笔记本电脑')
        self.list_driver.find_element(By.CLASS_NAME, 'btn-search').click()
        self.parse_list()

    def get_list(self, url):
        """Load the result page at *url* in the list browser."""
        self.list_driver.get(url)

    def parse_list(self, html=None):
        """Follow the "next page" link until no further page is available.

        The pages are visited in-process. Handing ``self.get_list`` to a
        ``multiprocessing.Pool`` cannot work: the bound method drags ``self``
        (and its live WebDriver sessions) through pickle, and a child process
        could not drive the parent's browser anyway.

        Args:
            html: Unused; kept so existing callers and callbacks that pass a
                positional argument keep working.
        """
        visited = set()
        while True:
            try:
                next_link = WebDriverWait(self.list_driver, 15).until(
                    lambda driver: driver.find_element(
                        By.CSS_SELECTOR, '.next > a'))
            except TimeoutException:
                # No "next" link showed up within the timeout: treat this
                # as the end of the result pages.
                break

            url = next_link.get_attribute('href')
            # Stop on a missing href or on a link that leads back to a page
            # already requested, otherwise the loop would never end.
            if not url or url in visited:
                break
            visited.add(url)

            self.get_list(url)
            print('------')

    def get_detail(self, detail_url, pay_num):
        """Load a detail page and collect the text of its 'tm-count' elements.

        Args:
            detail_url: Address of the product detail page.
            pay_num: Value carried over from the list page; it is printed and
                returned unchanged.

        Returns:
            dict: ``{'pay_num': pay_num, 'num': [text, ...]}``.
        """
        self.detail_driver.get(detail_url)
        print(pay_num)
        nums = WebDriverWait(self.detail_driver, 15).until(
            lambda driver: driver.find_elements(By.CLASS_NAME, 'tm-count'))
        return {
            'pay_num': pay_num,
            'num': [num.text for num in nums],
        }

    def save_mongo(self, data):
        """Report *data* as saved (currently it is only printed)."""
        print('保存数据', data)

    def close(self):
        """Quit both browser sessions."""
        try:
            self.list_driver.quit()
        finally:
            self.detail_driver.quit()


if __name__ == '__main__':
    taobao = TaoBao()
    try:
        taobao.start(taobao.start_url)
    finally:
        # Always shut the headless browsers down, even after an error.
        taobao.close()
    print('结束')
# Selenium Taobao example
# Latest recommended article published on 2024-05-16 16:37:49