初学 selenium，尝试着爬取网易云音乐。
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
class yun_spider:
    """Scrape NetEase Cloud Music (music.163.com) playlists.

    Navigation is three levels deep:
      1. the playlist-discovery page -> category links,
      2. each category page (paginated via the "下一页" link) -> playlist links,
      3. each playlist page -> track titles, dumped to a .txt file.

    NetEase renders all content inside an iframe named 'contentFrame',
    so every page load is followed by a switch_to.frame call.
    """

    def __init__(self, save_dir='D:/test'):
        """Start a Chrome driver and chdir to the output directory.

        save_dir: directory the track lists are written into
                  (default keeps the original hard-coded 'D:/test').
        """
        self.url = 'https://music.163.com/#/discover/playlist/'
        self.driver = webdriver.Chrome()
        os.chdir(save_dir)

    def request_url1_list(self, url):
        """Return the href of every category link on the discovery page."""
        self.driver.get(url)
        self.driver.switch_to.frame('contentFrame')
        # find_elements_by_xpath was removed in Selenium 4;
        # find_elements(By.XPATH, ...) is the supported form.
        elements = self.driver.find_elements(By.XPATH, '//a[@class="s-fc1 "]')
        return [e.get_attribute('href') for e in elements]

    def main(self):
        """Entry point: scrape every category, then quit the browser."""
        try:
            for url in self.request_url1_list(self.url):
                self.request_url2_list(url)
        finally:
            # Always close the browser, even if scraping raises.
            self.driver.quit()

    def request_url2_list(self, url):
        """Scrape one category page and all of its successor pages.

        Hrefs are collected up front: navigating away (request_url3)
        detaches the remaining WebElements from the page document, which
        is exactly the "element is not attached to the page document"
        error described below.
        """
        while url:
            self.driver.get(url)
            self.driver.switch_to.frame('contentFrame')
            anchors = self.driver.find_elements(
                By.XPATH, '//ul[@class="m-cvrlst f-cb"]//p[@class="dec"]/a')
            playlist_urls = [a.get_attribute('href') for a in anchors]
            # Read the next-page href BEFORE visiting any playlist: once the
            # driver leaves this page, its elements can no longer be queried.
            # (The original looked it up afterwards, so the bare `except`
            # always fired and pagination silently never happened.)
            next_links = self.driver.find_elements(By.XPATH, '//a[text()="下一页"]')
            url = next_links[0].get_attribute('href') if next_links else None
            for final_url in playlist_urls:
                # Skip javascript:/relative hrefs; only real playlist links.
                if final_url and final_url.startswith('https'):
                    self.request_url3(final_url)

    def request_url3(self, final_url):
        """Open one playlist page and write its track titles to a .txt file
        named after (a prefix of) the playlist title."""
        self.driver.get(final_url)
        self.driver.switch_to.frame('contentFrame')
        file_name = self.driver.find_element(
            By.XPATH, '//h2[@class="f-ff2 f-brk"]').text
        # Strip every character Windows forbids in file names, not just '|',
        # otherwise open() fails on titles containing e.g. '?' or ':'.
        for ch in '\\/:*?"<>|':
            file_name = file_name.replace(ch, '')
        song_elements = self.driver.find_elements(
            By.XPATH, '//div[@class="j-flag"]//b')
        # Keep the original 3-character truncation for short file names.
        with open(file_name[:3] + '.txt', 'w', encoding='utf-8') as f:
            for song in song_elements:
                f.write(song.get_attribute('title') + '\n')
if __name__ == '__main__':
    # Launch the scrape only when run as a script, not on import.
    yun = yun_spider()
    yun.main()
在测试的过程中遇到了这个问题
element is not attached to the page document
原先的代码为:
element_list = self.driver.find_elements_by_xpath('//ul[@class="m-cvrlst f-cb"]//p[@class="dec"]/a')
for element in element_list:
final_url = element.get_attribute('href')
self.request_url3(final_url)
根据百度得到的结果是
元素失效，此类问题通常是因为页面刷新之后，未重新获取元素导致的。
解决的方法为:在页面刷新之后,重新获取一下元素
但尝试几次后仍未能解决,只好用笨办法,一次性将所有的链接加入列表中,遍历列表中的地址来访问…