这是一个用于爬取扇贝单词书的脚本。
运行后会在 .py 文件所在目录生成一个名为 out.txt 的输出文件。
主要使用了selenium库(webdriver)
使用方式:
更改以下两处:
第13行中指向 webdriver 驱动程序的路径(代码中使用的是 Firefox 的驱动程序 geckodriver):
Path = r'C:\Users\pc\Downloads\geckodriver-v0.19.1-win64\geckodriver.exe'
第15行中的单词书网页根目录:
rootdir = "https://www.shanbay.com/wordbook/6403/"
运行,并且赞美太阳
source code:
# coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
import sys
# Python 2 only: ``sys.setdefaultencoding`` is reachable only after
# ``reload(sys)``; it makes implicit str/unicode conversions use UTF-8.
# Python 3 has no ``reload`` builtin, so the hack is skipped there instead of
# crashing with a NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Location of the geckodriver executable handed to ``webdriver.Firefox``.
# Machine-specific: edit before running.
Path = r'C:\Users\pc\Downloads\geckodriver-v0.19.1-win64\geckodriver.exe'

# Output file for the scraped text. The path is relative, so it is resolved
# against the current working directory; opening with "w" truncates any
# existing file as soon as this module is loaded.
f = open("out.txt", "w")

# Root URL of the word book to scrape (plain ASCII quotes: the typographic
# quotes in the published listing were a syntax error).
rootdir = "https://www.shanbay.com/wordbook/6403/"
class ShanbeiWord(unittest.TestCase):
    """Scrape the word lists of one Shanbay word book with Selenium.

    The class is a ``unittest.TestCase`` so that ``unittest.main()`` can drive
    it: ``setUp`` starts Firefox, ``test_shanbei_word`` does the scraping and
    ``tearDown`` closes the browser.

    It relies on three module-level names: ``Path`` (geckodriver executable),
    ``rootdir`` (word book URL) and ``f`` (the already-open output file).

    NOTE(review): the published listing had lost all indentation and its
    ``driver.get(`` call was cut off. The nesting below and the ``rootdir``
    argument are a reconstruction -- confirm against the original script.
    """

    def setUp(self):
        """Start Firefox through geckodriver and initialise bookkeeping."""
        # NOTE(review): ``executable_path`` is kept from the original; newer
        # Selenium releases may expect a different way of passing the driver
        # path -- confirm against the installed version.
        self.driver = webdriver.Firefox(executable_path=Path)
        # Element lookups poll for up to 30 seconds before giving up.
        self.driver.implicitly_wait(30)
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_shanbei_word(self):
        """Collect the page source of every word-list page and write it out.

        For each of the word-list links ``div[1]`` .. ``div[11]`` on the book
        page, the list is opened and its page source is recorded, then the
        ">" link is followed nine times, recording the page source before
        each click and once more after the last one. The accumulated HTML is
        reduced with two regular expressions and written to ``f``.
        """
        driver = self.driver
        # Collected with a list and joined once; the leading " " reproduces
        # the original initial value of the accumulator string.
        chunks = [" "]

        for unit in range(1, 12):
            # Go back to the book page before opening the next word list.
            driver.get(rootdir)
            unit_link = (
                "/html/body/div[3]/div/div[1]/div/div[4]/div[7]/div["
                + str(unit)
                + "]/div[1]/table/tbody/tr/td[1]/a"
            )
            driver.find_element(By.XPATH, unit_link).click()

            for page in range(1, 10):
                chunks.append(str(driver.page_source))
                driver.find_element(By.LINK_TEXT, ">").click()
                # Progress output; both counters are printed already advanced
                # by one, exactly as the original while-loops did.
                print(str(unit + 1) + "+++" + str(page + 1))

            # Source of the page reached by the final ">" click.
            chunks.append(str(driver.page_source))

        html = "".join(chunks)
        # First pass: from every line keep the tail starting at "g>".
        # Second pass: from the stringified list of those tails keep the
        # shortest ">...<" spans. The stringified result list is the output.
        html = str(re.findall(r'g>.*', html))
        html = str(re.findall(r'>.*?<', html))
        f.write(html)

    def is_element_present(self, how, what):
        """Return True if an element located by ``how``/``what`` is found.

        ``how`` is passed as the ``by`` strategy and ``what`` as the locator
        value of ``driver.find_element``.
        """
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True

    def is_alert_present(self):
        """Return True unless looking up the alert raises NoAlertPresentException."""
        try:
            # The lookup itself is the check; the alert object is not needed.
            self.driver.switch_to.alert
        except NoAlertPresentException:
            return False
        return True

    def close_alert_and_get_its_text(self):
        """Accept or dismiss the current alert and return its text.

        The alert is accepted when ``self.accept_next_alert`` is true and
        dismissed otherwise; the flag is reset to True afterwards, even if
        handling the alert raised.
        """
        try:
            alert = self.driver.switch_to.alert
            alert_text = alert.text
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert_text
        finally:
            self.accept_next_alert = True

    def tearDown(self):
        """Close the browser and fail if any verification errors were recorded."""
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)
if __name__ == "__main__":
    # Executed as a script: let the unittest runner discover and run the
    # TestCase defined in this module.
    unittest.main()
赞过:
赞 正在加载……
相关