执行的时候是这样的。执行前需先安装 ChromeDriver:将 chromedriver 可执行文件放置在 Chrome 安装目录下,并把该安装目录添加到 PATH 环境变量中。
下载地址:https://sites.google.com/a/chromium.org/chromedriver/downloads
代码:
#coding=utf-8
import codecs
import os
import re
import time
import unittest
import webbrowser

from selenium import webdriver
from selenium.webdriver.common import keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select


# unittest.TestCase is used as the harness.  Only ``testmain_`` carries the
# ``test`` prefix, so it is the single method unittest discovers and runs;
# the other methods are helpers it calls in sequence.
class PythonOrgSearch(unittest.TestCase):
    """Interactive novel downloader driven through a Chrome WebDriver session.

    The flow is: list the links found on the home page, let the user pick
    one, list the books found on that page, let the user pick one, then
    visit every chapter link of that book and append the chapter text to a
    UTF-8 ``.txt`` file under ``self.txt_path``.
    """

    def setUp(self):
        """Start Chrome and initialise the site URL, book name and output folder."""
        self.driver = webdriver.Chrome()
        self.url = 'http://www.xs222.com'
        self.book_name = ''
        self.txt_path = 'D:\\txt\\'

    def ownload_book_name(self):
        """Load the home page and return the category links found in it.

        Returns:
            A list of ``(href, label)`` tuples, one per ``<li><a ...>`` item
            matched in the page source.
        """
        driver = self.driver
        driver.get(self.url)
        return re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', driver.page_source)

    def tearDown(self):
        """End the WebDriver session.

        ``quit()`` is used instead of ``close()``: ``close()`` only closes the
        current window, while ``quit()`` also shuts down the driver process.
        """
        self.driver.quit()

    def teestui(self):
        """Print the category menu, prompt for a choice and return its URL.

        Returns:
            The site URL joined with the ``href`` of the chosen category.

        Raises:
            ValueError: if the typed choice is not an integer.
            KeyError: if the typed number is not one of the matched links.
        """
        categories = dict(enumerate(self.ownload_book_name()))
        # The first and the last matched link are left out of the menu
        # (presumably they are not book categories -- TODO confirm).
        for index in range(1, len(categories) - 1):
            print(str(index) + ':' + categories[index][1])
        print('请选择需要下载书籍种类:')
        choice = input('请输入:')
        category_url = self.url + categories[int(choice)][0]
        print(category_url)
        return category_url

    def _main(self):
        """Print the books of the chosen category and return the chosen book URL.

        Also stores the chosen book's title in ``self.book_name``; it is used
        later as the output file name.

        Returns:
            The site URL joined with the ``href`` of the chosen book.

        Raises:
            ValueError: if the typed choice is not an integer.
            KeyError: if the typed number is not a listed book number.
        """
        url = self.teestui()
        self.driver.get(url)
        content = self.driver.page_source
        matches = re.findall(
            r'<li><span class="s2">《<a href="(.*?)" target="_blank">(.*?)</a>》',
            content,
        )
        print('--------------------------------------------------------------------------------------------------------------------')
        # Books are numbered from 1 in the menu.
        books = dict(enumerate(matches, start=1))
        print(len(books))
        for number, entry in enumerate(matches, start=1):
            print(str(number) + ':' + entry[1])
        book_num = input('请选择需要下载的书籍编号:')
        href, title = books[int(book_num)]
        self.book_name = title
        print(self.book_name)
        return self.url + href

    def _down_book(self):
        """Collect the chapter titles and chapter URLs of the chosen book.

        Returns:
            A ``(titles, urls)`` pair of equally long lists; ``urls[i]`` is the
            absolute URL of the chapter named ``titles[i]``.
        """
        book_url = self._main()
        self.driver.get(book_url)
        content = self.driver.page_source
        chapters = re.findall(r'<dd><a href="(.*?)">(.*?)</a></dd>', content)
        print('---------------------------------------------------------------------------------------------------------------------------------------------')
        book_word = [title for _, title in chapters]
        book_frist = [self.url + href for href, _ in chapters]
        return book_word, book_frist

    @staticmethod
    def _clean_chapter_html(raw):
        """Turn the inner HTML of a chapter into plain text.

        Non-breaking spaces become ordinary spaces and ``<br>`` tags (in the
        ``<br>``, ``<br/>`` or ``<br />`` spelling) become newlines.
        """
        # '\xa0' must be a normal (non-raw) literal: a raw r'\xa0' is the four
        # characters backslash, x, a, 0 and never matches a real NBSP.
        text = raw.replace('\xa0', ' ')
        return re.sub(r'<br\s*/?>', '\n', text)

    def testmain_(self):
        """Download every chapter of the chosen book and append it to a file.

        The file is ``<txt_path><book_name>.txt``, opened in append mode as
        UTF-8.  Each chapter is written as its title, a blank line, the
        cleaned text and another blank line.  A chapter whose page has no
        ``<div id="content">`` block is reported on stdout and skipped.
        """
        book_word, book_frist = self._down_book()
        book_name = r'%s%s.txt' % (self.txt_path, self.book_name)
        print(book_name)

        # Make sure the output folder exists before opening the file.
        os.makedirs(self.txt_path, exist_ok=True)

        # re.S lets '.' span newlines, in case the chapter body is spread
        # over several lines of the page source.
        content_pattern = re.compile(r'<div id="content">(.*?)</div>', re.S)

        # codecs.open with an explicit UTF-8 encoding avoids encode errors on
        # characters such as '\xa0'; the with-block closes the file even if a
        # chapter fails part-way through.
        with codecs.open(book_name, 'a', 'utf-8') as f:
            for title, chapter_url in zip(book_word, book_frist):
                self.driver.get(chapter_url)
                found = content_pattern.findall(self.driver.page_source)
                print(title)
                if not found:
                    print('未找到正文,已跳过:' + chapter_url)
                    continue
                f.write(title)
                f.write('\n\n')
                f.write(self._clean_chapter_html(found[0]))
                f.write('\n\n')


if __name__ == '__main__':
    unittest.main()