使用selenium做一个小说下载器爬虫

执行的时候是这样的。执行前需先安装 ChromeDriver:将其放在 Chrome 的安装目录下,并把该安装目录添加到 PATH 环境变量中。

下载地址:https://sites.google.com/a/chromium.org/chromedriver/downloads




代码:


#coding=utf-8
from selenium import webdriver
from selenium.webdriver.common import keys
import unittest,time,re
import codecs
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
import webbrowser

# Uses unittest's TestCase class as the framework that drives the downloader.
class PythonOrgSearch(unittest.TestCase):
    """Interactive novel downloader driven by a Selenium-controlled Chrome.

    The workflow is a chain of calls: ``testmain_`` -> ``_down_book`` ->
    ``_main`` -> ``teestui`` -> ``ownload_book_name``.  Of these, only
    ``testmain_`` starts with unittest's default ``test`` prefix, so it is
    the method ``unittest.main()`` runs.  The user picks a category and a
    book on stdin; every chapter is then appended to
    ``<txt_path><book_name>.txt``.
    """

    def setUp(self):
        """Start Chrome and initialise the site root, book title and output dir."""
        self.driver = webdriver.Chrome()
        self.url = 'http://www.xs222.com'
        self.book_name = ''
        self.txt_path = 'D:\\txt\\'

    def ownload_book_name(self):
        """Fetch the home page and return its category links.

        Returns:
            A list of ``(href, label)`` tuples, one per
            ``<li><a href="...">...</a></li>`` item found in the page source.
        """
        driver = self.driver
        driver.get(self.url)
        content = driver.page_source
        return re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', content)

    def tearDown(self):
        """Shut the browser down after the run."""
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the driver process running.
        self.driver.quit()

    def teestui(self):
        """Show the book categories and return the URL of the chosen one.

        Categories are numbered from 0 in page order, but only entries
        ``1 .. len-2`` are printed; the first and last links are not listed
        (presumably navigation links rather than categories - not confirmed).

        Returns:
            The site root joined with the href of the category whose number
            was typed on stdin.

        Raises:
            ValueError: if the typed text is not an integer.
            KeyError: if the number does not correspond to any link.
        """
        book_frist = {
            index: [href, label]
            for index, (href, label) in enumerate(self.ownload_book_name())
        }
        for index in range(1, len(book_frist) - 1):
            print(str(index) + ':' + book_frist[index][1])
        print('请选择需要下载书籍种类:')
        choice = input('请输入:')

        url1 = self.url + book_frist[int(choice)][0]
        print(url1)
        return url1

    def _main(self):
        """List the books of the chosen category and return the chosen book URL.

        Books are numbered from 1.  As a side effect the selected title is
        stored in ``self.book_name`` for later use as the output file name.

        Returns:
            The site root joined with the href of the selected book.

        Raises:
            ValueError: if the typed text is not an integer.
            KeyError: if the number does not correspond to any listed book.
        """
        url = self.teestui()
        self.driver.get(url)
        content = self.driver.page_source
        patt = re.findall(
            r'<li><span class="s2">《<a href="(.*?)" target="_blank">(.*?)</a>》',
            content,
        )
        print('--------------------------------------------------------------------------------------------------------------------')
        book = {
            number: [href, title]
            for number, (href, title) in enumerate(patt, start=1)
        }
        print(len(book))
        for number in range(1, len(book) + 1):
            print(str(number) + ':' + book[number][1])

        book_num = input('请选择需要下载的书籍编号:')
        book_href, self.book_name = book[int(book_num)]
        print(self.book_name)

        return self.url + book_href

    def _down_book(self):
        """Collect the chapter titles and chapter URLs of the chosen book.

        Returns:
            A ``(titles, urls)`` pair of equally long lists, in page order;
            each URL is the site root joined with the chapter's href.
        """
        book_url = self._main()
        self.driver.get(book_url)
        content = self.driver.page_source
        ptt = re.findall(r'<dd><a href="(.*?)">(.*?)</a></dd>', content)
        print('---------------------------------------------------------------------------------------------------------------------------------------------')

        book_frist = [self.url + href for href, _ in ptt]
        book_word = [title for _, title in ptt]
        return book_word, book_frist

    def testmain_(self):
        """Download every chapter of the chosen book and append it to a file.

        For each chapter the title, a blank line, the chapter text and another
        blank line are appended to ``<txt_path><book_name>.txt`` (UTF-8).
        A chapter whose page has no ``<div id="content">`` block is written
        with its title and an empty body instead of aborting the download.
        """
        import os  # local import: only this method touches the filesystem layout

        book_word, book_frist = self._down_book()

        # Make sure the target directory exists before opening the file.
        if self.txt_path:
            os.makedirs(self.txt_path, exist_ok=True)
        book_name = r'%s%s.txt' % (self.txt_path, self.book_name)
        print('' + book_name)

        # re.S lets '.' span line breaks, so multi-line chapter bodies match.
        content_re = re.compile(r'<div id="content">(.*?)</div>', re.S)
        # Accept <br>, <br/> and <br /> alike.
        line_break_re = re.compile(r'<br\s*/?>')

        # codecs.open() is used because saving text containing '\xa0' failed
        # with the plain file API; the with-block closes the file on errors too.
        with codecs.open(book_name, 'a', 'utf-8') as f:
            for title, chapter_url in zip(book_word, book_frist):
                self.driver.get(chapter_url)
                pattem = content_re.findall(self.driver.page_source)
                word = pattem[0] if pattem else ''
                # Non-breaking spaces may arrive as the raw character or as
                # the HTML entity; turn both into ordinary spaces.
                word = word.replace('\xa0', ' ').replace('&nbsp;', ' ')
                word = line_break_re.sub('\n', word)
                print(title)
                f.write(title)
                f.write('\n\n')
                f.write(word)
                f.write('\n\n')

# When executed as a script, hand control to unittest, which discovers and
# runs the test methods of the TestCase defined in this file.
if __name__ == '__main__':
    unittest.main()


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值