爬sobooks电子书并把电子书的信息记录到mysql中-CSDN博客

本文链接：https://blog.csdn.net/chenrunhua/article/details/84679015

使用python3+selenium+chromeDriver爬取sobooks的电子书，记录书的信息（书名、作者、描述、出版时间等），并转存到自己的百度云盘

环境：安装XAMPP；下载与chrome版本对应的chromeDriver.exe并放到python安装目录下；安装python连接mysql的库：pymysql

以下为配置文件：config.json

1.百度云的账号及密码

"baidu": {
"username": "",
"password": ""
}

为百度云的账号和密码。注意：配置文件中的账号和密码需要填写经过 Base85 编码后的字符串（这只是编码，并非真正意义上的加密），使用下面的例子可以算出编码后的字符串

编码示例（Python）：

```python
>>> import base64
>>> base64.b85encode(b'username').decode()
'b#rBMZeeX@'
```

2.sobooks的提取码

"sobook_code": "2018919"

提取码需要到微信公众号获取，会不定时更新

3.mysql数据

"db": {
"host":"127.0.0.1",
"port":3306,
"user": "root",
"passwd": "",
"db_name":"test"
}

可以使用XAMPP安装并开启：启动Apache及MySQL，然后填入数据库连接相关信息即可。

4.保存到百度云的文件夹

只支持一级目录名；如需其他定制，请自行修改代码

"save_baidu_dir": "book"

{
  "debug": false,
  "baidu": {
    "username": "",
    "password": ""
  },
  "sobook_code": "2018919",
  "db": {
  	"host":"127.0.0.1",
  	"port":3306,
    "user": "root",
    "passwd": "",
    "db_name":"test"
  },
  "save_baidu_dir": ""

}

解析json配置文件：config.py

import argparse
import json
import logging
import sys
from base64 import b85decode
from pathlib import Path

# Layout of every log line: timestamp, logger name, emitting module, level, message.
log_format = '%(asctime)s %(name)s[%(module)s] %(levelname)s: %(message)s'
# Configure the root logger when this module is imported; INFO and above are emitted.
logging.basicConfig(format=log_format, level=logging.INFO)


class Config:
    """Settings for the crawler, built from the parsed ``config.json`` dict.

    Every attribute is initialised in ``__init__`` so that a partially
    invalid configuration still yields an object with all attributes
    present (previously only ``debug`` and ``baidu`` were guaranteed).
    """

    def __init__(self):
        self.debug = False
        # Plain-text Baidu credentials (``load`` decodes them from Base85).
        self.baidu = {
            'username': '',
            'password': ''
        }
        # Unlock code typed into the sobooks page to reveal the share password.
        self.sobookCode = ''
        # Connection settings; the defaults mirror the sample config.json.
        self.db = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'passwd': '',
            'dbName': 'test',
        }
        # Name of the Baidu Pan folder that files are saved into.
        self.saveBaiduDir = ''

    @classmethod
    def load(cls, d):
        """Build a configuration object from the dictionary *d*.

        Args:
            d: Mapping parsed from the JSON configuration file.

        Returns:
            A new instance. If a required key is missing or a credential is
            not valid Base85, the error is logged and the settings that were
            not reached keep their defaults.
        """
        the_config = cls()

        the_config.debug = d.get('debug', False)

        try:
            baidu = d['baidu']
            the_config.baidu = {
                'username': b85decode(baidu['username']).decode(),
                'password': b85decode(baidu['password']).decode()
            }

            the_config.sobookCode = d.get('sobook_code', "")
            db = d['db']
            the_config.db = {
                'host': db['host'],
                'port': db['port'],
                'user': db['user'],
                'passwd': db['passwd'],
                'dbName': db['db_name'],
            }
            the_config.saveBaiduDir = d.get('save_baidu_dir', "")
        except Exception as e:
            logging.error('获取配置文件出错: ' + repr(e))

        if not (the_config.baidu['username'] and the_config.baidu['password']):
            # Some page actions are still useful without credentials (moving
            # focus to the input box, scrolling to the login form, ...), so the
            # browser's auto_login step is not disabled: the form is submitted
            # automatically only when both values exist, otherwise it is just
            # auto-filled.
            logging.info('用户名/密码未找到, 自动登录功能将不可用.')

        return the_config


def load_config():
    """Locate, read and parse the JSON configuration file.

    The file name comes from the ``-c/--config`` command-line option and
    defaults to ``config.json``. When that file does not exist,
    ``config.default.json`` in the same directory is used instead.

    Returns:
        The ``Config`` instance built by ``Config.load``.

    Raises:
        SystemExit: If the file cannot be read, is not valid JSON, or its
            top-level value is not a JSON object.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='config file name')
    args = parser.parse_args()

    config_name = args.config or 'config.json'
    logging.info('使用配置文件 "{}".'.format(config_name))

    config_file = Path(config_name)
    if not config_file.exists():
        config_name = 'config.default.json'
        logging.warning('配置文件不存在, 使用默认配置文件 "{}".'.format(config_name))
        config_file = config_file.parent.joinpath(config_name)

    try:
        # Path.resolve() behaves differently on 3.5 and 3.6: on 3.5 it raises
        # when the file is missing, on 3.6 only resolve(strict=True) does and
        # strict defaults to False.
        config_file = config_file.resolve()
        # Decode explicitly instead of relying on the locale encoding, which
        # is not UTF-8 on many Windows setups; "utf-8-sig" also accepts a
        # leading byte-order mark.
        config_dict = json.loads(config_file.read_text(encoding='utf-8-sig'))
        if not isinstance(config_dict, dict):
            raise ValueError('top-level JSON value must be an object')
    except Exception as e:
        sys.exit('# 错误: 配置文件载入失败: {}'.format(e))

    return Config.load(config_dict)


# Module-level configuration object: the command line and the JSON file are
# processed once, as a side effect of importing this module.
config = load_config()

数据库文件：mysql.py

记录获取的书的相关信息：书名、作者、书的描述，书icon的url等信息

import pymysql
# id name author booktype preview tag time isbn dbgrade bookself authorself
from config import config
class Mysql(object):
    """Small pymysql wrapper that stores the crawled book records.

    Connection parameters are read from ``config.db``. Two tables are used:
    ``authorbg`` (author biographies) and ``sobook`` (one row per book, with
    a foreign key pointing at ``authorbg``).
    """

    # Columns of ``sobook`` filled straight from the book dictionary, in the
    # order expected by the INSERT statement in ``addObject``.
    _BOOK_COLUMNS = (
        "name", "author", "book_format", "preview_count", "tag",
        "publish_time", "dbpf", "isbn", "file_name", "file_size",
        "icon_url", "book_describe", "book_type",
    )

    def __init__(self):
        """Open the connection; on failure the error is printed, not raised."""
        # Define both attributes up front so they exist even when connecting
        # fails (``close`` relies on that).
        self.conn = None
        self.cur = None
        try:
            self.conn = pymysql.connect(
                host=config.db["host"],
                port=config.db["port"],
                user=config.db["user"],
                passwd=config.db["passwd"],
                db=config.db["dbName"],
                charset='utf8'
            )
        except Exception as e:
            print(e)
            print('连接失败')
        else:
            print('连接成功')
            self.cur = self.conn.cursor()

    def create_table(self):
        """Create the ``authorbg`` and ``sobook`` tables when they are missing.

        ``IF NOT EXISTS`` makes repeated runs harmless; without it the first
        statement failed on every run after the first and ``sobook`` was never
        attempted. Errors are printed rather than raised.
        """
        statements = (
            'create table if not exists authorbg(author_des_id int not null auto_increment PRIMARY KEY,author varchar(255), author_describe text) DEFAULT CHARSET=utf8 ',
            'create table if not exists sobook(book_id int not null auto_increment PRIMARY KEY, name varchar(255),author varchar(255),book_format varchar(255),preview_count varchar(255),tag varchar(255),publish_time varchar(255),dbpf varchar(255),isbn varchar(255),file_name varchar(255), file_size varchar(255),icon_url varchar(255), book_describe text, book_type varchar(255),author_des_id int,FOREIGN KEY (author_des_id) REFERENCES authorbg(author_des_id)) DEFAULT CHARSET=utf8 ',
        )
        try:
            for sql in statements:
                res = self.cur.execute(sql)
                print(res)
        except Exception as e:
            print("Exception:", e)

    def close(self):
        """Close the cursor and the connection if they were opened."""
        if self.cur is not None:
            self.cur.close()
        if self.conn is not None:
            self.conn.close()

    def addObject(self, bookInfo):  # create
        """Insert one book row, creating or reusing its author row.

        Args:
            bookInfo: Dictionary holding the ``sobook`` column values plus
                ``author_describe``. Missing keys are stored as NULL instead
                of raising ``KeyError``.
        """
        describeAuthorId = self.addAuthorBg(bookInfo.get("author"), bookInfo.get("author_describe"))
        sql = "INSERT INTO sobook (name,author,book_format,preview_count,tag,publish_time,dbpf,isbn,file_name,file_size,icon_url, book_describe,book_type,author_des_id) VALUES (%s, %s,%s, %s, %s,%s,%s, %s, %s, %s,%s,%s,%s,%s)"
        # The author id is passed as-is: None becomes NULL rather than the
        # literal string 'None'.
        val = tuple(bookInfo.get(column) for column in self._BOOK_COLUMNS) + (describeAuthorId,)
        res = self.cur.execute(sql, val)
        if res:
            self.conn.commit()
            print("sucess:")
        else:
            print("error:")
            self.conn.rollback()
        print(res)

    def addAuthorBg(self, author, author_describe):
        """Return the id of the author row matching this author and biography.

        Biographies are compared after removing spaces and line breaks. When
        no stored row matches, a new one is inserted.

        Returns:
            The ``author_des_id`` of the matching or new row, or ``None`` when
            there is neither an author nor a biography to store, or when the
            insert affected no row.
        """
        if not author and not author_describe:
            # Nothing to store; avoid piling up empty author rows.
            return None

        preInsertStr = self.dealWithString(author_describe)

        sql = "select * FROM authorbg WHERE author = %s"
        self.cur.execute(sql, (author,))
        for item in self.cur.fetchall():
            print(item)
            # item[0] is author_des_id, item[2] is author_describe.
            if self.dealWithString(item[2]) == preInsertStr:
                return item[0]

        return self.insertAuthorBg(author, author_describe)

    def dealWithString(self, author_describe):
        """Return *author_describe* without spaces, ``\\n`` and ``\\r``.

        ``None`` is treated as an empty string.
        """
        if author_describe is None:
            return ""
        insertKey = author_describe.replace(" ", "")
        insertKey = insertKey.replace("\n", "")
        insertKey = insertKey.replace("\r", "")
        return insertKey

    def insertAuthorBg(self, author, author_describe):
        """Insert an author row and return its primary key.

        Returns:
            The new ``author_des_id``, or ``None`` when no row was inserted.
        """
        sql = "INSERT INTO authorbg (author, author_describe) VALUES (%s, %s)"
        val = (author, author_describe)
        # Initialised so the return below cannot hit an unbound local when the
        # insert affects no row.
        describeId = None
        res = self.cur.execute(sql, val)
        if res:
            describeId = int(self.conn.insert_id())  # primary key of the row just inserted
            self.conn.commit()
        else:
            self.conn.rollback()
        print(res)
        return describeId

    def rem(self, name):  # delete
        """Delete every book whose ``name`` equals *name*."""
        sql = "DELETE FROM sobook WHERE name = %s"
        res = self.cur.execute(sql, (name,))
        if res:
            self.conn.commit()
        else:
            self.conn.rollback()
        print(res)

    def mod(self, book_id=2, name="Tom Ding"):  # update
        """Rename the book with primary key *book_id*.

        The defaults reproduce the values that used to be hard-coded. The
        filter uses ``book_id``, the actual primary-key column of ``sobook``
        (the previous statement referenced a column ``id`` that the table
        does not define).
        """
        sql = 'update sobook set name=%s where book_id=%s'
        res = self.cur.execute(sql, (name, book_id))
        if res:
            self.conn.commit()
        else:
            self.conn.rollback()
        print(res)

    def show(self):  # read
        """Print every row of ``sobook``."""
        sql = 'select * from sobook'
        self.cur.execute(sql)
        for row in self.cur.fetchall():
            print(row)

if __name__ == "__main__":
    # Manual smoke test: running this module directly only opens a connection;
    # the constructor prints whether connecting worked.
    mysql = Mysql()
    # The calls below are disabled examples kept for manual experiments.
    #mysql.create_table()
    # mysql.add("book6","author4","http://www.baidu.com","6","author6 xx xx xxx ","haha")
    # mysql.show()
    # mysql.close()

爬sobook网站，并记录到数据库：sobooks.py

# 导入selenium的浏览器驱动接口
from selenium import webdriver
import  time
import json
import time
import datetime
from mysql import Mysql
from selenium.webdriver.common.action_chains import ActionChains #引入ActionChains鼠标操作类
from selenium.webdriver.common.keys import Keys #引入keys类操作
#import sys
# 要想调用键盘按键操作需要引入keys包
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
# 导入chrome选项
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from config import config
# Shared database handle; the connection attempt happens when this module is loaded.
mysql = Mysql()
# Slug of a category page.
# NOTE(review): not referenced anywhere else in the visible code — confirm before removing.
curPage = "lishizhuanji"
# Record of the book currently being scraped; several functions below fill it
# in and it is finally passed to mysql.addObject.
bookInfoStruct = {}
def main():
    """Crawl every configured sobooks category with headless Chrome.

    The tables are created first, then each category listing is handed to
    ``requestCurrentPage``. An exception stops the crawl and is printed. The
    browser and the database connection are released in all cases.
    """
    mysql.create_table()
    chrome_options = Options()
    for argument in ('--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu',
                     '--headless', 'log-level=3'):
        chrome_options.add_argument(argument)
    driver = webdriver.Chrome(options=chrome_options)

    listPage = ["lishizhuanji", "xiaoshuowenxue", "renwensheke", "lizhichenggong",
                "jingjiguanli", "xuexijiaoyu", "shenghuoshishang", "yingwenyuanban"]
    # Display name stored as ``book_type`` for each category slug. The entries
    # for "renwensheke" and "jingjiguanli" were missing, which raised KeyError
    # and ended the crawl at the third category.
    # NOTE(review): "小说人文" for "xiaoshuowenxue" looks like it should be
    # "小说文学"; left unchanged because it is stored in existing rows.
    listPageTx = {
        "lishizhuanji": "历史传记",
        "xiaoshuowenxue": "小说人文",
        "renwensheke": "人文社科",
        "lizhichenggong": "励志成功",
        "jingjiguanli": "经济管理",
        "xuexijiaoyu": "学习教育",
        "shenghuoshishang": "生活时尚",
        "yingwenyuanban": "英文原版",
    }
    try:
        for pageUrl in listPage:
            # Fall back to the slug itself if a category is ever added to
            # listPage without a display name.
            bookInfoStruct["book_type"] = listPageTx.get(pageUrl, pageUrl)
            requestCurrentPage(driver, "https://sobooks.cc/" + pageUrl)
    except Exception as e:
        print('Got an error ', e)
    else:
        print("sucess...ok..end.....")
    finally:
        # Release resources even on KeyboardInterrupt and similar.
        driver.quit()
        mysql.close()

# Crawl the paginated listing of one category.
def requestCurrentPage(driver, url, maxPages=99):
    """Visit the listing pages of one category and process every book card.

    Args:
        driver: Selenium WebDriver used for navigation.
        url: Category URL; page ``x`` (for ``x > 1``) is ``url + "/page/x"``.
        maxPages: Upper bound on the number of listing pages visited. The
            default keeps the previous hard-coded limit.

    The crawl of the category stops when the site redirects to its home page
    or when a page has no ``cardslist`` element.
    """
    for x in range(1, maxPages + 1):
        pageUrl = url if x == 1 else url + "/page/" + str(x)

        print(" --------------------------------------------------------" )
        print("current url:" + pageUrl)
        print(" --------------------------------------------------------" )
        driver.get(pageUrl)

        time.sleep(10)

        # Being sent back to the home page is treated as "no more pages".
        if driver.current_url == "https://sobooks.cc/":
            print(" " + pageUrl + " end page:" + str(x))
            print(" ---------------------page--end-----------------------------------" )
            return

        # find_elements returns an empty list instead of raising, so the
        # "no cardslist" branch can actually be taken.
        if not driver.find_elements(By.ID, "cardslist"):
            print("no cardslist")
            return
        print("have cardslist")

        normal_window = driver.current_window_handle

        itemList = driver.find_elements(By.CSS_SELECTOR, "[class='card col span_1_of_4']")
        for index, item in enumerate(itemList, start=1):
            print("------------ index:" + str(index) + "------------" )
            bookdetailPage(driver, item.find_element(By.XPATH, ".//h3/a[@href]").get_attribute('href'))
            # Return to the listing tab before handling the next card.
            driver.switch_to.window(normal_window)
            time.sleep(2)


# Scrape the detail page of a single book.
def bookdetailPage(driver, url):
    """Open *url* in a new tab, collect the book's data and save the book.

    The scraped values are written into the module-level ``bookInfoStruct``.
    ``getKey`` and ``jumToBaiDuYun`` are then called to fetch the share
    password and store the file and the record. Finally the detail tab is
    closed.

    Args:
        driver: Selenium WebDriver; the listing tab is expected to be the only
            open tab when this is called.
        url: Address of the book's detail page.
    """
    # Pass the URL as a script argument so that quotes in it cannot break (or
    # inject into) the JavaScript source.
    driver.execute_script('window.open(arguments[0])', url)  # open a new tab
    time.sleep(3)
    driver.switch_to.window(driver.window_handles[1])
    cur_window = driver.current_window_handle
    time.sleep(2)
    print(driver.current_url)
    # find_element raises when the container is missing, so reaching the next
    # line means it exists.
    driver.find_element(By.XPATH, "//div[@class='book-left']")
    print("have detailElement")

    bookpic = driver.find_element(By.XPATH, "//div[@class='bookpic']")
    bookInfoStruct["icon_url"] = bookpic.find_element(By.TAG_NAME, "img").get_attribute("src")

    # Label shown on the page -> key used in bookInfoStruct.
    fieldByLabel = {
        "书名": "name",
        "作者": "author",
        "格式": "book_format",
        "浏览": "preview_count",
        "标签": "tag",
        "时间": "publish_time",
        "ISBN": "isbn",
    }
    bookinfo = driver.find_element(By.XPATH, "//div[@class='bookinfo']")
    for item in bookinfo.find_elements(By.TAG_NAME, "li"):
        print("------------")
        itemText = item.text
        if "评分：" == itemText:
            # The rating has no text; it is carried by the class of the <b> tag.
            grade = item.find_element(By.TAG_NAME, "b").get_attribute("class")
            print("" + grade)
            bookInfoStruct["dbpf"] = grade.replace("dbpf", "").strip()
            continue
        label, separator, value = itemText.partition('：')
        # Entries without the separator are skipped instead of raising IndexError.
        if separator and label in fieldByLabel:
            bookInfoStruct[fieldByLabel[label]] = value.strip()
        print("" + itemText)

    # Book description and author biography.
    print("------content------")
    contentList = driver.find_elements(By.XPATH, "//article[@class='article-content']/*")
    findContentTag = False
    findAuthorTag = False
    content = ""
    for el in contentList:
        elText = el.text
        if elText == "内容简介":
            findContentTag = True
        elif elText == "作者简介":
            # Flush the description collected so far, then start the biography.
            bookInfoStruct["book_describe"] = content
            print("\n内容简介:\n" + content)
            content = ""
            findContentTag = False
            findAuthorTag = True
        elif findAuthorTag and elText == "":
            # An empty element ends the biography.
            print("\n作者简介:\n" + content)
            bookInfoStruct["author_describe"] = content
            content = ""
            findAuthorTag = False
            print("end...")
            break
        elif findAuthorTag or findContentTag:
            content = content + "\n" + elText
    # The article may end without a closing empty element, or lack an author
    # section; keep whatever was being collected.
    if findAuthorTag:
        bookInfoStruct["author_describe"] = content
    elif findContentTag:
        bookInfoStruct["book_describe"] = content

    # File name and size.
    tabList = driver.find_elements(By.XPATH, "//table[@class='dltable']/*/*/*")
    if len(tabList) > 2:
        for field, cell in (("file_name", tabList[1]), ("file_size", tabList[2])):
            cellText = cell.text
            label, separator, value = cellText.partition('：')
            # Store text in both cases (the element object itself used to be
            # stored when the separator was missing).
            bookInfoStruct[field] = value if separator else cellText
            print("" + cellText)

    # Make sure every text field exists so that storing the record does not
    # fail on a page that lacks one of them.
    for field in ("name", "author", "book_format", "preview_count", "tag",
                  "publish_time", "dbpf", "isbn", "file_name", "file_size",
                  "book_describe", "author_describe"):
        bookInfoStruct.setdefault(field, "")

    key = getKey(driver)
    jumToBaiDuYun(driver, key)
    driver.switch_to.window(cur_window)
    time.sleep(2)
    driver.close()

def getKey(driver):
    """Submit the sobooks unlock code and return the revealed share password.

    Args:
        driver: Selenium WebDriver positioned on a book detail page.

    Returns:
        The text of the ``e-secret`` element without its "提取密码：" prefix
        and surrounding whitespace.

    Raises:
        NameError: If the resulting text is not purely alphanumeric, which is
            taken to mean that ``sobook_code`` in the configuration is out of
            date.
    """
    driver.find_element(By.XPATH, "//input[@class='euc-y-i']").send_keys(config.sobookCode)
    driver.find_element(By.XPATH, "//input[@class='euc-y-s']").click()
    time.sleep(3)

    # until() returns the located element, so no second lookup is needed.
    secretElement = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='e-secret']")))
    # Strip stray whitespace so it cannot fail the alphanumeric check below.
    keyword = secretElement.text.replace("提取密码：", "").strip()

    print("提取码:" + keyword)
    if not keyword.isalnum():
        raise NameError('sobooks 的提取密码已改，到sobooks微信公众号获取并更新配置文件config.json的sobook_code字段')

    return keyword

def jumToBaiDuYun(driver, key):
    """Open the book's Baidu Pan share, save the file and store the record.

    Steps: follow the "百度网盘" link in a new tab, enter the share password
    if the page asks for one, open the shared folder when there is one, save
    the file through ``save2Account``, write ``bookInfoStruct`` to the
    database, reset it (keeping ``book_type``) and close the Baidu tab.

    Args:
        driver: Selenium WebDriver positioned on a book detail page.
        key: Share password obtained from ``getKey``.
    """
    global bookInfoStruct

    href = driver.find_element(By.XPATH, "//a[contains(text(), '百度网盘')]").get_attribute("href")
    # The real target follows the first "=" of the link. Split only once so a
    # target that itself contains "=" is not truncated; a link without "=" is
    # used unchanged.
    _, separator, target = href.partition('=')
    url = target if separator else href
    # Pass the URL as a script argument so quotes in it cannot break the script.
    driver.execute_script('window.open(arguments[0])', url)
    time.sleep(2)
    driver.switch_to.window(driver.window_handles[2])
    try:
        codeInput = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@id='jgddmad']")))
        codeInput.send_keys(key)
        driver.find_element(By.XPATH, "//a[@title='提取文件']").click()
    except Exception:
        # Best effort: no password prompt appeared within the timeout.
        print("不需要提取码")

    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='module-share-header']")))
    try:
        driver.find_element(By.XPATH, "//div[@class='KPDwCE']")
        onClickDir(driver)
    except Exception:
        # The share holds a single file, so there is no folder to open.
        print("only have one book !!!!!")

    save2Account(driver)
    mysql.addObject(bookInfoStruct)
    # Start a fresh record for the next book of the same category.
    bookInfoStruct = {"book_type": bookInfoStruct["book_type"]}
    driver.close()

def onClickDir(driver):
    """Click through a shared folder on the Baidu Pan share page.

    Clicks the first ``a.filename`` link, waits for the ``ul`` with class
    ``FuIxtL`` to be present and then clicks the first ``dd`` element with
    class ``g-clearfix AuPKyz``.
    NOTE(review): presumably this opens the folder and selects its first
    entry; the class names are opaque, so confirm against the live page.
    """
    driver.find_element(By.XPATH, "//a[@class='filename']").click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//ul[@class='FuIxtL']")))
    time.sleep(3)
    driver.find_element(By.XPATH, "//dd[@class='g-clearfix AuPKyz']").click()
    time.sleep(2)

def save2Account(driver):
    """Save the currently shown share into the configured Baidu Pan folder.

    Clicks "保存到网盘". If the folder dialog does not appear within the
    timeout, ``loginBaidu`` is called first. The top-level folder named
    ``config.saveBaiduDir`` is then selected when present, and the dialog is
    confirmed.
    """
    driver.find_element(By.XPATH, "//a[@title='保存到网盘']").click()

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@id='fileTreeDialog']")))
    except Exception:
        # No folder dialog: assume a login form is shown instead.
        loginBaidu(driver)
    time.sleep(5)

    dianlog = driver.find_element(By.XPATH, "//div[@id='fileTreeDialog']")
    listEl = dianlog.find_elements(
        By.XPATH, ".//ul[@class='treeview treeview-root-content treeview-content ']/*")
    for el in listEl:
        fileName = el.find_element(By.XPATH, ".//span[@class='treeview-txt']").text
        if fileName == config.saveBaiduDir:
            print("find save dir name:" + fileName)
            el.click()
            break
    else:
        # Make the fallback visible: the dialog's current selection is used.
        print("save dir not found, using the dialog's current selection: " + str(config.saveBaiduDir))

    time.sleep(3)
    # This XPath starts with "//", so it searches the whole document rather
    # than only the dialog; kept as in the original flow.
    dianlog.find_element(By.XPATH, "//a[@title='确定']").click()
    time.sleep(3)

def loginBaidu(driver):
    """Log in to Baidu with the user name and password from the configuration.

    Switches the login box to the user-name form, fills in both fields and
    submits the form.
    """
    driver.find_element(By.XPATH, "//p[@id='TANGRAM__PSP_10__footerULoginBtn']").click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='TANGRAM__PSP_10__userName']")))
    # config.baidu is a dict, so the credentials must be read by key
    # (attribute access raised AttributeError).
    driver.find_element(By.XPATH, "//input[@id='TANGRAM__PSP_10__userName']").send_keys(config.baidu['username'])
    time.sleep(1)
    driver.find_element(By.XPATH, "//input[@id='TANGRAM__PSP_10__password']").send_keys(config.baidu['password'])
    time.sleep(1)
    driver.find_element(By.XPATH, "//input[@id='TANGRAM__PSP_10__submit']").click()
    time.sleep(5)

def contains():
    """Placeholder: takes no arguments, does nothing and returns ``None``."""
    return None

def containVarInString(containVar, stringVar):
    """Tell whether *stringVar* is a string that contains *containVar*.

    Args:
        containVar: Substring to look for.
        stringVar: Value to search in.

    Returns:
        ``True`` when *stringVar* is a ``str`` containing *containVar*;
        ``False`` otherwise, including when either argument is not a string.
    """
    if not isinstance(stringVar, str):
        return False
    try:
        return containVar in stringVar
    except TypeError:
        # A non-string needle cannot be contained in a string.
        return False
# Start the crawl only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()

启动XAMPP后，运行 python sobooks.py 即可