Python 爬取小说点评网站,用大数据方法找小说

优书网是一个老白常用的第三方小说点评网站
首先爬取优书网–>书库
通过书库翻页来获得书籍相关信息

def get_url():
    url = "http://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page="
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    html = requests.get(url+"1",headers=headers)
    html.encoding = "UTF-8"
    js_info = xpathnode(html)
    
    js_info = js_info.get('Bookstore')
    account_info = js_info.get('total')
    pages = math.ceil(float(account_info/20))  #get the upper integer
    url = [url+str(i+1) for i in range(pages)]    #this is the array of waited crawl url ,just return to another block
    return pages,url

def xpathnode(html):            #return the structure of json data
    tree = etree.HTML(html.text)
    node = tree.xpath('//script/text()')   #get the account of books
    info = node[0][25:-122]
    js_info = json.loads(info)
    return js_info

def crawl():    #the core
    pages,url_combine = get_url()
    conn = conn_sql()
    create_tab(conn)
    cursor = conn.cursor()
    flag = 0
    for url in url_combine:       #page turning
        flag  = flag+1
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
        html = requests.get(url,headers=headers)
        html.encoding = "UTF-8"
        book_js_info = xpathnode(html)
        book_js_info = book_js_info.get('Bookstore')
        book_js_info = book_js_info.get('books')
        print('rate of progress:'+str(round(flag*100/pages,2))+'%')   #rate of progress
        for i in range(20):       #scanning the page
            try:
                book = book_js_info[i]
                dt = {'bookId':book.get('bookId'),'title':book.get('title'),'score':book.get('score'),'scorerCount':book.get('scorerCount'),
                        'author':book.get('author'),'countWord':str(round(book.get('countWord')/10000,2)),\
                        'tags':str(book.get('tags')).translate(str.maketrans('', '', '\'')),\
                        'updateAt':book.get('updateAt')[:10]}
                store_to_sql(dt,conn,cursor)
            except:
                print('erro')
    cursor.close()
    conn.close()

存入SQL server数据库:(使用时可更改自己数据库参数的设置)

def store_to_sql(dt,connect,cursor):    #insert or just change the information
    tbname = '['+record+']'
    ls = [(k,v) for k,v in dt.items() if k is not None]
    sentence = 'IF NOT EXISTS ( SELECT  * FROM '+tbname+' WHERE bookId = '+str(ls[0][1])+') \
                INSERT INTO %s (' % tbname +','.join([i[0] for i in ls]) +') VALUES (' + ','.join(repr(i[1]) for i in ls) + ');'
    cursor.execute((sentence))
    connect.commit()
    return ""

def create_tab(conn):    #create table(if not exists)
    cursor = conn.cursor()
    sentence = 'if not exists (select * from sysobjects where id = object_id('+record+')and OBJECTPROPERTY(id, \'IsUserTable\') = 1) \
    CREATE TABLE\"'+record+'\"\
    (NUM int IDENTITY(1,1),\
    bookId INT NOT NULL,\
    title VARCHAR(100),\
    score VARCHAR(100),\
    scorerCount float,\
    author VARCHAR(50),\
    countWord float ,\
    tags VARCHAR(100),\
    updateAt date) '
    cursor.execute(sentence)
    conn.commit()
    cursor.close()

def conn_sql():
    server = "127.0.0.1"
    user = "sa"
    password = "123456"
    conn = pymssql.connect(server, user, password, "novel")
    return conn

爬取结果:
在这里插入图片描述

注意,优书网更新后限制了爬虫,需要增加一个sleep()人为每次爬取的增加时间间隔。

全部代码:

# -- coding: gbk --
import json
import requests
import csv
import numpy
import json
import math
from lxml import etree
import pymssql 
import datetime

#爬取数据并存入sql数据库
record = str(datetime.date.today()).translate(str.maketrans('','','-'))  #date of today
def get_url():
    url = "http://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page="
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    html = requests.get(url+"1",headers=headers)
    html.encoding = "UTF-8"
    js_info = xpathnode(html)
    
    js_info = js_info.get('Bookstore')
    account_info = js_info.get('total')
    pages = math.ceil(float(account_info/20))  #get the upper integer
    url = [url+str(i+1) for i in range(pages)]    #this is the array of waited crawl url ,just return to another block
    return pages,url

def xpathnode(html):            #return the structure of json data
    tree = etree.HTML(html.text)
    node = tree.xpath('//script/text()')   #get the account of books
    info = node[0][25:-122]
    js_info = json.loads(info)
    return js_info

def crawl():    #the core
    pages,url_combine = get_url()
    conn = conn_sql()
    create_tab(conn)
    cursor = conn.cursor()
    flag = 0
    for url in url_combine:       #page turning
        flag  = flag+1
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
        html = requests.get(url,headers=headers)
        html.encoding = "UTF-8"
        book_js_info = xpathnode(html)
        book_js_info = book_js_info.get('Bookstore')
        book_js_info = book_js_info.get('books')
        print('rate of progress:'+str(round(flag*100/pages,2))+'%')   #rate of progress
        for i in range(20):       #scanning the page
            try:
                book = book_js_info[i]
                dt = {'bookId':book.get('bookId'),'title':book.get('title'),'score':book.get('score'),'scorerCount':book.get('scorerCount'),
                        'author':book.get('author'),'countWord':str(round(book.get('countWord')/10000,2)),\
                        'tags':str(book.get('tags')).translate(str.maketrans('', '', '\'')),\
                        'updateAt':book.get('updateAt')[:10]}
                store_to_sql(dt,conn,cursor)
            except:
                print('erro')
    cursor.close()
    conn.close()
            
def store_to_sql(dt,connect,cursor):    #insert or just change the information
    tbname = '['+record+']'
    ls = [(k,v) for k,v in dt.items() if k is not None]
    sentence = 'IF NOT EXISTS ( SELECT  * FROM '+tbname+' WHERE bookId = '+str(ls[0][1])+') \
                INSERT INTO %s (' % tbname +','.join([i[0] for i in ls]) +') VALUES (' + ','.join(repr(i[1]) for i in ls) + ');'
    cursor.execute((sentence))
    connect.commit()
    return ""

def create_tab(conn):    #create table(if not exists)
    cursor = conn.cursor()
    sentence = 'if not exists (select * from sysobjects where id = object_id('+record+')and OBJECTPROPERTY(id, \'IsUserTable\') = 1) \
    CREATE TABLE\"'+record+'\"\
    (NUM int IDENTITY(1,1),\
    bookId INT NOT NULL,\
    title VARCHAR(100),\
    score VARCHAR(100),\
    scorerCount float,\
    author VARCHAR(50),\
    countWord float ,\
    tags VARCHAR(100),\
    updateAt date) '
    cursor.execute(sentence)
    conn.commit()
    cursor.close()

def conn_sql():
    server = "127.0.0.1"
    user = "sa"
    password = "123456"
    conn = pymssql.connect(server, user, password, "novel")
    return conn



if __name__ == '__main__':
    crawl()

©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页