优书网是一个老白常用的第三方小说点评网站。
首先爬取优书网 -> 书库。
通过书库翻页来获得书籍相关信息。
def get_url():
    """Build the list of book-store listing URLs that still have to be crawled.

    Requests page 1 of the listing, reads the total number of books from the
    data returned by ``xpathnode`` (``Bookstore`` -> ``total``) and derives
    the number of listing pages at 20 books per page.

    Returns:
        tuple: ``(pages, urls)`` where ``pages`` is the page count (int) and
        ``urls`` is the list of listing URLs for pages ``1`` .. ``pages``.

    Raises:
        requests.RequestException: propagated from ``requests.get`` when the
            request fails or exceeds the timeout.
    """
    base_url = "http://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page="
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    books_per_page = 20
    request_timeout = 30  # seconds

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(base_url + "1", headers=headers, timeout=request_timeout)
    response.encoding = "UTF-8"

    store_info = xpathnode(response).get('Bookstore')
    total_books = store_info.get('total')

    # Convert to float *before* dividing so the round-up never depends on how
    # ``/`` treats integers, and force an int so the result can feed range().
    pages = int(math.ceil(float(total_books) / books_per_page))

    # Listing pages are 1-based.
    urls = [base_url + str(page_no) for page_no in range(1, pages + 1)]
    return pages, urls
def xpathnode(html):
    """Extract and decode the JSON embedded in the page's first script text node.

    Args:
        html: response-like object whose ``text`` attribute holds the page
            markup.

    Returns:
        The object produced by ``json.loads`` on the trimmed script text.

    NOTE(review): the fixed offsets below presumably strip a 25-character
    assignment prefix (e.g. ``window.__INITIAL_STATE__=``) and a
    122-character trailing script snippet -- confirm against the live page,
    since any change in the page template breaks them.
    """
    leading_chars = 25
    trailing_chars = 122

    script_texts = etree.HTML(html.text).xpath('//script/text()')
    # Only the first script text node is used.
    payload = script_texts[0][leading_chars:-trailing_chars]
    return json.loads(payload)
def crawl(): #the core
pages,url_combine = get_url()
conn = conn_sql()
create_tab(conn)
cursor = conn.cursor()
flag = 0
for url in url_combine: #page turning
flag = flag+1
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
html = requests.get(url,headers=headers)
html.encoding = "UTF-8"
book_js_info = xpathnode(html)
book_js_info = book_js_info.get('Bookstore')
book_js_info = book_js_info.get('books')
print('rate of progress:'+str(round(flag*100/pages,2))+'%') #rate of progress
for i in range(20):