from bs4 import BeautifulSoup
from lxml import etree
import requests
import time
import os
# Scrape the Douban "Top 250 books" listing with XPath and append one text
# line per book to OUTPUT_PATH.

# Listing URL; the placeholder is the zero-based offset of the first book
# shown on the requested page.
PAGE_URL = 'https://book.douban.com/top250?start={}'
TOTAL_BOOKS = 250
BOOKS_PER_PAGE = 25

OUTPUT_PATH = 'book_info.txt'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever
PAGE_DELAY = 2        # seconds to pause between two page requests

# Browser-like User-Agent sent with every request.
# NOTE(review): presumably needed so the site does not reject the default
# python-requests client -- confirm.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
}

# XPath expressions. TABLE_XPATH is evaluated against the page root; every
# matched <table> is treated as one book entry and the remaining expressions
# are evaluated relative to that table.
TABLE_XPATH = '// *[ @ id = "content"] / div / div[1] / div / table'
TITLE_XPATH = './ tr / td[2] / div[1] / a / @title'
SUBTITLE_XPATH = './ tr / td[2] / div[1] / a / span/text()'
SCORE_XPATH = './tr/td[2]/div[2]/span[2]/text()'
RATING_COUNT_XPATH = './tr/td[2]/div[2]/span[3]/text()'
QUOTE_XPATH = './tr / td[2] / p[2] / span/text()'
PUBLISH_XPATH = './ tr / td[2] / p[1]/text()'


def first_or_default(values, default=None):
    """Return the first item of *values*, or *default* when it is empty.

    XPath queries return a (possibly empty) list; this replaces the
    ``try: values[0] / except:`` pattern without swallowing unrelated errors.
    """
    return values[0] if values else default


def clean_rating_count(raw):
    """Strip surrounding whitespace and parentheses from a rating-count text.

    Whitespace is removed first so that parentheses are stripped even when
    the text node has blanks or newlines outside them.
    """
    return raw.strip().strip('()').strip()


def format_record(rank, title, score, evale, publish, word):
    """Return the output line (newline-terminated) for a single book."""
    return "{}: {} / {}/ {} / {} / {} \n".format(
        rank, title, score, evale, publish, word)


def parse_book(table):
    """Extract the fields of one book from its ``<table>`` element.

    Args:
        table: an element exposing ``xpath(expr)`` that returns a list.

    Returns:
        A ``(title, score, evale, publish, word)`` tuple, or ``None`` when the
        table has no title attribute and therefore is not a usable entry.
        Any field other than the title is ``None`` when it is missing.
    """
    main_title = first_or_default(table.xpath(TITLE_XPATH))
    if main_title is None:
        return None
    # The optional <span> inside the title link is appended verbatim.
    title = main_title + first_or_default(table.xpath(SUBTITLE_XPATH), '')

    score = first_or_default(table.xpath(SCORE_XPATH))
    raw_count = first_or_default(table.xpath(RATING_COUNT_XPATH))
    evale = None if raw_count is None else clean_rating_count(raw_count)
    publish = first_or_default(table.xpath(PUBLISH_XPATH))
    word = first_or_default(table.xpath(QUOTE_XPATH))
    return title, score, evale, publish, word


def fetch_tables(start):
    """Download the listing page at offset *start* and return its book tables.

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status (via ``raise_for_status``), so that an error
            page is never parsed as if it were a listing.
    """
    response = requests.get(
        url=PAGE_URL.format(start), headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    root = etree.HTML(response.text)
    if root is None:  # nothing parseable in the response body
        return []
    return root.xpath(TABLE_XPATH)


def main():
    """Scrape every listing page and append the books to OUTPUT_PATH."""
    rank = 0
    for start in range(0, TOTAL_BOOKS, BOOKS_PER_PAGE):
        if start:
            # Pause between requests; no pause is needed before the first one.
            time.sleep(PAGE_DELAY)

        lines = []
        for table in fetch_tables(start):
            book = parse_book(table)
            if book is None:
                continue
            rank += 1
            lines.append(format_record(rank, *book))

        # One append per page: earlier pages are kept on disk if a later
        # request fails, without reopening the file for every single book.
        if lines:
            with open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
                f.writelines(lines)
    print('finish saving!')


if __name__ == '__main__':
    main()