import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# Simulate a logged-in browser session; build the list of review-page URLs
# (Douban paginates best book reviews 20 per page; `start` is the offset).
url = ['https://book.douban.com/review/best/?start={}'.format(str(page*20)) for page in range(10)]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Cookie': 'gr_user_id=772f0a06-fccf-4a48-9374-e5343ed2d161; _pk_id.100001.3ac3=965d7abe872f0558.1510192734.4.1523183061.1523177813.; douban-fav-remind=1; bid=BHbWlu_Bu4U; ll="118282"; __utmv=30149280.19050; douban-profile-remind=1; ct=y; __utma=30149280.105681601.1502792649.1560498228.1560733632.64; __utmc=30149280; __utmz=30149280.1560733632.64.58.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; ap_v=0,6.0; __utmt_douban=1; __utma=81379588.1928677283.1510192734.1548820755.1560733642.6; __utmc=81379588; __utmz=81379588.1560733642.6.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; viewed="30411849"; __utmb=30149280.18.9.1560733734953; __utmb=81379588.14.10.1560733642',
}
# Accumulates one (book, reader, think, level, useful, useless, time1) tuple per review.
data_list = []
# Simple use of select() and find_all(): fetch each page, parse the review
# fields in column order, and zip them into row tuples.
for u in url:
    html = requests.get(u, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    time.sleep(2)  # throttle between requests to avoid being blocked
    # Book title: only <img> tags with a title attribute are book covers.
    book_name = soup.find_all('img')
    book = [i.get('title') for i in book_name if i.get('title') is not None]
    # Reviewer name
    reader = [i.text for i in soup.select('.name')]
    # Review headline
    think = [i.text for i in soup.select('h2')]
    # Recommendation level (star rating, stored in the title attribute)
    level_1 = soup.select('.main-title-rating')
    level = [i.get('title') for i in level_1]
    # Helpful / unhelpful vote counts
    useful = [i.text.strip() for i in soup.select('a.action-btn.up')]
    useless = [i.text.strip() for i in soup.select('a.action-btn.down')]
    # Publish time
    time1 = [i.text for i in soup.select('.main-meta')]
    # NOTE: extend with row tuples, not append — appending would nest one
    # list per page instead of keeping data_list a flat list of rows.
    data_list.extend(zip(book, reader, think, level, useful, useless, time1))
# Build the result table and export it to the desktop as CSV.
df = pd.DataFrame(data_list, columns=['book', 'reader', 'think', 'level', 'useful', 'useless', 'time1'])
# Wrap titles in Chinese book-title marks for display.
df['book_name'] = "《" + df['book'] + "》"
df.to_csv("C:/Users/hhq/Desktop/douban.csv")