The previous post scraped the Douban reviews of The Great Wall with the urllib package, but it could never fetch the full dataset: after a few hundred comments the server started refusing access, and I never pinned down why. Note that I was using urllib.request from Python 3; what Python 2 called urllib2 was split into urllib.request and urllib.error in Python 3. This post switches to the third-party requests package to get around the incomplete-data problem, and as you will see, requests is also considerably simpler to use.
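To make the difference concrete, here is a minimal sketch (my own illustration, not code from the previous post) of the same authenticated GET done both ways; the URL is the real comments page, but the cookie and User-Agent values are placeholders:

import urllib.request
import requests

url = 'https://movie.douban.com/subject/6982558/comments?status=P'

# urllib.request: the cookie has to be packed into a raw header string by hand
req = urllib.request.Request(url, headers={'Cookie': 'bid=xxxx', 'User-Agent': 'Mozilla/5.0'})
html1 = urllib.request.urlopen(req, timeout=20).read().decode('utf-8')

# requests: pass a plain dict and let the library build the Cookie header
html2 = requests.get(url, cookies={'bid': 'xxxx'}, headers={'User-Agent': 'Mozilla/5.0'}, timeout=20).text

The complete requests-based crawler follows.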
from bs4 import BeautifulSoup
import re
#import urllib.request, urllib.parse, urllib.error  # the urllib version from the previous post, kept for reference
import time
import numpy as np
import requests
import xlwt
import mysql.connector
# The cookie string below was copied out of the browser after logging in to Douban
raw_cookies='viewed="2000732"; bid=hLr_29qhXqQ; gr_user_id=542149e0-529c-40aa-82d9-1ac0f5705308; ll="108296"; ps=y; _vwo_uuid_v2=7D935E4C5EE5A54CD0ED7FD73DB6486E|52676d78d35275c844d98934cdd100e6; ap=1; dbcl2="155577244:PS9mOEvKstQ"; ck=G5IX; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=c67059403c59f1c1.1482632518.7.1482843964.1482763369.; _pk_ses.100001.4cf6=*; __utma=30149280.1695826899.1478691770.1482813867.1482843967.9; __utmb=30149280.0.10.1482843967; __utmc=30149280; __utmz=30149280.1482843967.9.3.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/accounts/login; __utmv=30149280.15557; __utma=223695111.710728627.1482632518.1482763017.1482843967.7; __utmb=223695111.0.10.1482843967; __utmc=223695111; __utmz=223695111.1482843967.7.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/accounts/login'
cookies = {}
for line in raw_cookies.split(';'):
    key, value = line.strip().split('=', 1)  # maxsplit=1: split only at the first '=', since values may contain '='
    cookies[key] = value
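# Alternative (not used below): the standard library's http.cookies.SimpleCookie
# parses the same raw Cookie header string; a minimal sketch for reference,
# though it may be stricter about nonstandard cookie names than the loop above:
from http.cookies import SimpleCookie
def parseCookies(raw):  # hypothetical helper, roughly equivalent to the loop above
    jar = SimpleCookie()
    jar.load(raw)
    return {key: morsel.value for key, morsel in jar.items()}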
# Send the request with our own headers and parse out the fields we need
def getInfo(url):
    Infolist = []
    header = dict()
    header['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    header['Accept-Encoding'] = 'gzip, deflate, sdch, br'
    header['Accept-Language'] = 'zh-CN,zh;q=0.8,zh-TW;q=0.6'
    header['Connection'] = 'keep-alive'
    header['Host'] = 'movie.douban.com'
    header['Referer'] = 'https://www.douban.com/accounts/login?source=movie'
    header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    # request the page, sending the logged-in cookies along with the headers
    data = requests.get(url, timeout=20, headers=header, cookies=cookies).text
    soup = BeautifulSoup(data, 'lxml')
    comments = soup.find_all('div', class_='comment-item')
    for i in comments:
        listinfo = []
        com = i.find('span', class_='comment-info')
        listinfo.append(com.contents[1].text.strip())  # username
        rating = i.find('span', class_='rating')
        if rating is not None:
            rating = rating.get('title')
        else:
            rating = 'no rating'
        listinfo.append(rating)
        listinfo.append(i.find('span', class_='comment-time').text.strip())  # comment time
        listinfo.append(i.find('span', class_='votes pr5').text.strip())  # upvote count
        listinfo.append(i.find('p').text.strip())  # the comment text itself
        Infolist.append(listinfo)
    # the "next page" link; an empty match means the last page has been reached
    pattern = re.compile(r'a href="(.*?)" data-page="" class="next"')
    S = re.findall(pattern, data)
    return Infolist, S
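# Note (an alternative, not what getInfo does): a requests.Session keeps the
# cookie jar and the underlying connection alive across pages, so cookies need
# not be passed on every call; a minimal sketch reusing the cookies dict above:
def makeSession():
    s = requests.Session()
    s.cookies.update(cookies)  # sent automatically on every s.get(...)
    return s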
# Connect to the database
def connDB():
    conn = mysql.connector.connect(user='root', passwd='123456', database='review', charset='utf8mb4')
    cursor = conn.cursor()
    return (conn, cursor)
# Close the database connection
def exitConn(conn, cursor):
    cursor.close()
    conn.close()
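# For robustness (my addition, not part of the original script): a try/finally
# wrapper guarantees the connection is closed even when a query raises:
def withConn(work):
    conn, cursor = connDB()
    try:
        return work(conn, cursor)
    finally:
        exitConn(conn, cursor)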
# Save to the MySQL database
def SaveMysql(datalist):
    conn, cursor = connDB()
    cursor.execute('create table movie_review3 \
        (user_id varchar(50),\
        rating varchar(20),\
        comment_time varchar(20),\
        comment_vote varchar(50),\
        comment varchar(3000))')
    for i in range(0, len(datalist)):
        com = ''
        data = datalist[i]
        # keep only Chinese characters, letters, digits and a few symbols from the username
        user = re.findall(r'[\u4e00-\u9fa5_a-zA-Z0-9.*]+', data[0])
        if len(user) == 0:
            user = ['user' + str(i)]  # fallback name when nothing matched
        # keep only the Chinese text of the comment, comma-joined
        for j in re.findall(r'[\u4E00-\u9FA5\\]+', data[4]):
            com = com + j + ','
        cursor.execute('insert into movie_review3 values\
            (%s,%s,%s,%s,%s)', [user[0], data[1], data[2], data[3], com])
    conn.commit()
    exitConn(conn, cursor)
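# A faster variant (a sketch of mine, not the original function): mysql.connector's
# cursor.executemany sends all rows in one batched call; rows must already be
# cleaned 5-tuples of (user, rating, time, votes, comment):
def saveMysqlBatch(rows):
    conn, cursor = connDB()
    cursor.executemany('insert into movie_review3 values (%s,%s,%s,%s,%s)', rows)
    conn.commit()
    exitConn(conn, cursor)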
# Save to an Excel file
def saveData(alllist, savepath):
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('Douban movie reviews', cell_overwrite_ok=True)
    col = ['user_id', 'rating', 'comment_time', 'comment_vote', 'comment']
    for i in range(0, 5):
        sheet.write(0, i, col[i])  # column headers
    for i in range(0, len(alllist)):  # one row per scraped comment
        com = ''
        data = alllist[i]
        user = re.findall(r'[\u4e00-\u9fa5_a-zA-Z0-9.*]+', data[0])
        if len(user) == 0:
            user = ['user' + str(i)]
        for j in re.findall(r'[\u4E00-\u9FA5\\]+', data[4]):
            com = com + j + ','
        sheet.write(i + 1, 0, user[0])
        sheet.write(i + 1, 1, data[1])
        sheet.write(i + 1, 2, data[2])
        sheet.write(i + 1, 3, data[3])
        sheet.write(i + 1, 4, com)
    book.save(savepath)
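# Note: xlwt produces the legacy .xls format, whose sheets are capped at 65536
# rows; the ~54740 comments here fit, but a larger crawl would need .xlsx
# (e.g. via openpyxl) instead.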
# Save to a txt file
def saveTxt(alllist):
    f = open('text.txt', 'w', encoding='utf-8')
    for i in range(0, len(alllist)):
        com = ''
        data = alllist[i]
        user = re.findall(r'[\u4e00-\u9fa5_a-zA-Z0-9.*]+', data[0])
        if len(user) == 0:
            user = ['user' + str(i)]
        for j in re.findall(r'[\u4E00-\u9FA5\\]+', data[4]):
            com = com + j + ','
        # '|'-separated fields: user, rating, time, votes, comment
        f.write(user[0] + '|' + data[1] + '|' + data[2] + '|' + data[3] + '|' + com + '\n')
    f.close()
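# Alternative sketch (my addition): the csv module handles the '|' delimiter
# and any quoting/escaping for us, instead of hand-written f.write calls:
import csv
def saveCsv(alllist, path='reviews.csv'):  # path is a hypothetical default
    with open(path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter='|').writerows(alllist)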
# Main entry point
if __name__ == '__main__':
    alllist = []
    url = 'https://movie.douban.com/subject/6982558/comments?status=P'
    x = 0
    while 1:
        x = x + 1
        print(x)  # page counter
        infolist, S = getInfo(url)
        alllist.extend(infolist)
        if len(S) == 0:  # no "next page" link left: we are done
            break
        time.sleep(np.random.rand() * 5)  # random pause so the crawl looks less like a bot
        url = 'https://movie.douban.com/subject/6982558/comments' + re.sub(r'amp;', '', S[0])  # next-page url, with &amp; unescaped
    SaveMysql(alllist)
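If Douban still answers an aggressive crawl with errors, a retry-with-backoff wrapper around the page fetch is a reasonable extra safeguard. This is only a sketch of mine, not part of the script above, and it assumes the header dict from getInfo has been hoisted to module level:

def fetchWithRetry(url, retries=3):
    for attempt in range(retries):
        resp = requests.get(url, timeout=20, headers=header, cookies=cookies)  # header: assumed module-level
        if resp.status_code == 200:
            return resp.text
        time.sleep(2 ** attempt * 5)  # back off: 5s, 10s, 20s between attempts
    raise RuntimeError('giving up on ' + url)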