Scraping Douban reviews of 《长城》 (The Great Wall) with requests

       The previous article scraped the Douban reviews of 《长城》 with the urllib package, but it could never fetch the full data set: after a few hundred comments the server would block further requests, and I never found out why. Note that I was using urllib.request from Python 3; in Python 2 this was urllib2, which Python 3 split into urllib.request and urllib.error. This article switches to the third-party requests package, which solves the problem of not being able to scrape all the data, and, as you will see, makes the code considerably simpler.
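To see why requests is the simpler tool here, compare the two ways of sending the same logged-in GET request. This is only a minimal sketch, not part of the article's script: the cookie values are placeholders, and the real header and cookie handling follows in the full script below.

import requests
import urllib.request

url = 'https://movie.douban.com/subject/6982558/comments?status=P'
headers = {'User-Agent': 'Mozilla/5.0'}
cookies = {'bid': 'xxxxxxxx', 'dbcl2': '"placeholder"'}   # placeholder login cookies

# with requests: headers and cookies are plain dicts passed as keyword arguments
html = requests.get(url, headers=headers, cookies=cookies, timeout=20).text

# roughly equivalent with urllib.request: the cookies must be folded back
# into a single Cookie header by hand
cookie_header = '; '.join('%s=%s' % (k, v) for k, v in cookies.items())
req = urllib.request.Request(url, headers=dict(headers, Cookie=cookie_header))
html = urllib.request.urlopen(req, timeout=20).read().decode('utf-8')

The full script follows.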
from bs4 import BeautifulSoup
import re
#import urllib.request, urllib.parse, urllib.error
import time
import numpy as np
import requests
import xlwt
import mysql.connector

# The cookie string below was copied out of the browser after logging in to Douban
raw_cookies='viewed="2000732"; bid=hLr_29qhXqQ; gr_user_id=542149e0-529c-40aa-82d9-1ac0f5705308; ll="108296"; ps=y; _vwo_uuid_v2=7D935E4C5EE5A54CD0ED7FD73DB6486E|52676d78d35275c844d98934cdd100e6; ap=1; dbcl2="155577244:PS9mOEvKstQ"; ck=G5IX; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=c67059403c59f1c1.1482632518.7.1482843964.1482763369.; _pk_ses.100001.4cf6=*; __utma=30149280.1695826899.1478691770.1482813867.1482843967.9; __utmb=30149280.0.10.1482843967; __utmc=30149280; __utmz=30149280.1482843967.9.3.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/accounts/login; __utmv=30149280.15557; __utma=223695111.710728627.1482632518.1482763017.1482843967.7; __utmb=223695111.0.10.1482843967; __utmc=223695111; __utmz=223695111.1482843967.7.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/accounts/login'
cookies={}
for line in raw_cookies.split(';'):
    key,value=line.strip().split('=',1)   # maxsplit=1: only split on the first '=', since values may contain '='
    cookies[key]=value
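# A quick illustration of what the loop above produces: the fragment
# bid=hLr_29qhXqQ in raw_cookies becomes the entry cookies['bid'] == 'hLr_29qhXqQ',
# and the finished dict can be passed straight to requests.get(..., cookies=cookies) below.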
# Send the request with the headers below and extract the fields we need from one comments page
def getInfo(url):
    Infolist=[]
    header = dict()
    header['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    header['Accept-Encoding'] = 'gzip, deflate, sdch, br'
    header['Accept-Language'] = 'zh-CN,zh;q=0.8,zh-TW;q=0.6'
    header['Connection'] = 'keep-alive'
    header['Host'] = 'movie.douban.com'
    header['Referer']='https://www.douban.com/accounts/login?source=movie'
    header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    
    get_url = url  # request this page with the login cookies
    data=requests.get(get_url, timeout =20, headers = header,cookies=cookies).text
    soup = BeautifulSoup(data,'lxml') 
    comments=soup.find_all("div", class_='comment-item')
    for i in comments:
        listinfo=[]
        com=i.find('span',class_='comment-info') 
        listinfo.append(com.contents[1].text.strip())   # user name
        # rating
        rating=i.find('span',class_="rating")
        if rating!=None:
            rating=rating.get('title')
        else:
            rating='无评分'
        #print(rating)
        listinfo.append(rating)
       # listinfo.append(com.contents[7].text.strip())   # comment time (old approach)
        listinfo.append(i.find('span',class_="comment-time").text.strip())  # comment time
        listinfo.append(i.find('span',class_="votes pr5").text.strip()) # upvote count
        listinfo.append(i.find('p').text.strip())  # the review text
        #print(i.find('p').text)
        Infolist.append(listinfo)
    pattern=re.compile(r'a href="(.*?)" data-page="" class="next"')
    S=re.findall(pattern,data)
    return Infolist,S
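# A note on paging (an assumption based on the regex above and on the
# re.sub(r'amp;','',S[0]) call in the main loop, not a verified description of
# Douban's current markup): the "next page" link is expected to look roughly like
#   <a href="?start=20&amp;limit=20&amp;sort=new_score&amp;status=P" data-page="" class="next">
# so findall() captures the href query string, and stripping the literal 'amp;'
# turns the HTML-escaped '&amp;' back into '&' before it is appended to the
# comments URL to build the next page's address.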
# Connect to the MySQL database
def connDB():
    conn=mysql.connector.connect(user='root',passwd='123456',database='review',charset='utf8mb4')
    cursor=conn.cursor()
    return(conn,cursor)
# Close the database connection
def exitConn(conn,cursor):
    cursor.close()
    conn.close()
# Save the scraped data to MySQL
def SaveMysql(datalist):
    conn,cursor=connDB()
    
    cursor.execute('create table movie_review3 \
       (user_id varchar(50) ,\
        rating varchar(20),\
        comment_time varchar(20),\
        comment_vote varchar(50),\
        comment varchar(3000))')  
    for i in range(len(datalist)):   # one insert per scraped review
        print(i)
        com=''
        data=datalist[i]
        user=re.findall(r'[\u4e00-\u9fa5_a-zA-Z0-9.*]+',data[0])
        if len(user)==0:
            user=['用户名'+str(i)]   # fall back to a placeholder user name
        for j in re.findall(r'[\u4E00-\u9FA5\\]+',data[4]):
            com=com+j+','
        print('insert into movie_review3 values\
         (%s,%s,%s,%s,%s)',[user[0],data[1],data[2],data[3],com])
        cursor.execute('insert into movie_review3 values\
         (%s,%s,%s,%s,%s)',[user[0],data[1],data[2],data[3],com])
        conn.commit()
    exitConn(conn,cursor) 
# Save to an Excel file
def saveData(alllist,savepath):
    book=xlwt.Workbook(encoding='utf-8',style_compression=0)
    sheet=book.add_sheet(u'豆瓣最受欢迎影评',cell_overwrite_ok=True)
    col=['user_id','rating','comment_time','comment_vote','comment']
    
    for i in range(0,5):
        print(i)
        sheet.write(0,i,col[i])  # column headers
    
    for i in range(len(alllist)):   # one row per scraped review
        com=''
        data=alllist[i]
        user=re.findall(r'[\u4e00-\u9fa5_a-zA-Z0-9.*]+',data[0])
        if len(user)==0:
            user=['用户名'+str(i)]   # fall back to a placeholder user name
        for j in re.findall(r'[\u4E00-\u9FA5\\]+',data[4]):
            com=com+j+','
        #for j in range(0,5): # there was a problem here: the comment column would not write, and I never found the cause
        print("user: ",user)
        print("comment: ",com)
        sheet.write(i+1,0,user[0])   # user_id
        sheet.write(i+1,1,data[1])   # rating
        sheet.write(i+1,2,data[2])   # comment_time
        sheet.write(i+1,3,data[3])   # comment_vote
        sheet.write(i+1,4,com)       # comment
    """
    for i in range(0,30):#总共50条影评
        data=datalist[i]
        sheet.write(i+1,8,data[8])#数据
    """
    book.save(savepath)  # save the workbook
# Save to a plain-text file
def saveTxt(alllist):
    f = open("text.txt",'w')
    for i in range(0,54740):#总共50000条影评
        com=''
        data=alllist[i]
        user=re.findall(r'[\u4e00-\u9fa5_a-zA-Z0-9.*]+',data[0])
        if len(user)==0:
            user='用户名'+str(i)
        for j in re.findall(r'[\u4E00-\u9FA5\\]+',data[4]):
            com=com+j+','
        f.write(user[0])
        f.write('|')
        f.write(data[1])
        f.write('|')
        f.write(data[2])
        f.write('|')
        f.write(data[3])
        f.write('|')
        f.write(com)
        f.write('\n') 
    f.close()
# Main entry point
if __name__=='__main__':
    alllist=[]
    url="https://movie.douban.com/subject/6982558/comments?status=P"
    x=0   # page counter, only used for progress printing
    while 1:
        x=x+1
        print(x)
        infolist,S=getInfo(url)
        alllist.extend(infolist)
        if (len(S)==0):   # no "next page" link found: this was the last page
            break
        time.sleep(np.random.rand()*5)   # random pause of up to 5 s to avoid being blocked
        url="https://movie.douban.com/subject/6982558/comments"+re.sub(r'amp;','',S[0])  # next-page URL

    SaveMysql(alllist)
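If you would rather keep the results in Excel or a plain-text file instead of MySQL, call saveData(alllist, 'changcheng_comments.xls') or saveTxt(alllist) at this point instead; the file name here is only an example.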
