Douban Movie Scraper in Practice
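This walkthrough scrapes Douban movies in three passes: the listing API for titles, ratings and posters (Basic Data), each movie's detail page for structured metadata (Detailed Data), and the review endpoint for full review texts (Full Reviews), with an optional word cloud at the end.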

Basic Data

import csv
import os
import random
import time

import requests
from fake_useragent import UserAgent

headers = {
    'User-Agent': UserAgent().random
}
with open('豆瓣链接数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
    writer = csv.writer(fp)
    writer.writerow(["title", "rate", "url", "pic_url"])
# Available tag values for the API: 热门 最新 经典 可播放 豆瓣高分 冷门佳片 华语 欧美 韩国 日本 动作 喜剧 爱情 科幻 悬疑 恐怖 成长
if not os.path.exists('./picLibs'):
    os.mkdir('./picLibs')
for i in range(0, 1):  # one page of 100 results; raise the upper bound to fetch more pages
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=100&page_start={}'.format(i * 100)
    response = requests.get(url=url, headers=headers)
    data = response.json()

    print("Fetching page {}".format(i))
    cards = data["subjects"]

    for card in cards:
        rate = card["rate"]        # rating
        title = card["title"]      # movie title
        url = card["url"]          # detail-page URL
        pic_url = card["cover"]    # poster image URL

        # Download the poster image.
        img_data = requests.get(url=pic_url, headers=headers).content
        img_path = 'picLibs/' + title + '.jpg'
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(title, 'downloaded')

        # Append one row per movie to the CSV.
        with open('豆瓣链接数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
            writer = csv.writer(fp)
            writer.writerow([title, rate, url, pic_url])

    time.sleep(round(random.uniform(0, 3), 2))  # polite random delay between pages
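
One caveat: movie titles occasionally contain characters that Windows forbids in file names (e.g. ":" or "?"), which would make the poster write above raise. A minimal sketch of a sanitizer you could run title through before building img_path (the safe_name helper is illustrative, not part of the original script):

import re

def safe_name(name):
    # Replace characters that are illegal in Windows file names.
    return re.sub(r'[\\/:*?"<>|]', '_', name)

# img_path = 'picLibs/' + safe_name(title) + '.jpg'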
         
        

Detailed Data

import csv
import random
import re
import time

import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from pyquery import PyQuery as pq

headers = {
    'User-Agent': UserAgent().random
}

with open('豆瓣具体数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
    writer = csv.writer(fp)
    writer.writerow(["名字","评分","url","pic_url","导演","编剧","主演","类型","制片国家/地区","语言","上映日期","别名","片长","评价次数","电影简介","短评次数","影评次数"])
data = pd.read_csv("豆瓣链接数据.csv")
for i in range(len(data)):
    li = data.iloc[i].tolist()
    url = data.iloc[i]["url"]
    response = requests.get(url=url, headers=headers)
    page_text = response.text
    tree = etree.HTML(page_text)
    
    
    # The #info block lists director, screenwriter, cast, genre, country/region,
    # language, release date, runtime, and aliases as one run of text.
    div_list = tree.xpath('//*[@id="info"]//text()')
    texts = "".join(div_list).replace("\n        ", "").replace("\n", "")
    if "编剧" in texts:
        director = texts.split("编剧")[0].split(": ")[1]
        screenwriter = texts.split("主演")[0].split("编剧: ")[1]
    else:
        # Some pages carry no screenwriter field.
        director = texts.split("主演")[0].split(": ")[1]
        screenwriter = "nan"

    # Cast, genre, country/region and language captured in one regex pass.
    zldy = list(re.findall(r"主演: (.*?)类型: (.*?)国家/地区: (.*?)语言: (.*?)上", texts)[0])
    time_place = texts.split("上映日期: ")[1].replace(")", ") ").split(" ")[0]
    time_place=texts.split("上映日期: ")[1].replace(")",") ").split(" ")[0]
    
    if "又名" in texts:
        names1=texts.split("又名: ")[1]
        if "IMDb" in names1:
            names1=names1.split("IMDb")[0]
    else:names1="nan"
    
    if "片长" in texts:
        t=texts.split("片长: ")[1].replace("钟","钟 ").split(" ")[0]
    else:t="nan"
    
    
    li.append(director)
    li.append(screenwriter)
    li.extend(zldy)
    li.append(time_place)
    li.append(names1)
    li.append(t)
    
    comment_count = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span//text()')[0]
    brief = "".join(tree.xpath('//*[@id="link-report"]/span//text()'))
    if len(brief) != 0:
        brief = pq(brief).text()  # strip residual markup and collapse whitespace
    else:
        brief = "nan"

    # Short-comment count, parsed from the "全部 N 条" link text.
    com = tree.xpath('//*[@id="comments-section"]/div[1]/h2/span/a/text()')[0]
    com = re.findall(r"全部 (.*?) 条", com)[0]

    # Full-review count.
    com1 = tree.xpath('//*[@id="reviews-wrapper"]/header/h2/span/a/text()')[0]
    com1 = re.findall(r"全部 (.*?) 条", com1)[0]
    
    li.append(comment_count)
    li.append(brief)
    li.append(com)
    li.append(com1)
    
    print(li)
    with open('豆瓣具体数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
        writer = csv.writer(fp)
        writer.writerow(li)
    time.sleep(round(random.uniform(0, 1), 2))  # polite random delay
    if i == 5:
        break  # demo cap: stop after six movies
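
The chained split calls above break as soon as a field is missing or ordered differently. A sketch of a more tolerant alternative that turns the #info text into a label→value dict (parse_info is an illustrative helper, not the original approach; the labels follow the field names visible on Douban detail pages):

import re

INFO_LABELS = ["导演", "编剧", "主演", "类型", "制片国家/地区", "语言", "上映日期", "片长", "又名", "IMDb"]

def parse_info(texts):
    # re.split with a capturing group yields [prefix, label, value, label, value, ...]
    parts = re.split("(" + "|".join(INFO_LABELS) + r"): ?", texts)
    return {label: value.strip() for label, value in zip(parts[1::2], parts[2::2])}

# info = parse_info(texts)
# director = info.get("导演", "nan")
# screenwriter = info.get("编剧", "nan")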
   

Full Reviews

import random
import time

import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from pyquery import PyQuery as pq
def wordcrowd(text):
    """Tokenize the reviews with jieba, drop stopwords, and render a word cloud."""
    import jieba
    import matplotlib
    import matplotlib.pyplot as plt
    import pandas as pd
    from wordcloud import WordCloud

    # Tokenize each review.
    content = []
    for line in text:
        words = jieba.lcut(line)
        if len(words) > 1:  # skip empty or single-token lines
            content.append(words)
    df_content = pd.DataFrame({'content': content})

    # Load the stopword list (one word per line).
    stopwords = pd.read_csv("停用词表.txt", index_col=False, sep='\t', quoting=3,
                            names=['stopword'], encoding="utf-8")

    def drop_stopwords(text, stopwords):
        content_clean = []
        all_words = []
        for line in text:
            line_clean = []
            for word in line:
                if word in stopwords or word == " ":
                    continue
                line_clean.append(word)
                all_words.append(str(word))
            content_clean.append(line_clean)
        return content_clean, all_words

    contents = df_content.content.values.tolist()
    stopwords = stopwords.stopword.values.tolist()
    contents_clean, all_words = drop_stopwords(contents, stopwords)

    # Count word frequencies across all reviews.
    df_all_words = pd.DataFrame({'all_words': all_words})
    words_count = df_all_words.groupby("all_words").size().reset_index(name="count")
    words_count = words_count.sort_values(by="count", ascending=False)

    # Render a word cloud from the 100 most frequent words.
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    wordcloud = WordCloud(font_path=r"C:\Windows\Fonts\msyh.ttc",
                          background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

headers = {
    'User-Agent': UserAgent().random
}

data = pd.read_csv("豆瓣链接数据.csv")

# Variant: scrape short comments (短评) instead of full reviews.
# for i in range(len(data)):
#     url = data.iloc[i]["url"]
#     li = []
#     for j in range(0, 3):
#         url1 = url + "comments?start={}&limit=20&status=P&sort=new_score".format(j * 20)
#         response = requests.get(url=url1, headers=headers)
#         tree = etree.HTML(response.text)
#         div_list = tree.xpath('//*[@id="comments"]/div')
#         for div in div_list:
#             try:
#                 stamp = div.xpath("./div[2]/h3/span[2]/span[3]/text()")[0].replace(" ", "").replace("\n", "")
#                 print(stamp)
#             except IndexError:
#                 pass
#             comment = div.xpath("./div[2]/p/span//text()")
#             li.extend(comment)
for i in range(len(data)):
    url = data.iloc[i]["url"]
    li = []

    for j in range(0, 1):  # first page of reviews only; raise the bound for more
        url1 = url + "reviews?start={}".format(j * 20)
        response = requests.get(url=url1, headers=headers)
        tree = etree.HTML(response.text)
        div_list = tree.xpath('//*[@id="content"]/div/div[1]/div[1]/div')
        for div in div_list:
            try:
                times = div.xpath("./div/header/span[2]/text()")[0]
                print(times)  # review timestamp
            except IndexError:
                pass
            # Each review's full text is served as JSON, keyed by the review id.
            review_id = div.xpath("./div/div/div/div/a/@id")[0].split("-")[1]
            urls = "https://movie.douban.com/j/review/{}/full".format(review_id)
            res = requests.get(url=urls, headers=headers)
            text = pq(res.json()["html"]).text().replace("\n", "")
            li.append(text)

            time.sleep(round(random.uniform(0, 3), 2))  # polite random delay

        # Word-frequency cloud for this movie's reviews:
        # wordcrowd(li)
    break  # demo cap: only the first movie
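
The original code also imports sklearn's TfidfVectorizer without using it. If the goal was ranked keywords per movie rather than a raw frequency cloud, a minimal sketch of that idea (the top_keywords helper is an illustration, not part of the original script):

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

def top_keywords(reviews, k=10):
    # Pre-tokenize with jieba and join with spaces so the vectorizer splits on whitespace.
    docs = [" ".join(jieba.lcut(r)) for r in reviews]
    vec = TfidfVectorizer(max_features=1000)
    tfidf = vec.fit_transform(docs)
    # Rank terms by their summed TF-IDF weight across all reviews.
    scores = tfidf.sum(axis=0).A1
    terms = vec.get_feature_names_out()
    return sorted(zip(terms, scores), key=lambda t: -t[1])[:k]

# print(top_keywords(li))  # li holds the review texts collected above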
    