# ===== Section 1: 基本数据 (basic data) — scrape the movie list =====
from lxml import etree
import time
import json
import requests
import csv
import random
import os
from fake_useragent import UserAgent
# --- Pass 1: scrape the Douban "热门" movie list into 豆瓣链接数据.csv ---
headers = {
    'User-Agent': UserAgent().random
}

# Write the CSV header row once.
with open('豆瓣链接数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
    writer = csv.writer(fp)
    writer.writerow(["title", "rate", "url", "pic_url"])

# Other available tags: 热门 最新 经典 可播放 豆瓣高分 冷门佳片 华语 欧美 韩国 日本 动作 喜剧 爱情 科幻 悬疑 恐怖 成长
if not os.path.exists('./picLibs'):
    os.mkdir('./picLibs')

for i in range(1, 2):
    # NOTE(review): page_start is fed the raw loop index, so with
    # page_limit=100 this starts at item 1 and skips item 0;
    # page_start=(i-1)*100 was probably intended — confirm before changing
    # (kept as-is so the fetched data does not silently shift).
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=100&page_start={}'.format(i)
    response = requests.get(url=url, headers=headers)
    data = response.json()
    print("第{}条正在爬取".format(i))
    for card in data["subjects"]:
        rate = card["rate"]       # rating
        title = card["title"]     # movie title
        detail_url = card["url"]  # detail-page url (don't shadow the list url)
        pic_url = card["cover"]   # poster url
        # Download the poster image.
        img_data = requests.get(url=pic_url, headers=headers).content
        # Strip characters that are illegal in (Windows) file names so open()
        # cannot fail on titles containing e.g. ':' or '?'.
        safe_title = ''.join(ch for ch in title if ch not in '\\/:*?"<>|')
        img_path = 'picLibs/' + safe_title + '.jpg'
        with open(img_path, 'wb') as img_fp:
            img_fp.write(img_data)
        print(title, '下载成功!!!')
        # Append immediately so a mid-run crash keeps the rows scraped so far.
        with open('豆瓣链接数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
            csv.writer(fp).writerow([title, rate, detail_url, pic_url])
        # Random polite delay between requests (0–3 s).
        time.sleep(random.uniform(0, 3))
# ===== Section 2: 具体数据 (detail data) — scrape per-movie detail pages =====
from lxml import etree
import time
import json
import requests
import csv
import numpy as np
import pandas as pd
import re
import random
from fake_useragent import UserAgent
from pyquery import PyQuery as pq
# --- Pass 2: scrape each movie's detail page listed in 豆瓣链接数据.csv ---
headers = {
    'User-Agent': UserAgent().random
}

# Write the CSV header row once.
with open('豆瓣具体数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
    writer = csv.writer(fp)
    writer.writerow(["名字","评分","url","pic_url","导演","编剧","主演","类型","制片国家/地区","语言","上映日期","别名","片长","评价次数","电影简介","短评次数","影评次数"])

data = pd.read_csv("豆瓣链接数据.csv")
for i in range(0, data.index.stop):
    li = data.iloc[i].tolist()   # title, rate, url, pic_url from pass 1
    url = data.iloc[i]["url"]
    response = requests.get(url=url, headers=headers)
    page_text = response.text
    tree = etree.HTML(page_text)
    # Flatten the whole #info panel into one string and slice it on the
    # Chinese field labels (导演/编剧/主演/...).
    div_list = tree.xpath('//*[@id="info"]//text()')
    texts = "".join(div_list).replace("\n ", "").replace("\n", "")
    if "编剧" in texts:
        director = texts.split("编剧")[0].split(": ")[1]
        screenwriter = texts.split("主演")[0].split("编剧: ")[1]
    else:
        director = texts.split("主演")[0].split(": ")[1]
        screenwriter = "nan"
    # Cast / genre / country / language, captured in a single regex pass.
    zldy = list(re.findall(r"主演: (.*?)类型: (.*?)国家/地区: (.*?)语言: (.*?)上", texts)[0])
    time_place = texts.split("上映日期: ")[1].replace(")", ") ").split(" ")[0]
    if "又名" in texts:
        names1 = texts.split("又名: ")[1]
        if "IMDb" in names1:
            names1 = names1.split("IMDb")[0]
    else:
        names1 = "nan"
    if "片长" in texts:
        t = texts.split("片长: ")[1].replace("钟", "钟 ").split(" ")[0]
    else:
        t = "nan"
    li.append(director)
    li.append(screenwriter)
    li.extend(zldy)
    li.append(time_place)
    li.append(names1)
    li.append(t)
    # Rating count; guard the [0] lookup so a page without a rating section
    # yields "nan" instead of aborting the whole run with an IndexError.
    cc_nodes = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span//text()')
    comment_count = cc_nodes[0] if cc_nodes else "nan"
    # Synopsis text.
    brief = "".join(tree.xpath('//*[@id="link-report"]/span//text()'))
    brief = pq(brief).text() if len(brief) != 0 else "nan"
    # Short-comment count, from a link like "全部 1234 条" (guarded as above).
    com_nodes = tree.xpath('//*[@id="comments-section"]/div[1]/h2/span/a/text()')
    com_m = re.findall(r"全部 (.*?) 条", com_nodes[0]) if com_nodes else []
    com = com_m[0] if com_m else "nan"
    # Full-review count, same pattern.
    com1_nodes = tree.xpath('//*[@id="reviews-wrapper"]/header/h2/span/a/text()')
    com1_m = re.findall(r"全部 (.*?) 条", com1_nodes[0]) if com1_nodes else []
    com1 = com1_m[0] if com1_m else "nan"
    li.append(comment_count)
    li.append(brief)
    li.append(com)
    li.append(com1)
    print(li)
    # Append immediately so a mid-run crash keeps everything scraped so far.
    with open('豆瓣具体数据.csv', 'a', newline='', encoding='utf-8-sig') as fp:
        csv.writer(fp).writerow(li)
    # Random polite delay between requests (0–1 s).
    time.sleep(random.uniform(0, 1))
    if i == 5:
        break  # NOTE(review): hard stop after 6 movies — test limiter? confirm
# ===== Section 3: 具体评论 (reviews) — scrape full reviews per movie =====
from lxml import etree
import time
import json
import requests
import csv
import numpy as np
import pandas as pd
import re
import random
from fake_useragent import UserAgent
from pyquery import PyQuery as pq
def wordcrowd(text):
    """Render a word cloud from an iterable of text lines.

    Tokenizes each line with jieba, drops stop words (loaded from
    停用词表.txt), counts word frequencies, and displays the top 100
    words as a word cloud via matplotlib.
    """
    # Heavy third-party deps are imported lazily so the scraper can run
    # without them when no cloud is requested.
    import jieba

    # Tokenize; keep only lines that produce more than one token.
    # (The original also compared the token LIST against '\r\n', which is
    # always True for a list — the intended guard was on the raw line.)
    content = []
    for line in text:
        word = jieba.lcut(line)
        if len(word) > 1 and line != '\r\n':
            content.append(word)
    df_content = pd.DataFrame({'content': content})

    stopwords = pd.read_csv("停用词表.txt", index_col=False, sep='\t',
                            quoting=3, names=['stopword'], encoding="utf-8")

    def drop_stopwords(rows, stop_set):
        # Strip stop words per line; also collect surviving words flat
        # so frequencies can be counted afterwards.
        content_clean = []
        all_words = []
        for line in rows:
            line_clean = []
            for word in line:
                if word in stop_set or word == " ":
                    continue
                line_clean.append(word)
                all_words.append(str(word))
            content_clean.append(line_clean)
        return content_clean, all_words

    contents = df_content.content.values.tolist()
    # Set membership is O(1) vs O(n) per word on the raw list.
    stop_set = set(stopwords.stopword.values.tolist())
    contents_clean, all_words = drop_stopwords(contents, stop_set)
    df_all_words = pd.DataFrame({'all_words': all_words})
    words_count = df_all_words.groupby(by=["all_words"])["all_words"].agg([("count", np.size)])
    words_count = words_count.reset_index().sort_values(by=["count"], ascending=False)

    # Render the cloud from the 100 most frequent words.
    from wordcloud import WordCloud
    import matplotlib
    import matplotlib.pyplot as plt
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    wordcloud = WordCloud(font_path=r"C:\\Windows\\Fonts\\msyh.ttc",
                          background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
# --- Pass 3: fetch full reviews for each movie via the JSON review API ---
headers = {
    'User-Agent': UserAgent().random
}
data = pd.read_csv("豆瓣链接数据.csv")

for i in range(0, data.index.stop):
    url = data.iloc[i]["url"]
    li = []  # collected review texts for this movie
    for j in range(0, 1):
        # Review list page (20 reviews per page).
        url1 = url + "reviews?start={}".format(j * 20)
        response = requests.get(url=url1, headers=headers)
        page_text = response.text
        tree = etree.HTML(page_text)
        div_list = tree.xpath('//*[@id="content"]/div/div[1]/div[1]/div')
        times = ""  # pre-bind so the print below can never hit an unbound name
        for div in div_list:
            try:
                times = div.xpath("./div/header/span[2]/text()")[0]
            except IndexError:
                pass  # review without a date node: keep the previous value
            print(times)
            # The numeric review id is embedded in the expand link's @id
            # (e.g. "toggle-12345678-copy" -> "12345678"); don't shadow
            # the builtin `id`.
            review_id = div.xpath("./div/div/div/div/a/@id")[0].split("-")[1]
            urls = "https://movie.douban.com/j/review/{}/full".format(review_id)
            res = requests.get(url=urls, headers=headers)
            # The JSON payload carries the review body as an HTML fragment.
            text = pq(res.json()["html"]).text()
            text = text.replace("\n", "")
            li.append(text)
            # Random polite delay between requests (0–3 s).
            time.sleep(random.uniform(0, 3))
    # 电影词频图 (word-frequency cloud for this movie)
    # wordcrowd(li)
    break  # NOTE(review): only the first movie is processed — test limiter? confirm