import re
import requests
from urllib.error import HTTPError
import csv
def get_content(url):
    """Fetch *url* with a browser User-Agent and return the body as bytes.

    Returns '' (falsy) on any request failure, so callers can test the
    result with a simple truthiness check.
    """
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
    try:
        response = requests.get(url, headers={'User-Agent': user_agent})
        # Turn 4xx/5xx status codes into requests.exceptions.HTTPError.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # RequestException covers HTTPError plus connection/timeout errors.
        # The original also caught urllib.error.HTTPError, which requests
        # never raises, so that clause was dead code.
        print('爬取失败', e)
        return ''
    else:
        print(response.url)
        print('爬取成功')
        return response.content
def parser_content(html):
    """Parse the Douban "now playing" page.

    Parameters
    ----------
    html : bytes/str  Raw HTML of https://movie.douban.com/nowplaying.

    Returns
    -------
    list of (movie_name, comments_url) tuples; empty list when the
    expected page structure is absent (e.g. the fetch failed upstream),
    instead of raising IndexError as the original did.
    """
    import lxml.etree as etree
    selector = etree.HTML(html)
    containers = selector.xpath('//div[@id="nowplaying"]/div[@class="mod-bd"]')
    if not containers:
        return []
    new_urls = []
    for item in containers[0].xpath('.//ul[@class="lists"]/li'):
        ids = item.xpath('.//@id')
        names = item.xpath('.//li[@class="stitle"]/a[@class="ticket-btn"]/text()')
        if not ids or not names:
            # Skip malformed entries rather than crashing on [0].
            continue
        new_urls.append((names[0], 'https://movie.douban.com/subject/%s/comments' % ids[0]))
    return new_urls
def parser_comment(html, comments):
    """Extract the short comments from one comment page into *comments*.

    Appends each comment text as its own list element. (The original
    appended str() of the whole xpath result, storing the Python list
    repr as a single string — which csv.writerows later exploded into
    one character per cell.)

    Returns the xpath result for the "next page" ("后页") link: a list of
    href strings, empty on the last page.
    """
    import lxml.etree as etree
    selector = etree.HTML(html)
    # One string per <span class="short"> element.
    comments.extend(selector.xpath('//span[@class="short"]/text()'))
    return selector.xpath('//a[contains(text(),"后页")]/@href')
import jieba
import numpy as np
from PIL import Image
from wordcloud import wordcloud
def gen_wordCloud(text, filename,
                  mask_path='doc/tree.jpg',
                  font_path='/usr/share/fonts/wqy-zenhei/wqy-zenhei.ttc'):
    """Segment *text* with jieba and render it as a word-cloud image.

    Parameters
    ----------
    text : str       Text (Chinese/English) to segment and render.
    filename : str   Output image path (format inferred from extension).
    mask_path : str  Image whose silhouette shapes the cloud
                     (was hard-coded; default preserves old behavior).
    font_path : str  CJK-capable font file; required for Chinese glyphs
                     (was hard-coded; default preserves old behavior).
    """
    words = jieba.lcut(text)
    # The mask image as an array: non-white areas define the cloud shape.
    img_mask = np.array(Image.open(mask_path))
    wc = wordcloud.WordCloud(
        font_path=font_path,
        mask=img_mask,
        background_color='snow',
        min_font_size=5,
        max_font_size=50,
    )
    wc.generate(','.join(words))
    wc.to_file(filename)
def movieSpider():
    """Crawl every now-playing movie's comments and save each movie's
    comments to doc/<name>.csv (one comment per row).

    Side effect: appends each movie name to the module-level ``movies``
    list, which the __main__ block consumes afterwards.
    """
    content = get_content('https://movie.douban.com/nowplaying')
    for name, comments_url in parser_content(content):
        comments = []
        movies.append(name)
        html = get_content(comments_url)
        # Follow pagination until there is no "next page" link
        # (``while True and html`` in the original was just ``while html``).
        while html:
            next_page = parser_comment(html, comments)
            if not next_page:
                print('爬取完毕')
                break
            html = get_content(comments_url + next_page[0])
        # newline='' is required by the csv module to avoid doubled line
        # endings on Windows; utf-8 keeps Chinese text portable.
        with open('doc/%s.csv' % name.strip(), 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # One comment per row. writerows() on bare strings would
            # iterate each string and emit one character per cell.
            writer.writerows([c] for c in comments)
if __name__ == '__main__':
    # Filled by movieSpider() with one entry per now-playing movie.
    movies = []
    # Crawl the comments and save them to per-movie CSV files.
    movieSpider()
    # Keep only CJK and alphanumeric runs. The original pattern,
    # r'[[\u4e00-\u9fa5]+|[a-zA-Z0-9]+]', had stray outer brackets that
    # turned the first alternative into a character class matching a
    # literal '[' and forced a trailing ']' on the second.
    pattern = re.compile(r'[\u4e00-\u9fa5]+|[a-zA-Z0-9]+')
    for movie in movies:
        text = ''
        with open('doc/%s.csv' % movie.strip(), encoding='utf-8') as f:
            for row in csv.reader(f):
                text += ''.join(pattern.findall(str(row)))
        gen_wordCloud(text, 'doc/img/%s.png' % movie)