Here's a look at the scraped comments (the last few of them).
I'm a student (not in a computer-related major) who first picked up Python in the first half of this year and started learning to write crawlers over the summer break.
I'm still a beginner, so if there are mistakes in this post, corrections are welcome.
My approach to this crawler was a bit muddled at first, but it's fairly clear now.
The approach: use the requests library to fetch the HTML.
In the HTML, find the pagination mechanism and the total number of comments (the total lets me track crawling progress).
Then write another function that extracts the comment text itself.
One extra note: to paginate through Douban comments you have to find the partial URL of the next page inside the HTML; the short sketch below shows the idea.
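A minimal sketch of that pagination idea (the full version is the nextPageURL function further down; this assumes the next-page link is the last <a> tag inside the div with id="paginator" and that its href is a relative query string to append to the base comments URL):

from bs4 import BeautifulSoup

def next_fragment(html):
    # return something like "?start=20&limit=20&..." from the paginator, or None if there is no next page
    soup = BeautifulSoup(html, "html.parser")
    paginator = soup.find('div', attrs={'id': 'paginator'})
    links = paginator.find_all('a') if paginator else []
    return links[-1].get('href') if links else None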
It eventually worked, but when I opened the txt file I found that roughly 10% of the comments were mojibake, even though I had set up encoding handling and the file was written out as utf-8:
# page encoding
r.encoding = r.apparent_encoding
# writing the file
with open(fpath, 'a', encoding='utf-8') as f:
After digging through a lot of docs and posts online, I learned that the requests library is designed with the expectation that users set the encoding themselves, so I just set it directly:
r.encoding = 'utf-8'
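For reference, here is a minimal illustration of the two encoding knobs requests exposes (the URL is only a placeholder): r.encoding is guessed from the HTTP headers and controls how r.text decodes the raw bytes, while r.apparent_encoding is guessed from the body bytes themselves via chardet.

import requests

r = requests.get("https://example.com/some-page")    # placeholder URL
print(r.encoding)             # guessed from the response headers, often ISO-8859-1
print(r.apparent_encoding)    # guessed from the body bytes via chardet
r.encoding = 'utf-8'          # override the guess; r.text now decodes r.content as utf-8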
The problem that then showed up was that the pagination function kept raising errors because it couldn't find the right tags. After thinking it over, I settled on this: if the request is for comment text, decode as utf-8; if it's for pagination, keep r.encoding = r.apparent_encoding:
try:
    if flag == 1:    # the url passed in fetches comment info
        r.encoding = 'utf-8'    # try forcing the encoding
    elif flag == 2:
        r.encoding = r.apparent_encoding
    return r.text
except:
    return ""
With that, the garbled Chinese problem was solved.
Then, however, I started hitting captchas... and after a few more runs my IP seemed to get banned.
So I found an IP proxy site and set up proxy IPs, following the post Python爬虫技巧之设置代理IP.
But I overlooked one thing: you can't keep hitting that site to fetch IPs. Too many requests got my IP banned by the proxy site itself, and the run errored out at around 2000 of the 4000+ comments.
So I picked an arbitrary IP from that proxy site by hand, used it as a proxy to fetch the IP list page, and built a list from it (see get_ip_list and get_random_ip in the full code below):
url = 'http://www.xicidaili.com/nn/'
ipAgent = {
    'http': 'http://221.214.181.98:53281'
}
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
web_data = requests.get(url, headers=headers, proxies=ipAgent)
After that I could finally crawl in peace.
The full crawler code is below (two files). The header block was generated with a browser plugin, which is why it is so long but also very convenient; I picked the trick up from a Zhihu answer to Python爬虫传送post请求要携带哪些参数?
# First file
# CrawDoubanComments.py
# `from cons import headerChange` below imports my own helper file that returns a random user-agent
import requests
import os
from bs4 import BeautifulSoup
import bs4
from cons import headerChange
import random
def get_ip_list():
    # scrape the free proxy list from xicidaili and return entries as 'ip:port' strings
    url = 'http://www.xicidaili.com/nn/'
    ipAgent = {
        'http': 'http://221.214.181.98:53281'
    }
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
    }
    web_data = requests.get(url, headers=headers, proxies=ipAgent)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
def get_random_ip(ip_list):
    # pick one proxy at random and wrap it in the dict format requests expects
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
ipList = get_ip_list()
print(ipList)
def getHTMLTxt(url, flag):
    querystring = {"status": "P"}
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.8",
        'cache-control': "no-cache",
        'connection': "keep-alive",
        'cookie': "ll=\"118161\"; bid=hvtlFYn8N2g; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1502935063%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; __yadk_uid=4glVfZfkxqS5u5wHymY2aa0dEnqhZ1GL; ps=y; dbcl2=\"142710097:FLM/J8N4I1w\"; ck=12Nj; _vwo_uuid_v2=3474CBA32AC36EAA99D3D339FC358DBA|16192af0fbe498c7433958350c7be868; __utma=30149280.810866807.1502935064.1502935064.1502935064.1; __utmb=30149280.3.9.1502935124997; __utmc=30149280; __utmz=30149280.1502935064.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmv=30149280.14271; __utma=223695111.302797075.1502935064.1502935064.1502935064.1; __utmb=223695111.0.10.1502935064; __utmc=223695111; __utmz=223695111.1502935064.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); _pk_id.100001.4cf6=12c01e44b4266fee.1502935063.1.1502936044.1502935063.; _pk_ses.100001.4cf6=*; push_noty_num=0; push_doumail_num=0",
        'host': "movie.douban.com",
        'referer': "https://movie.douban.com/subject/26392671/",
        'upgrade-insecure-requests': "1",
        'user-agent': headerChange(),
        'postman-token': "e1715f99-6029-83ce-5574-3f4e6148b96c"
    }
    proxies = get_random_ip(ipList)
    r = requests.request("GET", url, headers=headers, params=querystring, proxies=proxies)
    try:
        if flag == 1:    # the url passed in fetches comment info
            r.encoding = 'utf-8'    # try forcing the encoding
        elif flag == 2:
            r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""
def getComments(url, cmt):    # collect one full page of comments
    count = 0
    html = getHTMLTxt(url, 1)
    soup = BeautifulSoup(html, "html.parser")
    for tagDiv in soup.find_all('div', attrs={'class': 'comment'}):
        if isinstance(tagDiv, bs4.element.Tag):
            infoComt = tagDiv.find('p')    # the comment text sits in the <p> tag
            cmt.append(infoComt.string)
            count = count + 1
    return count
def nextPageURL(urlPre):    # find the next page's url in the html returned for the previous page
    html = getHTMLTxt(urlPre, 2)
    try:
        soup = BeautifulSoup(html, "html.parser")
        divTag = soup.find('div', attrs={'id': 'paginator', 'class': 'center'})    # the div holding the next-page url
        aTag = divTag('a')    # the <a> tags carrying the links
        index = len(aTag)    # the next-page url sits in the last <a> tag
        return aTag[index - 1].get('href')
    except:
        return -1
def countOfComments(urlStart):    # get the total number of comments from the first page's html
    try:
        html = getHTMLTxt(urlStart, 1)
        soup = BeautifulSoup(html, "html.parser")
        # drill down through the tags to the element holding the total comment count
        liTag = soup.find('li', attrs={'class': 'is-active'})
        spanTag = liTag('span')
        strPage = spanTag[0].string
        return eval(strPage[-5:-1])    # this slice depends on how many digits the total has
    except:
        return 0
def main():
    urlStart = "https://movie.douban.com/subject/10477598/comments?"    # nothing should follow 'comments' here apart from the '?'
    depth = countOfComments(urlStart)
    # depth = 1000    # set the comment count by hand to limit the run time
    fpath = r"D:\PY\CrawDoubanComments\comments.txt"
    cmt = []
    count = 0
    if os.path.exists(fpath):
        os.remove(fpath)    # delete the old file at start so I don't have to do it by hand between test runs
    urlNext = ""
    print("Crawling comments")
    while count < depth and len(cmt) < depth:
        url = urlStart + urlNext
        count = count + getComments(url, cmt)
        urlNext = nextPageURL(url)
        if urlNext == -1:
            print("Stopped unexpectedly!")
            break
        else:
            print('\rCrawling comments, progress: {:.2f}%'.format((count * 100) / depth), end="")
    print("\nCrawl finished, saving...")
    for i in range(len(cmt)):
        try:
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(cmt[i] + '\n')
            print('\rSaving comments: {} of {}'.format(i + 1, len(cmt)), end="")
        except:
            continue
    print("\nComments saved!")
main()
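As the comment in countOfComments notes, eval(strPage[-5:-1]) only works while the total has exactly four digits. A more robust alternative (just a sketch, assuming the span text looks like "看过(4257)") is to pull the digits out with a regular expression:

import re

def parse_count(span_text):
    # extract the first run of digits, e.g. "看过(4257)" -> 4257
    match = re.search(r'\d+', span_text)
    return int(match.group()) if match else 0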
# Second file: picks a user-agent
# cons.py
import random

# several user-agent strings collected online; each request randomly picks one of them
headerstr = '''Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11
Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'''

def headerChange():
    # split on newlines and return one user-agent at random
    headerList = headerstr.split('\n')
    length = len(headerList)
    return headerList[random.randint(0, length - 1)]
Finally, the file that generates the word cloud:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from scipy.misc import imread
import jieba
from os import path

text = open("comments.txt", "rb").read()
# segment the text with jieba (full mode)
wordlist = jieba.cut(text, cut_all=True)
wl = " ".join(wordlist)
coloring = imread(path.join("罗小黑.png"))
# configure the word cloud
wc = WordCloud(background_color="white",    # background color
               mask=coloring,               # background image used as the mask
               max_words=10000,             # maximum number of words shown
               # stopwords="",              # stopword list
               font_path="C:\\Windows\\Fonts\\simhei.ttf",
               # a Chinese font has to be set or the cloud can't render Chinese
               # (the default font, DroidSansMono.ttf, has no Chinese glyphs)
               max_font_size=40,            # largest font size
               random_state=42,             # number of random states, i.e. color schemes
               )
myword = wc.generate(wl)    # generate the word cloud
image_colors = ImageColorGenerator(coloring)    # take the color scheme from the image
# show the word cloud
# plt.imshow(myword)    # random colors, not matched to the background image
# recolor the word cloud with the image's colors and show it
# (we could also pass color_func=image_colors directly in the constructor)
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
# also show the background image itself
plt.figure()
plt.imshow(coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
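One note on scipy.misc.imread: it has since been removed from newer SciPy releases, so if that import fails, the mask image can be loaded with PIL instead (a small sketch using the same 罗小黑.png mask as above):

import numpy as np
from PIL import Image

coloring = np.array(Image.open("罗小黑.png"))    # uint8 image array, a drop-in replacement for the imread call above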