这个爬虫是我在一门课程的作业中用到的,已经过去很久了,偶然想起就分享到这里吧。大部分代码是我写的,小部分由我同学完成。
在抓取评论的过程中被封ip是个令人头疼的事情。如果你是宽带用户,并且是动态ip,被封ip后直接调用Windows的cmd命令重新拨号连接就可以获得新的ip。下面是python代码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import re
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
def get_all_playlistInCategory(cateName):
    """Return the ids of the playlists listed on one category page.

    Requests the playlist discovery page for ``cateName`` (ordered by
    ``hot``) and collects the numeric id from every link inside a
    ``<p class="dec">`` element whose ``href`` is ``/playlist?id=<digits>``.

    Args:
        cateName: Category name, sent as the ``cat`` query parameter.

    Returns:
        A list of playlist id strings in page order; empty when the page
        contains no matching links.
    """
    params = urllib.parse.urlencode({'cat': cateName, 'order': 'hot'})
    url = "https://music.163.com/discover/playlist/?" + params
    # Request headers: a browser User-Agent plus a captured session cookie.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=433d60a9e9521389c81dff82459a7fb3,1528694258228; _ntes_nuid=433d60a9e9521389c81dff82459a7fb3; WM_TID=9jk%2FqW560nZjSbz%2Fq%2FFACrhPWF98XZVg; __utmz=94650624.1529327659.8.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; mail_psc_fingerprint=c44a71802306fb2fc41a05e22d33d555; __f_=1529330844746; __utmc=94650624; JSESSIONID-WYYY=4pq1Unz3u7Wo9sy4stJXWgNtjl4Ja76WoEnKwgZnNH16bYapNYRU9yfB3MkC6d3o16hueqD318OBcRcSJfnoQqc19R%2BnrFn1n2wBgjd7ObSFftB%5CdrW2pK3Feo9X6O8%5CCvE6oVJSq3E92nwpFDuVkBEvwFX33wb%5CCszGA85CZhM%5C9p2b%3A1529415671794; __utma=94650624.678937187.1528694258.1529410392.1529414020.10; __utmb=94650624.15.10.1529414020',
    }
    request = urllib.request.Request(url=url, headers=header)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf8')

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bs = BeautifulSoup(html, 'html.parser')

    # Read the id from each link's href instead of regex-matching the
    # serialized tag list, which depended on attribute order.
    playlist_ids = []
    for link in bs.select("p[class='dec'] a"):
        match = re.fullmatch(r'/playlist\?id=(\d+)', link.get('href', ''))
        if match:
            playlist_ids.append(match.group(1))
    return playlist_ids
def get_all_songInPlaylist(playlist_id):
    """Return the names and ids of the songs listed on a playlist page.

    Requests the playlist page for ``playlist_id`` and walks every link
    under ``<ul class="f-hide"> <li>`` whose ``href`` is
    ``/song?id=<digits>``.

    Args:
        playlist_id: Playlist id as a string (it is concatenated into the
            URL).

    Returns:
        A tuple ``(hot_song_name, hot_song_id)`` of two equally long lists:
        the link texts and the matching id strings, in page order.
    """
    url = 'https://music.163.com/playlist?id=' + playlist_id
    # Request headers: a browser User-Agent plus a captured session cookie.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=433d60a9e9521389c81dff82459a7fb3,1528694258228; _ntes_nuid=433d60a9e9521389c81dff82459a7fb3; WM_TID=9jk%2FqW560nZjSbz%2Fq%2FFACrhPWF98XZVg; __utmz=94650624.1529327659.8.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; mail_psc_fingerprint=c44a71802306fb2fc41a05e22d33d555; __f_=1529330844746; __utmc=94650624; __utma=94650624.678937187.1528694258.1529410392.1529414020.10; JSESSIONID-WYYY=aGpbUZjpK86fHy9gb0%2BTjncv2IU1WcYpY4kdm%2BEZkb6F5zEwBuId04hKxUFmDM0qkRH6IIppMxA2WHyS9%2FuuzA1745NxVX8uFcd%2FYVsGur3wBYdZ%2F%2FZNir%2Ft1hVU0aBwHt4c33rosmPf%2FO%2F%5CC2g8jMM6WhzC3Bw%5COjPdCX9kZpTVQiXc%3A1529417412814; __utmb=94650624.25.10.1529414020',
    }
    request = urllib.request.Request(url=url, headers=header)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf8')

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bs = BeautifulSoup(html, 'html.parser')

    # Take the name and the id from the same tag so the two lists can never
    # drift out of step, and so names are plain text rather than
    # HTML-escaped markup.
    hot_song_name = []
    hot_song_id = []
    for link in bs.select("ul[class='f-hide'] li a"):
        match = re.fullmatch(r'/song\?id=(\d+)', link.get('href', ''))
        if match:
            hot_song_name.append(link.get_text())
            hot_song_id.append(match.group(1))
    return hot_song_name, hot_song_id
def get_hotComments(hot_song_name, hot_song_id, comment_num, catName, songids_list):
    """Append one song's hot comments to ``./song_comments.txt``.

    Songs whose id is already in ``songids_list`` are skipped; otherwise the
    id is recorded there and the comment endpoint is queried with a fixed
    form payload. Songs reporting fewer than 999 total comments are skipped.
    Each hot comment becomes one line of the form
    ``category||song name||content||liked count||time``.

    Args:
        hot_song_name: Song title written into every output line.
        hot_song_id: Song id as a string (it is concatenated into the URL).
        comment_num: Number of comments collected so far.
        catName: Category label written into every output line.
        songids_list: List of ids already handled; mutated in place.

    Returns:
        ``comment_num`` increased by the number of comments written
        (unchanged when the song is skipped).

    Raises:
        ConnectionResetError: The JSON reply has no ``total`` field; the
            caller treats this as the signal to reconnect.
    """
    if hot_song_id in songids_list:
        print("遇到重复歌曲...跳过")
        return comment_num
    songids_list.append(hot_song_id)

    url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + hot_song_id + '?csrf_token='
    # Request headers: a browser User-Agent plus a captured session cookie.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=433d60a9e9521389c81dff82459a7fb3,1528694258228; _ntes_nuid=433d60a9e9521389c81dff82459a7fb3; WM_TID=9jk%2FqW560nZjSbz%2Fq%2FFACrhPWF98XZVg; __utmz=94650624.1529327659.8.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; mail_psc_fingerprint=c44a71802306fb2fc41a05e22d33d555; __f_=1529330844746; __utmc=94650624; __utma=94650624.678937187.1528694258.1529410392.1529414020.10; JSESSIONID-WYYY=aGpbUZjpK86fHy9gb0%2BTjncv2IU1WcYpY4kdm%2BEZkb6F5zEwBuId04hKxUFmDM0qkRH6IIppMxA2WHyS9%2FuuzA1745NxVX8uFcd%2FYVsGur3wBYdZ%2F%2FZNir%2Ft1hVU0aBwHt4c33rosmPf%2FO%2F%5CC2g8jMM6WhzC3Bw%5COjPdCX9kZpTVQiXc%3A1529417412814; __utmb=94650624.29.10.1529414020',
    }
    # Form fields sent with the POST request.
    data = {
        'params': 'zC7fzWBKxxsm6TZ3PiRjd056g9iGHtbtc8vjTpBXshKIboaPnUyAXKze+KNi9QiEz/IieyRnZfNztp7yvTFyBXOlVQP/JdYNZw2+GRQDg7grOR2ZjroqoOU2z0TNhy+qDHKSV8ZXOnxUF93w3DA51ADDQHB0IngL+v6N8KthdVZeZBe0d3EsUFS8ZJltNRUJ',
        'encSecKey': '4801507e42c326dfc6b50539395a4fe417594f7cf122cf3d061d1447372ba3aa804541a8ae3b3811c081eb0f2b71827850af59af411a10a1795f7a16a5189d163bc9f67b3d1907f5e6fac652f7ef66e5a1f12d6949be851fcf4f39a0c2379580a040dc53b306d5c807bf313cc0e8f39bf7d35de691c497cda1d436b808549acc',
    }
    postdata = urllib.parse.urlencode(data).encode('utf8')
    request = urllib.request.Request(url, headers=header, data=postdata)
    # Close the connection on every path, including the raise and the
    # early return below.
    with urllib.request.urlopen(request) as response:
        json_dict = json.loads(response.read().decode('utf8'))

    if 'total' not in json_dict:
        raise ConnectionResetError("comment response has no 'total' field")
    # Skip songs with fewer than 999 comments.
    if int(json_dict['total']) < 999:
        print("评论少于999跳过...")
        return comment_num

    # A reply without hot comments simply writes nothing.
    hot_comments = json_dict.get('hotComments') or []
    with open('./song_comments.txt', 'a', encoding='utf-8') as fhandle:
        for item in hot_comments:
            liked_count = str(item['likedCount'])
            # Named so it does not shadow the ``time`` module.
            timestamp = str(item['time'])
            # One record per line: trim the ends and drop embedded line
            # breaks (both "\r" and "\n").
            content = item['content'].strip().replace("\r", "").replace("\n", "")
            # category || song name || content || liked count || time
            fields = [catName, hot_song_name, content, liked_count, timestamp]
            fhandle.write('||'.join(fields) + "\n")
            comment_num = comment_num + 1
    print("comment_num:", comment_num)
    return comment_num
# Broadband reconnect commands (Windows):
#   rasdial <connection name> <account> <password>
#   rasdial <connection name> /disconnect

# Main program
count = 0  # number of songs processed without triggering a reconnect
# Categories to crawl
cat_list = ["流行", "摇滚", "民谣", "电子", "轻音乐", "乡村", "古典", "金属", "朋克", "另类/独立", "古风", "后摇", "英伦", "New Age"]
songids_list = []  # song ids already handled, shared across all categories
# Walk every category
for catName in cat_list:
    comment_num_inCat = 0
    playlist_ids = get_all_playlistInCategory(catName)
    # Walk every playlist in the category
    for playlist_id in playlist_ids:
        hot_song_name, hot_song_id = get_all_songInPlaylist(playlist_id)
        # Walk every song in the playlist; zip stops at the shorter list, so
        # a name/id length mismatch cannot raise IndexError.
        for j, (song_name, song_id) in enumerate(zip(hot_song_name, hot_song_id), start=1):
            print('正在抓取歌单id为', playlist_id, '中的第', j, '首歌曲热评...')
            try:
                comment_num_inCat = get_hotComments(song_name, song_id, comment_num_inCat, catName, songids_list)
            except ConnectionResetError:
                print("ip被禁止...尝试重新连接宽带得到新的ip")
                os.system("rasdial 宽带连接 /disconnect")
                # Wait 3 seconds after disconnecting
                time.sleep(3)
                # Fill in the account and password placeholders as needed
                os.system("rasdial 宽带连接 [账户] [密码]")
            else:
                # Report success and count the song only when no reconnect
                # was needed.
                print('执行成功...')
                count = count + 1
            if comment_num_inCat >= 10000:
                break
        if comment_num_inCat >= 10000:
            print("分类:", catName, "下评论已经达到10000,停止抓取....")
            break
print("共抓取歌数量为", count)