分享一个抓取网易云不同分类下歌曲热评的python爬虫,并且不怕被封ip(非使用代理)

这个爬虫是我在一门课程的作业中用到的,已经过去很久了,偶然想起就分享到这里。大部分代码是我写的,小部分由我同学完成。

在抓取评论的过程中被封ip是件令人头疼的事情。如果你是宽带用户,并且使用动态ip,被封ip后直接调用Windows的cmd命令(rasdial)重新拨号连接,就可以获得新的ip。下面是python代码:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import urllib.request
import json
from bs4 import BeautifulSoup
import os
import time
#根据分类获取下面的歌单id
def get_all_playlistInCategory(cateName):
    """Return the ids of the playlists listed under one category.

    Requests the playlist discovery page with ``cat=cateName`` and
    ``order=hot`` and collects the numeric id of every playlist link
    found inside a ``<p class="dec">`` element.

    Args:
        cateName: Category name sent as the ``cat`` query parameter.

    Returns:
        A list of playlist ids (strings of digits) in page order.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by decoding
        the body as UTF-8 propagates to the caller.
    """
    params = urllib.parse.urlencode({'cat': cateName, 'order': 'hot'})
    url = "https://music.163.com/discover/playlist/?" + params
    # NOTE(review): User-Agent and Cookie are hard-coded values,
    # presumably captured from a browser session -- confirm they are
    # still accepted by the site.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=433d60a9e9521389c81dff82459a7fb3,1528694258228; _ntes_nuid=433d60a9e9521389c81dff82459a7fb3; WM_TID=9jk%2FqW560nZjSbz%2Fq%2FFACrhPWF98XZVg; __utmz=94650624.1529327659.8.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; mail_psc_fingerprint=c44a71802306fb2fc41a05e22d33d555; __f_=1529330844746; __utmc=94650624; JSESSIONID-WYYY=4pq1Unz3u7Wo9sy4stJXWgNtjl4Ja76WoEnKwgZnNH16bYapNYRU9yfB3MkC6d3o16hueqD318OBcRcSJfnoQqc19R%2BnrFn1n2wBgjd7ObSFftB%5CdrW2pK3Feo9X6O8%5CCvE6oVJSq3E92nwpFDuVkBEvwFX33wb%5CCszGA85CZhM%5C9p2b%3A1529415671794; __utma=94650624.678937187.1528694258.1529410392.1529414020.10; __utmb=94650624.15.10.1529414020',
    }
    request = urllib.request.Request(url=url, headers=header)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf8')

    # Name the parser explicitly so the parse result does not depend on
    # which optional parser packages happen to be installed.
    bs = BeautifulSoup(html, "html.parser")

    # Read each link's href attribute directly instead of matching a
    # regex against the serialized markup, which only worked for one
    # attribute order and for titles without line breaks.
    href_pattern = re.compile(r'/playlist\?id=(\d+)')
    playlist_ids = []
    for link in bs.select("p[class='dec'] a"):
        matched = href_pattern.fullmatch(link.get('href', ''))
        if matched:
            playlist_ids.append(matched.group(1))
    return playlist_ids


#根据歌单id获取下面的歌曲id
def get_all_songInPlaylist(playlist_id):
    """Return the names and ids of the songs listed on a playlist page.

    Fetches ``https://music.163.com/playlist?id=<playlist_id>`` and reads
    every song link inside the ``<ul class="f-hide">`` element.

    Args:
        playlist_id: Playlist id as a string; it is concatenated into the
            request URL.

    Returns:
        A tuple ``(hot_song_name, hot_song_id)`` of two lists of strings
        with the same length; entry ``k`` of each list describes the same
        song.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by decoding
        the body as UTF-8 propagates to the caller.
    """
    url = 'https://music.163.com/playlist?id=' + playlist_id
    # NOTE(review): User-Agent and Cookie are hard-coded values,
    # presumably captured from a browser session -- confirm they are
    # still accepted by the site.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=433d60a9e9521389c81dff82459a7fb3,1528694258228; _ntes_nuid=433d60a9e9521389c81dff82459a7fb3; WM_TID=9jk%2FqW560nZjSbz%2Fq%2FFACrhPWF98XZVg; __utmz=94650624.1529327659.8.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; mail_psc_fingerprint=c44a71802306fb2fc41a05e22d33d555; __f_=1529330844746; __utmc=94650624; __utma=94650624.678937187.1528694258.1529410392.1529414020.10; JSESSIONID-WYYY=aGpbUZjpK86fHy9gb0%2BTjncv2IU1WcYpY4kdm%2BEZkb6F5zEwBuId04hKxUFmDM0qkRH6IIppMxA2WHyS9%2FuuzA1745NxVX8uFcd%2FYVsGur3wBYdZ%2F%2FZNir%2Ft1hVU0aBwHt4c33rosmPf%2FO%2F%5CC2g8jMM6WhzC3Bw%5COjPdCX9kZpTVQiXc%3A1529417412814; __utmb=94650624.25.10.1529414020',
    }
    request = urllib.request.Request(url=url, headers=header)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf8')

    # Name the parser explicitly so the parse result does not depend on
    # which optional parser packages happen to be installed.
    bs = BeautifulSoup(html, "html.parser")

    # Take the name from the link text and the id from the href
    # attribute. Matching a regex against the serialized markup returned
    # names with '&', '<' and '>' entity-escaped, and needed two separate
    # passes to build the two lists.
    href_pattern = re.compile(r'/song\?id=(\d+)')
    hot_song_name = []
    hot_song_id = []
    for link in bs.select("ul[class='f-hide'] li a"):
        matched = href_pattern.fullmatch(link.get('href', ''))
        if matched:
            hot_song_name.append(link.get_text())
            hot_song_id.append(matched.group(1))
    return hot_song_name, hot_song_id


def get_hotComments(hot_song_name,hot_song_id,comment_num,catName,songids_list):
    """Append one song's hot comments to ``./song_comments.txt``.

    Each comment is written as one line of the form
    ``category||song name||content||liked count||time``.

    Args:
        hot_song_name: Song name written into every output line.
        hot_song_id: Song id as a string; used in the request URL and for
            duplicate detection.
        comment_num: Running count of comments written so far.
        catName: Category name written into every output line.
        songids_list: List of song ids already attempted. It is mutated:
            ``hot_song_id`` is appended when the song has not been seen.

    Returns:
        The updated running count. It equals ``comment_num`` when the
        song is a duplicate or has fewer than 999 comments in total.

    Raises:
        ConnectionResetError: If the response JSON has no ``total`` key.
        Any exception raised by ``urllib.request.urlopen``, by JSON
        decoding, or by writing the file propagates to the caller.
    """
    if hot_song_id in songids_list:
        print("遇到重复歌曲...跳过")
        return comment_num
    # The id is recorded before the request is made, so a song whose
    # request fails is still treated as a duplicate afterwards.
    songids_list.append(hot_song_id)

    url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + hot_song_id + '?csrf_token='
    # NOTE(review): User-Agent and Cookie are hard-coded values,
    # presumably captured from a browser session -- confirm they are
    # still accepted by the site.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=433d60a9e9521389c81dff82459a7fb3,1528694258228; _ntes_nuid=433d60a9e9521389c81dff82459a7fb3; WM_TID=9jk%2FqW560nZjSbz%2Fq%2FFACrhPWF98XZVg; __utmz=94650624.1529327659.8.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; mail_psc_fingerprint=c44a71802306fb2fc41a05e22d33d555; __f_=1529330844746; __utmc=94650624; __utma=94650624.678937187.1528694258.1529410392.1529414020.10; JSESSIONID-WYYY=aGpbUZjpK86fHy9gb0%2BTjncv2IU1WcYpY4kdm%2BEZkb6F5zEwBuId04hKxUFmDM0qkRH6IIppMxA2WHyS9%2FuuzA1745NxVX8uFcd%2FYVsGur3wBYdZ%2F%2FZNir%2Ft1hVU0aBwHt4c33rosmPf%2FO%2F%5CC2g8jMM6WhzC3Bw%5COjPdCX9kZpTVQiXc%3A1529417412814; __utmb=94650624.29.10.1529414020',
    }
    # Form fields sent in the POST body. The same fixed values are sent
    # for every song.
    data = {
        'params': 'zC7fzWBKxxsm6TZ3PiRjd056g9iGHtbtc8vjTpBXshKIboaPnUyAXKze+KNi9QiEz/IieyRnZfNztp7yvTFyBXOlVQP/JdYNZw2+GRQDg7grOR2ZjroqoOU2z0TNhy+qDHKSV8ZXOnxUF93w3DA51ADDQHB0IngL+v6N8KthdVZeZBe0d3EsUFS8ZJltNRUJ',
        'encSecKey': '4801507e42c326dfc6b50539395a4fe417594f7cf122cf3d061d1447372ba3aa804541a8ae3b3811c081eb0f2b71827850af59af411a10a1795f7a16a5189d163bc9f67b3d1907f5e6fac652f7ef66e5a1f12d6949be851fcf4f39a0c2379580a040dc53b306d5c807bf313cc0e8f39bf7d35de691c497cda1d436b808549acc',
    }
    postdata = urllib.parse.urlencode(data).encode('utf8')
    request = urllib.request.Request(url, headers=header, data=postdata)
    # Close the response on every path, including the early return and
    # the raise below.
    with urllib.request.urlopen(request) as response:
        json_dict = json.loads(response.read().decode('utf8'))

    # A payload without 'total' is reported to the caller as a
    # connection reset.
    if 'total' not in json_dict:
        raise ConnectionResetError()
    # Skip songs with fewer than 999 comments in total.
    if int(json_dict['total']) < 999:
        print("评论少于999跳过...")
        return comment_num

    hot_comments = json_dict['hotComments']
    # 'with' guarantees the file is closed even if a write raises.
    with open('./song_comments.txt', 'a', encoding='utf-8') as fhandle:
        for item in hot_comments:
            liked_count = str(item['likedCount'])
            posted_at = str(item['time'])
            # Trim surrounding whitespace and remove line breaks so each
            # comment stays on a single output line.
            content = item['content'].strip().replace("\n", "")
            # Fields: category, song name, content, liked count, time,
            # separated by "||".
            fhandle.write(catName+'||'+hot_song_name+'||'+content+"||"+liked_count+"||"+posted_at+"\n")
            comment_num = comment_num + 1
    print("comment_num:", comment_num)
    return comment_num


# Windows commands used to reconnect the broadband connection:
#   rasdial <connection name> <account> <password>
#   rasdial <connection name> /disconnect

# Main program
count = 0  # number of songs whose request completed without a ban
# Categories to scrape
cat_list = ["流行","摇滚","民谣","电子","轻音乐", "乡村","古典","金属","朋克","另类/独立", "古风","后摇","英伦","New Age"]
songids_list = []  # song ids already attempted, shared across categories
# Walk every category
for catName in cat_list:
    comment_num_inCat = 0
    playlist_ids = get_all_playlistInCategory(catName)
    # Walk every playlist in the category
    for playlist_id in playlist_ids:
        hot_song_name, hot_song_id = get_all_songInPlaylist(playlist_id)
        # Walk every song in the playlist
        for position, (song_name, song_id) in enumerate(zip(hot_song_name, hot_song_id), start=1):
            print('正在抓取歌单id为',playlist_id,'中的第',position,'首歌曲热评...')
            try:
                comment_num_inCat = get_hotComments(song_name, song_id, comment_num_inCat, catName, songids_list)
            except ConnectionResetError:
                print("ip被禁止...尝试重新连接宽带得到新的ip")
                os.system("rasdial 宽带连接 /disconnect")
                # Wait 3 seconds after disconnecting before redialling.
                time.sleep(3)
                # Replace the placeholders with the real account and
                # password before running.
                os.system("rasdial 宽带连接 [账户] [密码]")
            else:
                # Report success and count the song only when the call
                # did not raise; previously both also ran after a ban.
                print('执行成功...')
                count = count + 1
            if comment_num_inCat >= 10000:
                break
        if comment_num_inCat >= 10000:
            print("分类:",catName,"下评论已经达到10000,停止抓取....")
            break

print("共抓取歌数量为",count)

 

评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值