Python爬虫与数据分析
- 目的
爬取网易云音乐歌曲热评,分析热评特征。
- 思路
(1)爬取华语歌单中所有歌单url
(2)从每篇歌单地址中爬取每首歌的url
(3)从每首歌的首页爬取热评
- 代码
(1) 爬取华语歌单中所有歌单url,从歌单url获取歌单中每首歌的歌名和id,并保存到文件music_163_02.csv。
import logging
import requests
from pyquery import PyQuery as pq
import pandas as pd
import random
import time
# headers需要填上,否则无法正常爬取
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
# 设置日志的格式、输出级别
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s: %(message)s')
def scrape_index(url):
response = requests.get(url,headers = headers )
logging.info('scrape index %s...',url) #不需要再url前加%,而是,
try:
if response.status_code == 200:
return parse_index(response.text) # 传到parse_index 方法中获取歌单url列表
else :
logging.error('invaild status is %s while scraping url %s', response.status_code, url)
except Exception:
logging.error('error occurred while scraping %s', url, exc_info=True) # exc_info=True:会将异常异常信息添加到日志消息中
def parse_index(html):
doc = pq(html) # 用pyquery进行解析
a = doc('#m-pl-container .dec .s-fc0') # #对应div .对应class
a1 = a.items() # 对于返回值是多个元素,然后对每个元素做处理,需要调用items方法,返回的generator类型,可以通过for 循环去取值
return a1
def scrape_detail(url):
response = requests.get(url,headers = headers )
logging.info('scraping detail %s...',url)
try:
if response.status_code == 200:
logging.info('detail url is succeed ')
return parse_detail(response.json()) # API获取的内容返回的是json格式
else:
logging.error('invaild status is %s while scraping url %s', response.status_code, url)
except Exception:
logging.error('error occurred while scraping %s', url, exc_info=True)
'''
热评获取API:http://music.163.com/api/v1/resource/comments/R_SO_4_{歌曲ID}?limit=20&offset=0
所以获取歌曲的ID就可以得到热评
'''
def parse_detail(html):
list_02 = []
jobs = html['result']['tracks']
for j in jobs:
dic ={}
dic['name'] = j['name'] # 创建 字典
dic['id'] = j['id']
list_02.append(dic