Web Scraping & Data Analysis
Runtime environment: Python 3.6
For clarity, the code that scrapes NetEase Cloud Music hot comments is split into two parts:
1. First, scrape the URL of every song in each playlist and export the results to music1_01.csv (see the usage sketch after parse_index below)
2. Then, scrape the hot-comment data for each song and export it to hotCommets_01.csv
- music_01.ipynb
import logging
import requests
from pyquery import PyQuery as pq
import pandas as pd
import random
import time
# The headers must be set; otherwise the site rejects the request and scraping fails
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
# Configure the logging format and output level
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
def scrape_index(url):
    logging.info('scraping index %s...', url)  # pass url as a comma-separated argument; no need for % string formatting
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return parse_index(response.text)  # hand the HTML to parse_index, which extracts the playlist URL list
        else:
            logging.error('invalid status %s while scraping url %s', response.status_code, url)
    except Exception:
        logging.error('error occurred while scraping %s', url, exc_info=True)  # exc_info=True appends the exception traceback to the log message
def parse_index(html):
    doc = pq(html)  # parse the HTML with pyquery
    a = doc('#m-pl-container .dec .s-fc0')  # in CSS selectors, '#' matches an id and '.' matches a class
    return a.items()  # items() wraps a multi-element result in a generator, so each element can be handled in a for loop
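As a usage illustration, here is a minimal sketch of how the generator returned by parse_index might be consumed and the collected links exported with pandas, completing step 1. The playlist page URL, the column names, and the to_csv options are assumptions for illustration, not taken from the notebook.
def export_index(url):
    # Hypothetical helper: iterate the PyQuery generator and collect each link's text and href
    rows = []
    for item in scrape_index(url) or []:  # scrape_index returns None on failure, so fall back to an empty list
        rows.append({'title': item.text(), 'url': item.attr('href')})
    pd.DataFrame(rows).to_csv('music1_01.csv', index=False, encoding='utf-8-sig')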
def scrape_detail(url):
    logging.info('scraping detail %s...', url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            logging.info('detail url succeeded')
            return parse_detail(response.json())  # the comment API responds with JSON
        else:
            logging.error('invalid status %s while scraping url %s', response.status_code, url)
    except Exception:
        logging.error('error occurred while scraping %s', url, exc_info=True)
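The random and time imports at the top are not used in the code shown so far; most likely they space out the requests. A hedged sketch of that throttling pattern follows; detail_urls and the 1-3 second delay range are assumptions for illustration.
# Hypothetical throttled crawl; detail_urls is assumed to be a list of comment-API URLs
for url in detail_urls:
    data = scrape_detail(url)
    time.sleep(random.uniform(1, 3))  # random pause between requests to avoid hammering the server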
'''
Hot-comment API: http://music.163.com/api/v1/resource/comments/R_SO_4_{song ID}?limit=20&offset=0
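For example, the request URL for one song could be assembled like this (the song ID 186016 is made up for illustration):
    song_id = 186016
    url = f'http://music.163.com/api/v1/resource/comments/R_SO_4_{song_id}?limit=20&offset=0'
limit sets the page size and offset skips past newer comments, so limit=20&offset=20 would return comments 21-40.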