I. Experiment Topic
Final project: crawling NetEase Cloud Music playlists
Crawl any web page you are interested in, collect at least 500 records, and perform data cleaning, data statistics, and data processing on them.
II. Purpose and Requirements
Final project:
1. Purpose: master the basic syntax of web crawlers.
2. Requirements: master page navigation, data storage, and the use of Scrapy, XPath, etc. (see the short XPath example below).
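Since XPath is the core requirement, here is a minimal standalone sketch of the selector pattern used throughout the spider; the HTML fragment is made up for illustration only:
from scrapy import Selector

# toy fragment mimicking the playlist listing markup (illustrative only)
html = "<ul id='m-pl-container'><li><div><a class='msk' href='/playlist?id=1'></a></div></li></ul>"
sel = Selector(text=html)
# same ./div/a[@class='msk']/@href pattern the spider below relies on
print(sel.xpath("//ul[@id='m-pl-container']/li/div/a[@class='msk']/@href").extract_first())
# prints: /playlist?id=1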
III. Experiment Content
- Spider code (MusicList spider):
import scrapy
import logging
# deepcopy is crucial: it keeps the playlist items saved to the pipeline from being
# scrambled or duplicated when several pages are crawled concurrently
from copy import deepcopy
from ..items import WyymusicItem

logger = logging.getLogger(__name__)
class MusicListSpider(scrapy.Spider):
    name = "MusicList"  # the name attribute is mandatory; it is also referenced in pipelines.py
    allowed_domains = ["music.163.com"]  # restrict the crawl to this domain
    start_urls = ["https://music.163.com/discover/playlist"]  # first page of the playlist listing
    offset = 0  # page counter recording which listing page is currently being crawled
    def parse(self, response):
        # use XPath to pull the needed information out of the HTML page:
        # collect all playlists on the current listing page into liList
        liList = response.xpath("//div[@id='m-disc-pl-c']/div/ul[@id='m-pl-container']/li")
        # visit each playlist in liList to fetch its detail page
        for li in liList:
            itemML = WyymusicItem()
            a_href = li.xpath("./div/a[@class='msk']/@href").extract_first()
            itemML["SongsListID"] = a_href[13:]  # strip the "/playlist?id=" prefix
            # build the absolute URL of the playlist detail page
            Url = "https://music.163.com" + a_href
            itemML["Url"] = Url
            # let SongsListPageParse extract the details of this playlist
            yield scrapy.Request(Url, callback=self.SongsListPageParse, meta={"itemML": deepcopy(itemML)})
        # move on to the next listing page (38 pages of 35 playlists each, well over 500 records)
        if self.offset < 37:
            self.offset += 1
            # URL of the next listing page
            nextpage_a_url = ("https://music.163.com/discover/playlist/"
                              "?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=" + str(self.offset * 35))
            print(self.offset, nextpage_a_url)
            print("crawling the next page")
            yield scrapy.Request(nextpage_a_url, callback=self.parse)
    # parse the detail page of a single playlist
    def SongsListPageParse(self, response):
        cntc = response.xpath("//div[@class='cntc']")
        itemML = response.meta["itemML"]
        SongListName = cntc.xpath("./div[@class='hd f-cb']/div/h2//text()").extract_first()
        itemML["SongListName"] = SongListName  # playlist name
        user_url = cntc.xpath("./div[@class='user f-cb']/span[@class='name']/a/@href").extract_first()
        user_id = user_url[14:]  # strip the "/user/home?id=" prefix
        itemML["AuthorID"] = user_id  # creator's user id
        time = cntc.xpath("./div[@class='user f-cb']/span[@class='time s-fc4']/text()").extract_first()
        itemML["CreationDate"] = time[0:10]  # creation date of the playlist
        aList = cntc.xpath("./div[@id='content-operation']/a")
        Collection = aList[2].xpath("./@data-count").extract_first()
        itemML["Collection"] = Collection  # number of favourites
        Forwarding = aList[3].xpath("./@data-count").extract_first()
        itemML["Forwarding"] = Forwarding  # number of shares
        Comment = aList[5].xpath("./i/span[@id='cnt_comment_count']/text()").extract_first()
        itemML["Comment"] = Comment  # number of comments
        # concatenate all tag names into one space-separated string
        tags = ""
        tagList = cntc.xpath("./div[@class='tags f-cb']/a")
        for a in tagList:
            tags = tags + a.xpath("./i/text()").extract_first() + " "
        itemML["Labels"] = tags
        songtbList = response.xpath("//div[@class='n-songtb']/div")
        NumberOfSongs = songtbList[0].xpath(
            "./span[@class='sub s-fc3']/span[@id='playlist-track-count']/text()").extract_first()
        itemML["NumberOfSongs"] = NumberOfSongs  # number of tracks
        AmountOfPlay = songtbList[0].xpath(
            "./div[@class='more s-fc3']/strong[@id='play-count']/text()").extract_first()
        itemML["AmountOfPlay"] = AmountOfPlay  # play count
        yield itemML  # hand the crawled item over to pipelines.py
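The spider imports WyymusicItem from items.py, which is not listed above. A minimal sketch of that item class, assuming it simply declares one Field per attribute used by the spider and the pipeline:
# items.py (sketch)
import scrapy

class WyymusicItem(scrapy.Item):
    SongsListID = scrapy.Field()
    SongListName = scrapy.Field()
    Url = scrapy.Field()
    AuthorID = scrapy.Field()
    CreationDate = scrapy.Field()
    Collection = scrapy.Field()
    Forwarding = scrapy.Field()
    Comment = scrapy.Field()
    Labels = scrapy.Field()
    NumberOfSongs = scrapy.Field()
    AmountOfPlay = scrapy.Field()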
- Pipeline code (pipelines.py):
import csv
import os
import logging

logger = logging.getLogger(__name__)
class WyymusicPipeline:
    def __init__(self):
        store_file = os.path.dirname(__file__) + '/Musiclist.csv'
        self.file = open(store_file, 'a+', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file, dialect='excel')
        self.writer.writerow(['id号', '歌单名', '播放量', '标签名', '收藏量', '转发量', '评论量', '歌曲数量', '创建日期', '作者'])

    # the engine calls process_item every time it forwards an item; itemML is that forwarded item
    def process_item(self, itemML, spider):
        logger.warning('storing item ...')
        logger.warning(itemML)
        if itemML['SongListName']:
            data = [itemML['SongsListID'], itemML['SongListName'], itemML['AmountOfPlay'], itemML['Labels'],
                    itemML['Collection'], itemML['Forwarding'], itemML['Comment'],
                    itemML['NumberOfSongs'], itemML['CreationDate'], itemML['AuthorID']]
            self.writer.writerow(data)
        return itemML

    def close_spider(self, spider):
        self.file.close()
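For Scrapy to actually call this pipeline, it has to be registered in the project's settings.py. A minimal sketch, assuming the project package is named wyymusic (the actual package name may differ):
# settings.py (sketch)
ITEM_PIPELINES = {
    "wyymusic.pipelines.WyymusicPipeline": 300,  # priority 300; lower values run earlier
}
ROBOTSTXT_OBEY = False  # assumption: the site's robots.txt would otherwise block these pages
The crawl is then started from the project directory with: scrapy crawl MusicList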
- Data-cleaning code (pandas):
import pandas as pd

# pd.set_option('display.max_columns', None)
df = pd.read_csv('Musiclist.csv', encoding='utf-8')
# sort playlists by play count, descending
f = df.sort_values(by=["播放量"], ascending=False)
# rows whose 评论量 column holds the placeholder text '评论' get the value 0
f['评论量'] = f['评论量'].replace('评论', 0)
f.to_csv('list.csv', encoding='utf-8', index=False)
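Since the assignment also asks for data statistics, a short sketch of a follow-up analysis on the cleaned file; the column names match those written by the pipeline, and the tag split assumes the space-separated Labels string built in the spider:
import pandas as pd

df = pd.read_csv('list.csv', encoding='utf-8')
# summary statistics of the play counts
print(df['播放量'].describe())
# ten most frequent playlist tags (tags were joined with spaces in the spider)
print(df['标签名'].str.split().explode().value_counts().head(10))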
- Screenshots of the results:
Figure 1: raw data
Figure 2: raw data
Figure 3: data after cleaning
Figure 4: data after cleaning
Figure 5