NetEase Cloud Music Playlist Scraping

     Experiment Topic

Final project: scraping NetEase Cloud Music playlists.

Pick any web page you are interested in, scrape at least 500 records from it, and perform data cleaning, statistics, and further processing.

     Purpose and Requirements

Final project

1. Purpose: master the basics of writing a crawler.

2. Requirements: master page navigation, data storage, and the use of Scrapy and XPath.

     Experiment Content

  1. Code

# spiders/MusicList.py
import scrapy
import logging

# deepcopy is essential here: each Request carries its own copy of the item,
# so playlist records arriving at the pipeline from concurrently crawled
# pages do not get mixed up or duplicated
from copy import deepcopy

from ..items import WyymusicItem

logger = logging.getLogger(__name__)


class MusicListSpider(scrapy.Spider):
    name = "MusicList"  # the name attribute is required (it is also referenced in pipelines.py)
    allowed_domains = ["music.163.com"]  # restrict the crawl to this domain
    start_urls = ["https://music.163.com/discover/playlist"]  # first page of the playlist index
    offset = 0  # page counter used for pagination

    def parse(self, response):
        # parse the playlist index page with XPath:
        # collect every playlist <li> on the current page
        liList = response.xpath("//div[@id='m-disc-pl-c']/div/ul[@id='m-pl-container']/li")
        # visit each playlist's detail page in turn
        for li in liList:
            itemML = WyymusicItem()
            a_href = li.xpath("./div/a[@class='msk']/@href").extract_first()
            itemML["SongsListID"] = a_href[13:]  # strip the "/playlist?id=" prefix
            # absolute URL of the playlist detail page
            Url = "https://music.163.com" + a_href
            itemML["Url"] = Url
            # hand the detail page to SongsListPageParse, passing a deep copy of the item
            yield scrapy.Request(Url, callback=self.SongsListPageParse, meta={"itemML": deepcopy(itemML)})

        # crawl the next page: 38 pages x 35 playlists is well over the required 500 records
        if self.offset < 37:
            self.offset += 1
            # build the URL of the next index page
            nextpage_a_url = ("https://music.163.com/discover/playlist/"
                              "?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=" + str(self.offset * 35))
            print(self.offset, nextpage_a_url)
            yield scrapy.Request(nextpage_a_url, callback=self.parse)
            print("moving on to the next page")

    # parse the detail page of a single playlist
    def SongsListPageParse(self, response):
        cntc = response.xpath("//div[@class='cntc']")
        itemML = response.meta["itemML"]
        SongListName = cntc.xpath("./div[@class='hd f-cb']/div/h2//text()").extract_first()
        itemML["SongListName"] = SongListName  # playlist name
        user_url = cntc.xpath("./div[@class='user f-cb']/span[@class='name']/a/@href").extract_first()
        user_id = user_url[14:]  # strip the "/user/home?id=" prefix
        itemML["AuthorID"] = user_id  # creator's user id
        time = cntc.xpath("./div[@class='user f-cb']/span[@class='time s-fc4']/text()").extract_first()
        itemML["CreationDate"] = time[0:10]  # creation date, first 10 characters (YYYY-MM-DD)
        aList = cntc.xpath("./div[@id='content-operation']/a")
        Collection = aList[2].xpath("./@data-count").extract_first()
        itemML["Collection"] = Collection  # number of favorites
        Forwarding = aList[3].xpath("./@data-count").extract_first()
        itemML["Forwarding"] = Forwarding  # number of shares
        Comment = aList[5].xpath("./i/span[@id='cnt_comment_count']/text()").extract_first()
        itemML["Comment"] = Comment  # number of comments
        tags = ""
        tagList = cntc.xpath("./div[@class='tags f-cb']/a")
        for a in tagList:
            tags = tags + a.xpath("./i/text()").extract_first() + " "
        itemML["Labels"] = tags  # space-separated tags
        songtbList = response.xpath("//div[@class='n-songtb']/div")
        NumberOfSongs = songtbList[0].xpath(
            "./span[@class='sub s-fc3']/span[@id='playlist-track-count']/text()").extract_first()
        itemML["NumberOfSongs"] = NumberOfSongs  # number of tracks
        AmountOfPlay = songtbList[0].xpath(
            "./div[@class='more s-fc3']/strong[@id='play-count']/text()").extract_first()
        itemML["AmountOfPlay"] = AmountOfPlay  # play count
        yield itemML  # hand the finished item to pipelines.py
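
The WyymusicItem imported above needs matching field definitions in items.py, which this report does not reproduce. A minimal sketch, assuming a plain Scrapy item class; the field names are exactly those assigned in the spider:

# items.py (sketch; field names taken from the spider code above)
import scrapy


class WyymusicItem(scrapy.Item):
    SongsListID = scrapy.Field()    # playlist id
    SongListName = scrapy.Field()   # playlist name
    Url = scrapy.Field()            # detail-page URL
    AuthorID = scrapy.Field()       # creator's user id
    CreationDate = scrapy.Field()   # creation date
    Collection = scrapy.Field()     # favorites
    Forwarding = scrapy.Field()     # shares
    Comment = scrapy.Field()        # comments
    Labels = scrapy.Field()         # tags
    NumberOfSongs = scrapy.Field()  # track count
    AmountOfPlay = scrapy.Field()   # play count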

# pipelines.py
import csv
import logging
import os

logger = logging.getLogger(__name__)


class WyymusicPipeline:
    def __init__(self):
        store_file = os.path.dirname(__file__) + '/Musiclist.csv'
        # only write the header row when the file is new or empty, so that
        # re-running the spider in append mode does not repeat the header
        write_header = not os.path.exists(store_file) or os.path.getsize(store_file) == 0
        self.file = open(store_file, 'a+', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file, dialect='excel')
        if write_header:
            self.writer.writerow(['id', '歌单名', '播放量', '标签名', '收藏量', '转发量', '评论量', '歌曲数量', '创建日期', '作者'])

    # process_item is called for every item the engine forwards from the spider
    def process_item(self, itemML, spider):
        logger.warning('saving item ...')
        logger.warning(itemML)
        if itemML['SongListName']:
            data = [itemML['SongsListID'], itemML['SongListName'], itemML['AmountOfPlay'], itemML['Labels'],
                    itemML['Collection'], itemML['Forwarding'], itemML['Comment'], itemML['NumberOfSongs'],
                    itemML['CreationDate'], itemML['AuthorID']]
            self.writer.writerow(data)
        return itemML

    def close_spider(self, spider):
        self.file.close()
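
The pipeline only runs if it is registered in settings.py. A minimal sketch, assuming the project package is named Wyymusic (adjust the module path to the actual project name):

# settings.py (excerpt; "Wyymusic" is an assumed project name)
ITEM_PIPELINES = {
    "Wyymusic.pipelines.WyymusicPipeline": 300,  # priority 0-1000, lower runs first
}
# music.163.com's robots.txt likely disallows crawlers, so Scrapy's default
# ROBOTSTXT_OBEY = True would filter out these requests
ROBOTSTXT_OBEY = False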

# data cleaning script
import pandas as pd

df = pd.read_csv('Musiclist.csv', encoding='utf-8')
# sort the playlists by play count, descending
f = df.sort_values(by=["播放量"], ascending=False)
# cells in the comment-count column that contain only the placeholder text
# "评论" ("comment") carry no number; replace them with 0
f['评论量'] = f['评论量'].replace('评论', 0)
f.to_csv('list.csv', encoding='utf-8', index=False)
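
The assignment also asks for simple statistics on the data. One possible follow-up on the cleaned file, a sketch assuming the counts are stored as plain numbers or numeric strings (anything else is coerced to NaN):

import pandas as pd

df = pd.read_csv('list.csv', encoding='utf-8')
# make sure the count columns are numeric before aggregating
for col in ['播放量', '收藏量', '评论量']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# top 10 playlists by play count
print(df.nlargest(10, '播放量')[['歌单名', '播放量']])
# summary statistics for the numeric columns
print(df[['播放量', '收藏量', '评论量']].describe())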

  2. Result screenshots:

Screenshot 1: raw data

Screenshot 2: raw data

Screenshot 3: after cleaning

Screenshot 4: after cleaning

Screenshot 5
