I. Experiment Topic
Final project: crawling NetEase Cloud Music playlists
Crawl any web page you are interested in, collect at least 500 records, and perform data cleaning, data statistics, and data processing on them.
II. Purpose and Requirements
Final project:
1. Purpose: master the basic syntax of web crawlers.
2. Requirements: master page navigation, data storage, and the use of Scrapy, XPath, etc. (see the short XPath example below).
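Since XPath is the core requirement, here is a minimal standalone sketch of the selector pattern used throughout the spider; the HTML fragment is made up for illustration only:
from scrapy import Selector

# toy fragment mimicking the playlist listing markup (illustrative only)
html = "<ul id='m-pl-container'><li><div><a class='msk' href='/playlist?id=1'></a></div></li></ul>"
sel = Selector(text=html)
# same ./div/a[@class='msk']/@href pattern the spider below relies on
print(sel.xpath("//ul[@id='m-pl-container']/li/div/a[@class='msk']/@href").extract_first())
# prints: /playlist?id=1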
III. Experiment Content
- Spider code (MusicList spider):
import scrapy
import logging
# deepcopy is crucial: it keeps the playlist items saved to the pipeline from being
# scrambled or duplicated when several pages are crawled concurrently
from copy import deepcopy
from ..items import WyymusicItem

logger = logging.getLogger(__name__)
class MusicListSpider(scrapy.Spider):
    name = "MusicList"  # the name attribute is mandatory; it is also referenced in pipelines.py
    allowed_domains = ["music.163.com"]  # restrict the crawl to this domain
    start_urls = ["https://music.163.com/discover/playlist"]  # first page of the playlist listing
    offset = 0  # page counter recording which listing page is currently being crawled
    def parse(self, response):
        # use XPath to pull the needed information out of the HTML page:
        # collect all playlists on the current listing page into liList
        liList = response.xpath("//div[@id='m-disc-pl-c']/div/ul[@id='m-pl-container']/li")
        # visit each playlist in liList to fetch its detail page
        for li in liList:
            itemML = WyymusicItem()
            a_href = li.xpath("./div/a[@class='msk']/@href").extract_first()
            itemML["SongsListID"] = a_href[13:]  # strip the "/playlist?id=" prefix
            # build the absolute URL of the playlist detail page
            Url = "https://music.163.com" + a_href
            itemML["Url"] = Url
            # let SongsListPageParse extract the details of this playlist
            yield scrapy.Request(Url, callback=self.SongsListPageParse, meta={"itemML": deepcopy(itemML)})
        # move on to the next listing page (38 pages of 35 playlists each, well over 500 records)
        if self.offset < 37:
            self.offset += 1
            # URL of the next listing page
            nextpage_a_url = ("https://music.163.com/discover/playlist/"
                              "?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=" + str(self.offset * 35))
            print(self.offset, nextpage_a_url)
            print("crawling the next page")
            yield scrapy.Request(nextpage_a_url, callback=self.parse)
    # parse the detail page of a single playlist
    def SongsListPageParse(self, response):
        cntc = response.xpath("//div[@class='cntc']")
        itemML = response.meta["itemML"]
        SongListName = cntc.xpath("./div[@class='hd f-cb']/div/h2//text()").extract_first()
        itemML["SongListName"] = SongListName  # playlist name
        user_url = cntc.xpath("./div[@class='user f-cb']/span[@class='name']/a/@href").extract_first()
        user_id = user_url[14:]  # strip the "/user/home?id=" prefix
        itemML["AuthorID"] = user_id  # creator's user id
        time = cntc.xpath("./div[@class='user f-cb']/span[@class='time s-fc4']/text()").extract_first()
        itemML["CreationDate"] = time[0:10]  # creation date of the playlist
        aList = cntc.xpath("./div[@id='content-operation']/a")
        Collection = aList[2].xpath("./@data-count").extract_first()
        itemML["Collection"] = Collection  # number of favourites
        Forwarding = aList[3].xpath("./@data-count").extract_first()
        itemML["Forwarding"] = Forwarding  # number of shares
        Comment = aList[5].xpath("./i/span[@id='cnt_comment_count']/text()").extract_first()
        itemML["Comment"] = Comment  # number of comments
        # concatenate all tag names into one space-separated string
        tags = ""
        tagList = cntc.xpath("./div[@class='tags f-cb']/a")
        for a in tagList:
            tags = tags + a.xpath("./i/text()").extract_first() + " "
        itemML["Labels"] = tags
        songtbList = response.xpath("//div[@class='n-songtb']/div")
        NumberOfSongs = songtbList[0].xpath(
            "./span[@class='sub s-fc3']/span[@id='playlist-track-count']/text()").extract_first()
        itemML["NumberOfSongs"] = NumberOfSongs  # number of tracks
        AmountOfPlay = songtbList[0].xpath(
            "./div[@class='more s-fc3']/strong[@id='play-count']/text()").extract_first()
        itemML["AmountOfPlay"] = AmountOfPlay  # play count
        yield itemML  # hand the crawled item over to pipelines.py
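The spider imports WyymusicItem from items.py, which is not listed above. A minimal sketch of that item class, assuming it simply declares one Field per attribute used by the spider and the pipeline:
# items.py (sketch)
import scrapy

class WyymusicItem(scrapy.Item):
    SongsListID = scrapy.Field()
    SongListName = scrapy.Field()
    Url = scrapy.Field()
    AuthorID = scrapy.Field()
    CreationDate = scrapy.Field()
    Collection = scrapy.Field()
    Forwarding = scrapy.Field()
    Comment = scrapy.Field()
    Labels = scrapy.Field()
    NumberOfSongs = scrapy.Field()
    AmountOfPlay = scrapy.Field()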
- Pipeline code (pipelines.py):
import csv
import os
import logging

logger = logging.getLogger(__name__)
class WyymusicPipeline:
    def __init__(self):
        store_file = os.path.dirname(__file__) + '/Musiclist.csv'
        self.file = open(store_file, 'a+', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file, dialect='excel')
        self.writer.writerow(['id号', '歌单名', '播放量', '标签名', '收藏量', '转发量', '评论量', '歌曲数量', '创建日期', '作者'])

    # the engine calls process_item every time it forwards an item; itemML is that forwarded item
    def process_item(self, itemML, spider):
        logger.warning('storing item ...')
        logger.warning(itemML)
        if itemML['SongListName']:
            data = [itemML['SongsListID'], itemML['SongListName'], itemML['AmountOfPlay'], itemML['Labels'],
                    itemML['Collection'], itemML['Forwarding'], itemML['Comment'],
                    itemML['NumberOfSongs'], itemML['CreationDate'], itemML['AuthorID']]
            self.writer.writerow(data)
        return itemML

    def close_spider(self, spider):
        self.file.close()
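For Scrapy to actually call this pipeline, it has to be registered in the project's settings.py. A minimal sketch, assuming the project package is named wyymusic (the actual package name may differ):
# settings.py (sketch)
ITEM_PIPELINES = {
    "wyymusic.pipelines.WyymusicPipeline": 300,  # priority 300; lower values run earlier
}
ROBOTSTXT_OBEY = False  # assumption: the site's robots.txt would otherwise block these pages
The crawl is then started from the project directory with: scrapy crawl MusicList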
- Data-cleaning code (pandas):
import pandas as pd

# pd.set_option('display.max_columns', None)
df = pd.read_csv('Musiclist.csv', encoding='utf-8')
# sort playlists by play count, descending
f = df.sort_values(by=["播放量"], ascending=False)
# rows whose 评论量 column holds the placeholder text '评论' get the value 0
f['评论量'] = f['评论量'].replace('评论', 0)
f.to_csv('list.csv', encoding='utf-8', index=False)
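Since the assignment also asks for data statistics, a short sketch of a follow-up analysis on the cleaned file; the column names match those written by the pipeline, and the tag split assumes the space-separated Labels string built in the spider:
import pandas as pd

df = pd.read_csv('list.csv', encoding='utf-8')
# summary statistics of the play counts
print(df['播放量'].describe())
# ten most frequent playlist tags (tags were joined with spaces in the spider)
print(df['标签名'].str.split().explode().value_counts().head(10))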
- Screenshots of the results:
Figure 1: raw data
Figure 2: raw data
Figure 3: data after cleaning
Figure 4: data after cleaning
Figure 5