[Clojure] Data Collection and Data Analysis on the music of www.xiami.com - Part 5

Source code

https://github.com/bluesilence/Lisp/tree/master/clojure/projects/xiami-crawler


Data Cooking

For simplicity, I chose SQL to analyse the data. The raw data in .log has to be structured before that.

.log(csv) -> Excel -> SQL Server

3 tables are built: Songs, Albums, Artists.

2 table is created: Genre, GenrePreference. The GenrePreference table ismanually modified to record my flavor on each genre. However, this table is obsolete since the latest data combines multiple genres of an album into 1 string...

INSERT INTO [xiami].[dbo].[Genre] 
(GenreName) 
SELECT DISTINCT 
AlbumGenre 
FROM [xiami].[dbo].[Albums$] 
 
 
INSERT INTO [xiami].[dbo].[GenrePreference] 
(UserId, GenreId, Preference) 
SELECT  
1 AS UserId, 
Id, 
0 AS Preference 
FROM [xiami].[dbo].[Genre]

2 functions are built: 

  • GetAlbumPreferenceByGenre

  • -- ============================================= 
    -- Author:	<Author,,Name> 
    -- Create date: <Create Date, ,> 
    -- Description: Obsolete since multiple genres are considered an independent genre 
    -- ============================================= 
    CREATE FUNCTION [dbo].[GetAlbumPreferenceByGenre] 
    ( 
    -- Add the parameters for the function here 
    @AlbumId decimal(18,4) 
    ) 
    RETURNS int 
    AS 
    BEGIN 
     DECLARE @genres [nvarchar](255); 
                DECLARE @preference int = 0; 
     
     SELECT @genres = AlbumGenre 
     FROM [dbo].[Albums$] 
     WHERE AlbumId = @AlbumId; 
     
     SELECT @preference = MAX(Preference) 
     FROM 
     ( 
    SELECT Item 
    FROM [dbo].[split] (@genres, ',') 
     ) a 
     JOIN 
     ( 
    SELECT GenreName, Preference 
    FROM [dbo].[GenrePreference] 
      ) b 
      ON a.Item = b.GenreName; 
     
      RETURN @preference; 
    END


  • GetAlbumPreferenceByUser
-- ============================================= 
-- Author:	<Author,,Name> 
-- Create date: <Create Date, ,> 
-- Description:	Get Album Preference By UserId 
-- ============================================= 
CREATE FUNCTION [dbo].[GetAlbumPreferenceByUser] 
( 
-- Add the parameters for the function here 
@AlbumId decimal(18,4), 
@UserId int 
) 
RETURNS int 
AS 
BEGIN 
 
 DECLARE @genre [nvarchar](512); 
 
 SELECT @genre = AlbumGenre 
 FROM [dbo].[Albums$] 
 WHERE AlbumId = @AlbumId; 
 
 DECLARE @genreId int; 
 
 SELECT @genreId = Id 
 FROM [dbo].[Genre] 
 WHERE GenreName = @genre; 
 
     DECLARE @preference int = 0; 
 
 SELECT @preference = Preference 
 FROM [dbo].[UserGenrePreference] 
 WHERE UserId = @UserId 
 AND GenreId = @genreId 
 
 RETURN @preference; 
END


Then we can base on the tabular tables to do some primary queries.


Data Analysis

  • Q: Which artist has the max genres?

SELECT ArtistName, COUNT(DISTINCT AlbumGenre) AS Genres 
FROM 
( 
SELECT ArtistId, ArtistName 
FROM dbo.Artists$ 
) artists 
JOIN 
( 
SELECT AlbumId, AlbumName, AlbumGenre, ArtistId 
FROM dbo.Albums$ 
WHERE AlbumGenre != 'N/A' 
) albums 
ON artists.ArtistId = albums.ArtistId 
GROUP BY ArtistName 
ORDER BY Genres DESC, ArtistName

  • Q: Which artist has the most number of albums?

SELECT ArtistName, COUNT(AlbumId) AS Albums 
FROM 
( 
SELECT ArtistId, ArtistName 
FROM dbo.Artists$ 
GROUP BY ArtistId, ArtistName 
) artists 
JOIN 
( 
SELECT AlbumId, AlbumName, ArtistId 
FROM dbo.Albums$ 
WHERE AlbumGenre != 'N/A' 
GROUP BY AlbumId, AlbumName, ArtistId 
) albums 
ON artists.ArtistId = albums.ArtistId 
GROUP BY ArtistName 
ORDER BY Albums DESC, ArtistName 

  • Q: Hotness of artists?

SELECT a.ArtistId, ArtistName, SUM(Hotness) AS Hotness 
FROM 
( 
SELECT ArtistId, ArtistName 
FROM dbo.Artists$ 
) a 
JOIN 
( 
SELECT ArtistId, albums.AlbumId AS AlbumId, AlbumName, Hotness 
FROM 
( 
SELECT AlbumId, AlbumName, ArtistId 
FROM dbo.Albums$ 
GROUP BY AlbumId, AlbumName, ArtistId 
) albums 
JOIN 
( 
SELECT SUM(SongHot) AS Hotness, AlbumId 
FROM dbo.Songs$ 
GROUP BY AlbumId 
) songs 
ON albums.AlbumId = songs.AlbumId 
) b 
ON a.ArtistId = b.ArtistId 
GROUP BY a.ArtistId, ArtistName 
ORDER BY Hotness DESC, ArtistName

  • Q: Which artist has the max hotness per album?

SELECT 
artistHotness.ArtistId, 
artistHotness.ArtistName, 
Hotness * 1.0 / Albums AS HotnessPerAlbum 
FROM 
( 
SELECT artists.ArtistId, COUNT(AlbumId) AS Albums 
FROM 
( 
SELECT ArtistId, ArtistName 
FROM dbo.Artists$ 
GROUP BY ArtistId, ArtistName 
) artists 
JOIN 
( 
SELECT AlbumId, AlbumName, ArtistId 
FROM dbo.Albums$ 
WHERE AlbumGenre != 'N/A' 
GROUP BY AlbumId, AlbumName, ArtistId 
) albums 
ON artists.ArtistId = albums.ArtistId 
GROUP BY artists.ArtistId 
)  
albumCount 
JOIN 
( 
SELECT a.ArtistId, ArtistName, SUM(Hotness) AS Hotness 
FROM 
( 
SELECT ArtistId, ArtistName 
FROM dbo.Artists$ 
) a 
JOIN 
( 
SELECT ArtistId, albums.AlbumId AS AlbumId, AlbumName, Hotness 
FROM 
( 
SELECT AlbumId, AlbumName, ArtistId 
FROM dbo.Albums$ 
GROUP BY AlbumId, AlbumName, ArtistId 
) albums 
JOIN 
( 
SELECT SUM(SongHot) AS Hotness, AlbumId 
FROM dbo.Songs$ 
GROUP BY AlbumId 
) songs 
ON albums.AlbumId = songs.AlbumId 
) b 
ON a.ArtistId = b.ArtistId 
GROUP BY a.ArtistId, ArtistName 
) 
artistHotness 
ON artistHotness.ArtistId = albumCount.ArtistId 
ORDER BY HotnessPerAlbum DESC 

Note: 

Because there are records at different DataTime, we need to use GROUP BY to de-dup before JOIN.

  • Q: Which album makes the biggest comment/collected ratio?

SELECT 
  *, 
  AlbumComments * 1.0 / AlbumCollected AS Ratio 
FROM [xiami].[dbo].[Albums$] 
WHERE AlbumCollected != 0 
AND AlbumValue > 9 
AND AlbumCollected > 300 
ORDER BY Ratio DESC

  • Q: Which album has the highest hot/collected conversion ratio?

SELECT 
  a.AlbumId AS AlbumId, 
  a.AlbumName AS AlbumName, 
  a.AlbumValue AS AlbumValue, 
  a.AlbumComments AS AlbumComments, 
  a.AlbumCollected AS AlbumCollected, 
  SUM(b.SongHot) AS AlbumHot, 
  a.AlbumCollected * 1.0 / SUM(b.SongHot) AS ConvertRatio 
FROM 
( 
SELECT 
  * 
FROM [xiami].[dbo].[Albums$] 
) a 
JOIN 
( 
SELECT * 
FROM [xiami].[dbo].[Songs$] 
) b 
ON a.AlbumId = b.AlbumId 
WHERE SongHot > 0 
AND AlbumValue > 0 
GROUP BY a.AlbumId, AlbumName, AlbumValue, AlbumComments, AlbumCollected 
ORDER BY ConvertRatio DESC
  • Q: Which genre of music is the most popular?

By SongHot:

SELECT 
  a.AlbumGenre AS AlbumGenre, 
  SUM(b.SongHot) AS AlbumHot 
FROM 
( 
SELECT 
  * 
FROM [xiami].[dbo].[Albums$] 
) a 
JOIN 
( 
SELECT * 
FROM [xiami].[dbo].[Songs$] 
) b 
ON a.AlbumId = b.AlbumId 
WHERE AlbumValue > 0 
AND AlbumGenre != 'N/A' 
GROUP BY AlbumGenre 
ORDER BY AlbumHot DESC

By  AlbumCollected :

SELECT 
AlbumGenre, 
SUM(AlbumCollected) AS Collected 
FROM [xiami].[dbo].[Albums$] 
WHERE AlbumValue > 0 
AND AlbumGenre != 'N/A' 
GROUP BY AlbumGenre 
ORDER BY Collected DESC

By  AlbumComments :

SELECT 
  AlbumGenre, 
  SUM(AlbumComments) AS Comments 
FROM [xiami].[dbo].[Albums$] 
WHERE AlbumValue > 0 
AND AlbumGenre != 'N/A' 
GROUP BY AlbumGenre 
ORDER BY Comments DESC
  • Q: Get the top 10 representative songs for each genre.

SELECT TOP 50 
SongName, ArtistName 
FROM 
( 
SELECT SongId, SongName, SongHot, ArtistId, AlbumGenre, AlbumValue, AlbumCollected, AlbumComments 
FROM 
(SELECT 
  SongId, 
  SongName, 
  SongHot, 
  songs.AlbumId AS AlbumId, 
  albums.ArtistId AS ArtistId, 
  AlbumGenre, 
  AlbumValue, 
  AlbumCollected, 
  AlbumComments, 
  Rank() OVER (PARTITION BY AlbumGenre ORDER BY SongHot DESC, AlbumCollected DESC, AlbumValue DESC, AlbumComments DESC) AS r 
FROM 
( 
  ( 
  SELECT 
   SongId, 
   SongName, 
   SongHot, 
   AlbumId 
  FROM [xiami].[dbo].[Songs$] 
  ) songs 
  JOIN 
  ( 
SELECT 
AlbumId, 
AlbumGenre, 
AlbumValue, 
AlbumCollected, 
AlbumComments, 
ArtistId 
FROM [xiami].[dbo].[Albums$] 
WHERE AlbumValue > 0 
AND AlbumGenre != 'N/A' 
  ) albums 
  ON songs.AlbumId = albums.AlbumId 
 ) 
) results 
WHERE r <= 10 
) tmp 
JOIN 
( 
SELECT * 
FROM [xiami].[dbo].[Artists$] 
) artists 
ON tmp.ArtistId = artists.ArtistId

  • Q: Which song is my taste? 

A: Save query results as tsv, then it can be imported into xiami collection. 

Xiami collection only allows a maximum of 50 songs.

SELECT SongName, ArtistName, Preference 
FROM 
( 
SELECT 
SongId, 
SongName, 
SongHot, 
a.AlbumId AS AlbumId, 
AlbumName, 
AlbumGenre, 
c.ArtistName AS ArtistName, 
AlbumValue, 
AlbumComments, 
AlbumCollected, 
[dbo].[GetAlbumPreferenceByUser] (a.AlbumId, 1) AS Preference 
FROM 
( 
SELECT * 
FROM [xiami].[dbo].[Songs$] 
) a 
JOIN 
( 
  SELECT * 
  FROM [xiami].[dbo].[Albums$] 
  WHERE AlbumValue > 9.3 
  AND AlbumCollected > 300 
  AND AlbumComments > 100 
  AND ((AlbumComments * 1.0) / AlbumCollected) > 0.05 
) b 
ON a.AlbumId = b.AlbumId 
JOIN 
( 
  SELECT * 
  FROM [xiami].[dbo].[Artists$] 
) c 
ON b.ArtistId = c.ArtistId 
) final 
WHERE Preference > 0 
ORDER BY Preference DESC 

Note:

The constants of the queries above are arbitrary.

Just out of my preference : P

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值