数据集说明
在这个借助Goodreads网站收集的数据集中,可以获得有关书籍的信息,如作者、页数、评分和其他信息。
文件说明
books.csv
bookID,title,authors,average_rating,isbn,isbn13,language_code, num_pages,ratings_count,text_reviews_count,publication_date,publisher
1 ,Harry Potter and the Half-Blood Prince ( Harry Potter
2 ,Harry Potter and the Order of the Phoenix ( Harry Potter
4 ,Harry Potter and the Chamber of Secrets ( Harry Potter
...
业务需求
(1)统计最受关注的书籍Top 10
(2)统计书籍篇幅Top 10
(3)统计不同出版社出版的书籍数量
(4)统计不同语言的书籍数量
(5)统计最不受关注的高分书籍Top 10(评分4.5以上,评分人数1w以上,评论数200以下)
(6)统计不同年份出版的书籍数量
(7)统计不同作者的书籍的平均评分
(8)统计在最受关注的书籍Top 1000中,不同出版社出版的书籍数量
(9)统计在最受关注的书籍Top 1000中,不同语言的书籍数量
(10)统计不同作者的书籍的平均受关注程度
需求实现
数据预处理
import pandas as pd
import numpy as np
# Load the raw Goodreads export, skipping malformed rows, then remove
# missing values and exact duplicates.  df.info() is printed after each
# step so the shrinking row count can be inspected.
df = pd.read_csv('../data/books.csv', on_bad_lines='skip')
# The source header contains columns with stray spaces (e.g. ' num_pages'),
# so normalise every column name first.
df.columns = df.columns.str.strip()
print(df.head(3))
df.info()
df = df.dropna()
df.info()
df = df.drop_duplicates(keep='first')
df.info()
def convert_date(date_str):
    """Convert a 'MM/DD/YYYY' date string to 'YYYY-MM-DD'.

    Invalid or impossible dates (e.g. '11/31/2000') raise ValueError in
    pandas; those are reported and mapped to np.nan so the rows can be
    removed afterwards with dropna().

    Fix: the original f-string was garbled across three physical lines,
    which is a SyntaxError on Python < 3.12 — rejoined into one f-string.
    """
    try:
        converted_date = pd.to_datetime(date_str, format='%m/%d/%Y')
        return converted_date.strftime('%Y-%m-%d')
    except ValueError as e:
        print(f"{date_str} 转换失败: {e}")
        return np.nan
# Normalise every publication date to ISO format; rows whose date failed
# to parse became NaN and are dropped, then the cleaned frame is saved
# for the Spark stage.
df['publication_date'] = df['publication_date'].apply(convert_date)
df = df.dropna()
df.info()
print(df['language_code'].unique())
df.to_csv('../data/books_cleaned.csv', encoding='utf-8', index=False)
数据统计分析
from pyspark import SparkConf
from pyspark. sql import SparkSession
from pyspark. sql. functions import date_format, split, rank
from pyspark. sql. window import Window
# Start a Spark session, load the cleaned CSV (with header and schema
# inference), and expose it to Spark SQL as the temp view `books`.
spark = (
    SparkSession.builder
    .config(conf=SparkConf())
    .getOrCreate()
)
df_books = spark.read.csv(
    '/input_spark_book_recommendation_analysis/books_cleaned.csv',
    header=True,
    inferSchema=True,
)
df_books.show(10)
df_books.createOrReplaceTempView('books')
"""
(1) 统计最受关注的书籍 Top 10
"""
df_books_attention_top_10 = spark. sql(
"""
select bookID, title, text_reviews_count, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, publication_date, publisher
from books
order by text_reviews_count desc
"""
)
df_books_attention_top_10 = df_books_attention_top_10. repartition( 1 )
df_books_attention_top_10. show( n= 10 , truncate= False )
df_books_attention_top_10. write. csv( '/result/books_attention_top_10.csv' ,
mode= 'overwrite' )
"""
(2) 统计书籍篇幅 Top 10
"""
df_books_length_top_10 = spark. sql(
"""
select bookID, title, num_pages, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, ratings_count, text_reviews_count, publication_date, publisher
from books
order by num_pages desc
"""
)
df_books_length_top_10 = df_books_length_top_10. repartition( 1 )
df_books_length_top_10. show( n= 10 , truncate= False )
df_books_length_top_10. write. csv( '/result/books_length_top_10.csv' , mode= 'overwrite' )
"""
(3) 统计不同出版社出版的书籍数量
"""
df_publisher_books_num = spark. sql(
"""
select publisher, count(*) as books_num
from books
group by publisher
order by books_num desc
"""
)
df_publisher_books_num = df_publisher_books_num. repartition( 1 )
df_publisher_books_num. show( n= 10 , truncate= False )
df_publisher_books_num. write. csv( '/result/publisher_books_num.csv' ,
mode= 'overwrite' )
"""
(4) 统计不同语言的书籍数量
"""
df_language_books_num = spark. sql(
"""
select language_code, count(*) as books_num
from books
group by language_code
order by books_num desc
"""
)
df_language_books_num = df_language_books_num. repartition( 1 )
df_language_books_num. show( n= 10 , truncate= False )
df_language_books_num. write. csv( '/result/language_books_num.csv' , mode= 'overwrite' )
"""
(5) 统计最不受关注的高分书籍 Top 10 (评分 4.5 以上, 评分人数 1w 以上, 评论数 200 以下)
"""
df_books_rating_no_attention_top_10 = spark. sql(
"""
select bookID, title, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, text_reviews_count, publication_date, publisher
from books
where average_rating > 4.5 and ratings_count > 10000 and text_reviews_count < 200
order by text_reviews_count asc
"""
)
df_books_rating_no_attention_top_10 = df_books_rating_no_attention_top_10. repartition( 1 )
df_books_rating_no_attention_top_10. show( n= 10 , truncate= False )
df_books_rating_no_attention_top_10. write. csv(
'/result/books_rating_no_attention_top_10.csv' , mode= 'overwrite' )
df_books_with_year = df_books. withColumn( 'year' , date_format( df_books