# Google Play Store 的 App 数据分析
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the first seven columns of the CSV export into a DataFrame.
df = pd.read_csv('./googleplaystore.csv', usecols=(0, 1, 2, 3, 4, 5, 6))

# First look: sample rows, dimensions, non-null count per column, and the
# number of distinct app names.
df.head()
df.shape
df.count()
pd.unique(df['App']).size

# Category: frequency table, then the row(s) carrying the value '1.9'.
df['Category'].value_counts(dropna=False)
df[df['Category'] == '1.9']

# Rating: distribution including missing values (imputed further down).
df['Rating'].value_counts(dropna=False)

# Reviews is read as strings; locate the values that are not plain digits,
# because they would make the integer conversion below fail.
df['Reviews'].value_counts(dropna=False)
df['Reviews'].str.isnumeric().sum()
bad_reviews = ~df['Reviews'].str.isnumeric()
df[bad_reviews]

# Drop every row with a non-numeric review count instead of one hard-coded
# index label, so the cleaning still works if the CSV rows are reordered.
# NOTE(review): presumes the previously hard-coded label 10472 is exactly the
# row matched by this mask -- confirm against the data.
df.drop(index=df.index[bad_reviews], inplace=True)
df['Reviews'] = df['Reviews'].astype('i8')

# Fill missing ratings with the column mean. Assign the result back rather
# than calling fillna(inplace=True) on the df['Rating'] selection: the chained
# in-place form does not update df once pandas Copy-on-Write is active.
# Done after the rows above are removed so that a rejected row cannot
# contribute to the mean.
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())
# Frequency table of the raw size strings.
df['Size'].value_counts()

# Rewrite the unit letters as exponents so the strings can later be parsed as
# floats: 'M' becomes 'e+6' and 'k' becomes 'e+3'.
for unit, exponent in (('M', 'e+6'), ('k', 'e+3')):
    df['Size'] = df['Size'].str.replace(unit, exponent)
def is_convertable(v: object) -> bool:
    """Return True if ``float(v)`` succeeds, False otherwise.

    Args:
        v: Any value, typically a string such as ``'19e+6'``.

    Returns:
        True when ``v`` can be converted with ``float``; False when the
        conversion raises ``ValueError`` (unparseable text) or ``TypeError``
        (a non-string, non-numeric object such as ``None``).
    """
    try:
        float(v)
    except (ValueError, TypeError):
        # TypeError covers values like None, which float() rejects outright.
        return False
    return True
# List the size strings that still cannot be parsed as a float.
temp = df['Size'].apply(is_convertable)
df['Size'][~temp].value_counts()

# 'Varies with device' carries no number; mark it with a '0' placeholder.
# regex=False keeps the pattern a literal string on every pandas version.
df['Size'] = df['Size'].str.replace('Varies with device', '0', regex=False)

# Re-check what is left unparseable, then cast the column to float64.
temp = df['Size'].apply(is_convertable)
df['Size'][~temp].value_counts()
df['Size'] = df['Size'].astype('f8')

# Impute the placeholder rows with the mean of the *known* sizes. The mean is
# taken over the non-zero rows only, so the 0 placeholders do not drag down
# the value that replaces them. Writing through df.loc updates df itself;
# replace(inplace=True) on the df['Size'] selection is a chained in-place call
# that does not update df once pandas Copy-on-Write is active.
unknown_size = df['Size'] == 0
df.loc[unknown_size, 'Size'] = df.loc[~unknown_size, 'Size'].mean()
# Summary statistics before the Installs column is converted.
df.describe()
df['Installs'].value_counts()

# Strip the '+' and ',' characters so the strings can be cast to integers.
# '+' is a regex metacharacter, and the default of the `regex` argument
# differs between pandas versions, so request literal matching explicitly.
df['Installs'] = df['Installs'].str.replace('+', '', regex=False)
df['Installs'] = df['Installs'].str.replace(',', '', regex=False)
df['Installs'] = df['Installs'].astype('i8')

# Summary statistics again, now that Installs is an integer column.
df.describe()
# Type: frequency table including missing values, then the rows where it is
# missing.
df['Type'].value_counts(dropna=False)
missing_type = df['Type'].isnull()
df[missing_type]

# Remove every row without a Type instead of one hard-coded index label, so
# the step does not raise KeyError if that label is absent from the data.
# NOTE(review): presumes the previously hard-coded label 9148 is exactly the
# row matched by this mask -- confirm against the data.
df.drop(index=df.index[missing_type], inplace=True)

# Keep only the first row for each app name.
df.drop_duplicates('App', inplace=True)
df.describe()
# Number of distinct categories, and the number of rows per category.
df.Category.unique().size
df.groupby('Category').count().sort_values('App', ascending=False)

# Per-category means, ranked by installs, reviews and rating.
# numeric_only=True restricts the aggregation to the numeric columns; without
# it pandas >= 2.0 tries to average the string columns and raises TypeError.
# The grouped mean is computed once and reused; sort_values returns a new
# frame each time, so the three rankings do not affect one another.
category_mean = df.groupby('Category').mean(numeric_only=True)
category_mean.sort_values('Installs', ascending=False)
category_mean.sort_values('Reviews', ascending=False)
category_mean.sort_values('Rating', ascending=False)

# Row counts and numeric totals per Type.
df.groupby('Type').count()
df.groupby('Type').sum(numeric_only=True).sort_values('Installs', ascending=False)

# Means per (Type, Category) pair, ranked by reviews, followed by the ratio of
# mean reviews to mean installs for each pair.
g = df.groupby(['Type', 'Category']).mean(numeric_only=True)
g.sort_values('Reviews', ascending=False)
(g['Reviews'] / g['Installs']).sort_values(ascending=False)
# 相关系数(0.3 以上可以认为是弱相关)
# Pairwise correlation between the numeric columns. The string columns are
# excluded explicitly: pandas >= 2.0 no longer drops them silently and
# df.corr() would raise when it fails to convert them to float.
df.select_dtypes(include='number').corr()