# Load the TED talks dataset and take a first look at its structure.
df = pd.read_csv(r'ted_main.csv')
df.columns  # column labels
df.dtypes  # dtype of every column
df.isnull().sum()  # number of missing values per column
comments 0 description 0 duration 0 event 0 film_date 0 languages 0 main_speaker 0 name 0 num_speaker 0 published_date 0 ratings 0 related_talks 0 speaker_occupation 6 tags 0 title 0 url 0 views 0 dtype: int64
Great! There are very few missing values. (Please remember this approach to checking for missing data.)
print("First row with missing value")  # find the missing value
# Select the rows whose speaker_occupation is missing with a boolean mask
# rather than walking the frame row by row with iterrows().
missing_occupation = df[df['speaker_occupation'].isnull()]
# Guard against a dataset with no missing occupations so .iloc[0] cannot fail.
if not missing_occupation.empty:
    print(missing_occupation.iloc[0])
First row with missing value comments 145 description After a crisis, how can we tell if water is sa... duration 217 event TEDGlobal 2011 film_date 1310601600 languages 38 main_speaker Sonaar Luthra name Sonaar Luthra: Meet the Water Canary num_speaker 1 published_date 1326731605 ratings [{'id': 10, 'name': 'Inspiring', 'count': 73},... related_talks [{'id': 523, 'hero': 'https://pe.tedcdn.com/im... speaker_occupation NaN tags ['TED Fellows', 'design', 'global development'... title Meet the Water Canary url https://www.ted.com/talks/sonaar_luthra_meet_t... views 353749 Name: 1113, dtype: object
# The first missing value occurs in the row with index label 1113 and is shown as NaN (the standard missing-value marker)
# Count the talks whose `languages` value is 0, then preview a few of them.
is_zero_lang = df['languages'] == 0
print(df.loc[is_zero_lang, 'languages'].count())
df[is_zero_lang].head()

# Attach a 1-based sequential identifier to every talk.
df['Talk_ID'] = range(1, df.shape[0] + 1)
df.head()
df.columns

# Replace every language count of 0 with 1.
df['languages'] = df['languages'].replace({0: 1})

# Repeat the zero check to confirm the replacement took effect.
still_zero_lang = df['languages'] == 0
print(df.loc[still_zero_lang, 'languages'].count())
df[still_zero_lang].head()
# Format unix timestamps as 'dd-mm-YYYY' date strings.
import datetime


def _unix_to_dmy(timestamp):
    """Return the unix *timestamp* (seconds) as a 'dd-mm-YYYY' string in UTC.

    The conversion is pinned to UTC: without an explicit ``tz``,
    ``fromtimestamp`` uses the machine's local timezone, so the same
    timestamp could land on a different calendar day on another machine.
    """
    moment = datetime.datetime.fromtimestamp(int(timestamp), tz=datetime.timezone.utc)
    return moment.strftime('%d-%m-%Y')


# Both date columns hold unix timestamps and get the same treatment.
for _date_col in ('film_date', 'published_date'):
    df[_date_col] = df[_date_col].apply(_unix_to_dmy)
# The ten most-viewed talks: keep the descriptive columns, order by views
# (highest first) and take the top ten rows.
talk_summary_cols = ['title', 'main_speaker', 'views', 'film_date']
pop_talks = (
    df[talk_summary_cols]
    .sort_values(by='views', ascending=False)
    .head(10)
)
pop_talks
# Relationship between views and comments: joint plot first, then the
# correlation matrix of the two columns.
sns.jointplot(data=df, x='views', y='comments')
views_comments = df[['views', 'comments']]
views_comments.corr()
# Count talks by year.
# film_date strings are 'dd-mm-YYYY', so the year is the third '-'-separated field.
df['year'] = df['film_date'].str.split('-').str[2]
year_df = df['year'].value_counts().reset_index()
# Name the columns explicitly: the labels produced by reset_index() differ
# between pandas versions.
year_df.columns = ['year', 'talks']

# Make plot.
# The original cell repeated the computation above here and never plotted;
# draw the talks-per-year bar chart in the same style as the file's other plots.
# NOTE(review): the chart type is assumed; confirm it matches the intended figure.
plt.figure(figsize=(15, 5))
# Sort a copy by year for the x axis; year_df itself keeps its count ordering.
sns.barplot(x='year', y='talks', data=year_df.sort_values('year'))
plt.show()
# Number of talks per main speaker: count the non-null `comments` entries in
# each speaker group and list the most frequent speakers first.
per_speaker_counts = df.groupby('main_speaker').count().reset_index()
speaker_df = (
    per_speaker_counts[['main_speaker', 'comments']]
    .rename(columns={'comments': 'appearances'})
    .sort_values(by='appearances', ascending=False)
)
speaker_df.head(10)
# Bar chart of the ten most common speaker occupations.
per_occupation_counts = df.groupby('speaker_occupation').count().reset_index()
occupation_df = (
    per_occupation_counts[['speaker_occupation', 'comments']]
    .rename(columns={'speaker_occupation': 'occupation', 'comments': 'appearances'})
    .sort_values(by='appearances', ascending=False)
)
top_occupations = occupation_df.head(10)
plt.figure(figsize=(15, 5))
sns.barplot(data=top_occupations, x='occupation', y='appearances')
plt.show()
# Separate the multi-valued `tags` column into one row per (talk, tag) pair
# and plot the ten most common themes.
import ast

# Each `tags` cell is parsed from its string form with ast.literal_eval,
# which evaluates literals only. Values that are no longer strings are left
# untouched so re-running this cell does not fail.
df['tags'] = df['tags'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# explode() emits one entry per list element and repeats the original index
# label, which is what the join below needs.
s = df['tags'].explode()
s.name = 'theme'
theme_df = df.drop('tags', axis=1).join(s)

pop_themes = theme_df['theme'].value_counts().reset_index()
# Name the columns explicitly: the plot below needs a 'talks' column, and the
# labels produced by reset_index() differ between pandas versions.
pop_themes.columns = ['theme', 'talks']

plt.figure(figsize=(15, 5))
sns.barplot(x='theme', y='talks', data=pop_themes.head(10))
plt.show()