收集
import numpy as np
import pandas as pd
import json
import matplotlib. pyplot as plt
import seaborn as sb
import requests
import os
folder_name = 'image_predictions'
if not os. path. exists( folder_name) :
os. makedirs( folder_name)
image_predictions_urls = [ 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv' ]
for url in image_predictions_urls:
response = requests. get( url)
with open ( os. path. join( folder_name, url. split( '/' ) [ - 1 ] ) , mode = 'wb' ) as file :
file . write( response. content)
twitter_archive = pd. read_csv( 'twitter-archive-enhanced.csv' )
image_predictions = pd. read_csv( 'image-predictions.tsv' , sep = '\t' )
with open ( 'tweet_json.txt' , 'r' ) as f:
tweets_list = [ ]
for line in f:
tweets_list. append( json. loads( line) )
tweet_json = pd. DataFrame( tweets_list)
评估
目测评估
pd. options. display. max_columns= 1000
pd. set_option( 'max_colwidth' , 200 )
twitter_archive. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id 2356 non-null int64
in_reply_to_status_id 78 non-null float64
in_reply_to_user_id 78 non-null float64
timestamp 2356 non-null object
source 2356 non-null object
text 2356 non-null object
retweeted_status_id 181 non-null float64
retweeted_status_user_id 181 non-null float64
retweeted_status_timestamp 181 non-null object
expanded_urls 2297 non-null object
rating_numerator 2356 non-null int64
rating_denominator 2356 non-null int64
name 2356 non-null object
doggo 2356 non-null object
floofer 2356 non-null object
pupper 2356 non-null object
puppo 2356 non-null object
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
twitter_archive. sample( 5 )
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo 605 798576900688019456 NaN NaN 2016-11-15 17:22:24 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> RT @dog_rates: Not familiar with this breed. No tail (weird). Only 2 legs. Doesn't bark. Surprisingly quick. Shits eggs. 1/10 https://t.co/… 6.661041e+17 4.196984e+09 2015-11-16 04:02:55 +0000 https://twitter.com/dog_rates/status/666104133288665088/photo/1 1 10 None None None None None 832 768596291618299904 NaN NaN 2016-08-24 23:50:10 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> Say hello to Oakley and Charlie. They're convinced that they each have their own stick. Nobody tell them. Both 12/10 https://t.co/J2AJdyxglH NaN NaN NaN https://twitter.com/dog_rates/status/768596291618299904/photo/1 12 10 Oakley None None None None 134 866686824827068416 NaN NaN 2017-05-22 16:06:55 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Lili. She can't believe you betrayed her with bath time. Never looking you in the eye again. 12/10 would puppologize profusely https://t.co/9b9J46E86Z NaN NaN NaN https://twitter.com/dog_rates/status/866686824827068416/photo/1,https://twitter.com/dog_rates/status/866686824827068416/photo/1 12 10 Lili None None None None 2004 672466075045466113 NaN NaN 2015-12-03 17:23:00 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Franq and Pablo. They're working hard getting ready for Christmas. 12/10 for both. Amazing pups https://t.co/8lKFBOQ2J5 NaN NaN NaN https://twitter.com/dog_rates/status/672466075045466113/photo/1 12 10 Franq None None None None 1989 672828477930868736 NaN NaN 2015-12-04 17:23:04 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Jerry. He's a Timbuk Slytherin. Eats his pizza from the side first. Crushed that cup with his bare paws 9/10 https://t.co/fvxHL6cRRs NaN NaN NaN https://twitter.com/dog_rates/status/672828477930868736/photo/1 9 10 Jerry None None None None
image_predictions. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id 2075 non-null int64
jpg_url 2075 non-null object
img_num 2075 non-null int64
p1 2075 non-null object
p1_conf 2075 non-null float64
p1_dog 2075 non-null bool
p2 2075 non-null object
p2_conf 2075 non-null float64
p2_dog 2075 non-null bool
p3 2075 non-null object
p3_conf 2075 non-null float64
p3_dog 2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
image_predictions. head( )
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog 0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True 1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True 2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True 3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True 4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
tweet_json. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2352 entries, 0 to 2351
Data columns (total 31 columns):
contributors 0 non-null object
coordinates 0 non-null object
created_at 2352 non-null object
display_text_range 2352 non-null object
entities 2352 non-null object
extended_entities 2073 non-null object
favorite_count 2352 non-null int64
favorited 2352 non-null bool
full_text 2352 non-null object
geo 0 non-null object
id 2352 non-null int64
id_str 2352 non-null object
in_reply_to_screen_name 78 non-null object
in_reply_to_status_id 78 non-null float64
in_reply_to_status_id_str 78 non-null object
in_reply_to_user_id 78 non-null float64
in_reply_to_user_id_str 78 non-null object
is_quote_status 2352 non-null bool
lang 2352 non-null object
place 1 non-null object
possibly_sensitive 2211 non-null object
possibly_sensitive_appealable 2211 non-null object
quoted_status 28 non-null object
quoted_status_id 29 non-null float64
quoted_status_id_str 29 non-null object
retweet_count 2352 non-null int64
retweeted 2352 non-null bool
retweeted_status 177 non-null object
source 2352 non-null object
truncated 2352 non-null bool
user 2352 non-null object
dtypes: bool(4), float64(3), int64(3), object(21)
memory usage: 505.4+ KB
tweet_json. sample( )
contributors coordinates created_at display_text_range entities extended_entities favorite_count favorited full_text geo id id_str in_reply_to_screen_name in_reply_to_status_id in_reply_to_status_id_str in_reply_to_user_id in_reply_to_user_id_str is_quote_status lang place possibly_sensitive possibly_sensitive_appealable quoted_status quoted_status_id quoted_status_id_str retweet_count retweeted retweeted_status source truncated user 259 None None Fri Mar 17 15:51:22 +0000 2017 [0, 143] {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/fvGkIuAlFK', 'expanded_url': 'https://www.gofundme.com/get-indie-home/', 'display_url': 'gofundme.com/get-indie-... {'media': [{'id': 842765306540052480, 'id_str': '842765306540052480', 'indices': [144, 167], 'media_url': 'http://pbs.twimg.com/media/C7IalMVX0AATKRD.jpg', 'media_url_https': 'https://pbs.twimg.co... 7292 False Meet Indie. She's not a fan of baths but she's definitely a fan of hide & seek. 12/10 click the link to help Indie\n\nhttps://t.co/fvGkIuAlFK https://t.co/kiCFtmJd7l None 842765311967449089 842765311967449089 None NaN None NaN None False en None False False NaN NaN NaN 1435 False NaN <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> False {'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ...
编程评估
twitter_archive. name. value_counts( ) . head( 5 )
None 745
a 55
Charlie 12
Cooper 11
Oliver 11
Name: name, dtype: int64
twitter_archive. rating_denominator. value_counts( ) . head( 10 )
10 2333
11 3
50 3
80 2
20 2
2 1
16 1
40 1
70 1
15 1
Name: rating_denominator, dtype: int64
twitter_archive[ ( twitter_archive. doggo == 'doggo' ) & ( twitter_archive. floofer == 'floofer' ) ]
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo 200 854010172552949760 NaN NaN 2017-04-17 16:34:26 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https://t.co/TXdT3tmuYk NaN NaN NaN https://twitter.com/dog_rates/status/854010172552949760/photo/1,https://twitter.com/dog_rates/status/854010172552949760/photo/1 11 10 None doggo floofer None None
sum ( image_predictions. jpg_url. duplicated( ) )
66
image_predictions. p1. value_counts( ) . head( 10 )
golden_retriever 150
Labrador_retriever 100
Pembroke 89
Chihuahua 83
pug 57
chow 44
Samoyed 43
toy_poodle 39
Pomeranian 38
cocker_spaniel 30
Name: p1, dtype: int64
image_predictions. p2. value_counts( ) . head( 10 )
Labrador_retriever 104
golden_retriever 92
Cardigan 73
Chihuahua 44
Pomeranian 42
French_bulldog 41
Chesapeake_Bay_retriever 41
toy_poodle 37
cocker_spaniel 34
miniature_poodle 33
Name: p2, dtype: int64
image_predictions. p3. value_counts( ) . head( 10 )
Labrador_retriever 79
Chihuahua 58
golden_retriever 48
Eskimo_dog 38
kelpie 35
kuvasz 34
chow 32
Staffordshire_bullterrier 32
beagle 31
cocker_spaniel 31
Name: p3, dtype: int64
质量
twitter_archive
timestamp的数据类型错误。 tweet_id列为int64,是错误的,应该是字符串才对。(image_predictions也出现同样情况) retweeted_status_id为转发用户,有181条,是转发内容,我们只需要含有图片的原始评级。 source列包含html文本内容,需要删除。(tweet_json也出现同样情况) name列出现55个a的名字,应该是提取错了。 评分标准(分母)不全为10或10的倍数,出现其他异常的数值。 expanded_urls有59条是空的。我们需要的是含有图片的原始评级。
image_predictions
图片链接有66条重复的,是转发内容,我们只需要含有图片的原始评级。 image_predictions表中p1、p2、p3列应都把首字母改成大写。
整洁度
twitter_archive表中doggo、floofer、pupper、puppo属于类型变量,应该合并为一列。 三个数据集都有tweet_id,根据tidy data的第3个规则:观察单位按表格组织(即:一个种类的观察形成一个单独的表格),而这个项目里的3份数据内容实际都是围绕dog rating这一个观察主题,放在一个表格里才符合tidy data的要求。
注意点
应该先开始清理转发和无图片数据,然后再进行接下来其他的清理,以避免不合理清理误删数据。所以我们应该先把三个表格进行合并,再把重复的retweeted_status_id、twitter_id、图片链接等进行删除。
清理
twitter_archive_clean = twitter_archive. copy( )
image_predictions_clean = image_predictions. copy( )
tweet_json_clean = tweet_json. copy( )
合并三个表格并清理重复值
定义
用merge将三个表格进行合并,并注意测试是否还存在重复值,如果存在,进一步进行删除。
代码
tweet_json_clean. rename( columns = { 'id_str' : 'tweet_id' } , inplace= True )
tweet_json_clean[ 'tweet_id' ] = tweet_json_clean. tweet_id. astype( int )
tweet_json_clean = tweet_json_clean[ [ 'tweet_id' , 'retweet_count' , 'favorite_count' ] ]
dog_clean = twitter_archive_clean. merge( image_predictions_clean, how= 'inner' , on= 'tweet_id' ) . merge( tweet_json_clean, how= 'inner' , on= 'tweet_id' )
dog_clean = dog_clean[ dog_clean. retweeted_status_id. isnull( ) ]
dog_clean. drop( dog_clean[ [ 'retweeted_status_id' , 'retweeted_status_user_id' , 'retweeted_status_timestamp' ] ] , axis= 1 , inplace= True )
测试
dog_clean. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1994 entries, 0 to 2072
Data columns (total 27 columns):
tweet_id 1994 non-null int64
in_reply_to_status_id 23 non-null float64
in_reply_to_user_id 23 non-null float64
timestamp 1994 non-null object
source 1994 non-null object
text 1994 non-null object
expanded_urls 1994 non-null object
rating_numerator 1994 non-null int64
rating_denominator 1994 non-null int64
name 1994 non-null object
doggo 1994 non-null object
floofer 1994 non-null object
pupper 1994 non-null object
puppo 1994 non-null object
jpg_url 1994 non-null object
img_num 1994 non-null int64
p1 1994 non-null object
p1_conf 1994 non-null float64
p1_dog 1994 non-null bool
p2 1994 non-null object
p2_conf 1994 non-null float64
p2_dog 1994 non-null bool
p3 1994 non-null object
p3_conf 1994 non-null float64
p3_dog 1994 non-null bool
retweet_count 1994 non-null int64
favorite_count 1994 non-null int64
dtypes: bool(3), float64(5), int64(6), object(13)
memory usage: 395.3+ KB
timestamp的数据类型错误
定义
将string转成timestamp类型
代码
dog_clean[ 'timestamp' ] = pd. to_datetime( dog_clean. timestamp)
测试
dog_clean. sample( 1 )
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog retweet_count favorite_count 33 885167619883638784 NaN NaN 2017-07-12 16:03:00 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> Here we have a corgi undercover as a malamute. Pawbably doing important investigative work. Zero control over tongue happenings. 13/10 https://t.co/44ItaMubBf https://twitter.com/dog_rates/status/885167619883638784/photo/1,https://twitter.com/dog_rates/status/885167619883638784/photo/1,https://twitter.com/dog_rates/status/885167619883638784/photo/1,http... 13 10 None None None None None https://pbs.twimg.com/media/DEi_N9qXYAAgEEw.jpg 4 malamute 0.812482 True Siberian_husky 0.071712 True Eskimo_dog 0.05577 True 4526 22304
tweet_id列为int64,是错误的,应该是字符串才对
定义
转换数据类型,将int64转换成string。
代码
dog_clean[ 'tweet_id' ] = dog_clean[ 'tweet_id' ] . astype( 'str' )
测试
type ( dog_clean[ 'tweet_id' ] [ 0 ] )
str
source列包含html文本内容,需要删除
定义
用extract方法进行删除。
代码
dog_clean. source = dog_clean. source. str . extract( '>(.+?)<' , expand = True )
测试
dog_clean. source. value_counts( )
Twitter for iPhone 1955
Twitter Web Client 28
TweetDeck 11
Name: source, dtype: int64
name列出现55个a的名字,应该是提取错了
定义
用extract方法从text列中重新查找提取宠物狗狗的名字。
代码
dog_clean[ 'name' ] = dog_clean. text. str . extract( '(?:This is|Here we have a|Meet)\s([A-Z][^\s.,]*)' , expand = True )
dog_clean[ 'name' ] = dog_clean[ 'name' ] . fillna( 'N/A' )
测试
dog_clean. name. value_counts( ) . head( 5 )
N/A 735
Charlie 10
Tucker 9
Lucy 9
Cooper 9
Name: name, dtype: int64
sum ( dog_clean. name. isnull( ) )
0
评分标准(分母)不全为10或者10的倍数,出现其他异常的数值
定义
一只狗狗的分母是10,有2只应该就是20,以此类推。这些异常值应该是输入缺少0导致的,或者提取的时候没把0提取到。需要针对异常值到原本的text里面去查看。以此判断原本应该的数值。如果文本本身就是异常的,那应该根据异常值去推断一个相对合理的值。
代码
rating = dog_clean. text. str . extract( '((?:\d+\.)?\d+)\/(\d+)' , expand= True )
rating. columns = [ 'rating_numerator' , 'rating_denominator' ]
dog_clean[ 'rating_numerator' ] = rating[ 'rating_numerator' ] . astype( float )
dog_clean[ 'rating_denominator' ] = rating[ 'rating_denominator' ] . astype( float )
dog_clean. rating_denominator. value_counts( )
10.0 1976
50.0 3
80.0 2
11.0 2
130.0 1
170.0 1
150.0 1
2.0 1
120.0 1
110.0 1
40.0 1
90.0 1
20.0 1
7.0 1
70.0 1
Name: rating_denominator, dtype: int64
rating_text = dog_clean[ [ 'text' , 'rating_denominator' ] ]
rating_text[ rating_text[ 'rating_denominator' ] . isin( [ 11.0 ] ) ]
text rating_denominator 876 After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ 11.0 1405 This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 11.0
rating_text[ rating_text[ 'rating_denominator' ] . isin( [ 2.0 ] ) ]
text rating_denominator 2052 This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv 2.0
rating_text[ rating_text[ 'rating_denominator' ] . isin( [ 7.0 ] ) ]
text rating_denominator 414 Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx 7.0
dog_clean. loc[ dog_clean[ 'rating_denominator' ] == 11 , 'rating_denominator' ] = 10
dog_clean. loc[ dog_clean[ 'rating_denominator' ] == 2 , 'rating_denominator' ] = 10
dog_clean. loc[ dog_clean[ 'rating_denominator' ] == 7 , 'rating_denominator' ] = 70
测试
dog_clean. rating_denominator. value_counts( )
10.0 1979
50.0 3
80.0 2
70.0 2
130.0 1
150.0 1
120.0 1
110.0 1
40.0 1
90.0 1
20.0 1
170.0 1
Name: rating_denominator, dtype: int64
image_predictions表中p1、p2、p3列的首字母有的大写有的小写
定义
p1、p2、p3列应都把首字母改成大写
代码
dog_clean[ 'p1' ] = dog_clean[ 'p1' ] . str . title( )
dog_clean[ 'p2' ] = dog_clean[ 'p2' ] . str . title( )
dog_clean[ 'p3' ] = dog_clean[ 'p3' ] . str . title( )
测试
dog_clean. p1. value_counts( ) . head( 10 )
Golden_Retriever 139
Labrador_Retriever 95
Pembroke 88
Chihuahua 79
Pug 54
Chow 41
Samoyed 40
Toy_Poodle 38
Pomeranian 38
Malamute 29
Name: p1, dtype: int64
twitter_archive表中doggo、floofer、pupper、puppo属于类型变量。
定义
重新提取并化为一列
代码
stage = dog_clean[ [ 'tweet_id' , 'doggo' , 'floofer' , 'pupper' , 'puppo' ] ]
stage_replace = stage. replace( { 'None' : 0 , 'doggo' : 1 , 'floofer' : 1 , 'pupper' : 1 , 'puppo' : 1 } )
stage_replace. sum ( )
tweet_id inf
doggo 74.000000
floofer 8.000000
pupper 212.000000
puppo 23.000000
dtype: float64
final_stage = stage_replace. melt( 'tweet_id' , var_name = 'stage' ) . query( 'value == 1' ) . drop( columns= [ 'value' ] )
final_stage[ final_stage. duplicated( 'tweet_id' ) ]
tweet_id stage 2148 854010172552949760 floofer 4328 817777686764523521 pupper 4385 808106460588765185 pupper 4407 802265048156610565 pupper 4413 801115127852503040 pupper 4498 785639753186217984 pupper 4640 759793422261743616 pupper 4692 751583847268179968 pupper 4783 741067306818797568 pupper 4829 733109485275860992 pupper 6130 855851453814013952 puppo
dog_clean = dog_clean. merge( final_stage, how= 'left' , on= 'tweet_id' )
dog_clean = dog_clean. drop( dog_clean[ [ 'doggo' , 'floofer' , 'pupper' , 'puppo' ] ] , axis= 1 )
测试
dog_clean[ 'stage' ] . value_counts( )
pupper 212
doggo 74
puppo 23
floofer 8
Name: stage, dtype: int64
dog_clean. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2005 entries, 0 to 2004
Data columns (total 24 columns):
tweet_id 2005 non-null object
in_reply_to_status_id 24 non-null float64
in_reply_to_user_id 24 non-null float64
timestamp 2005 non-null datetime64[ns]
source 2005 non-null object
text 2005 non-null object
expanded_urls 2005 non-null object
rating_numerator 2005 non-null float64
rating_denominator 2005 non-null float64
name 2005 non-null object
jpg_url 2005 non-null object
img_num 2005 non-null int64
p1 2005 non-null object
p1_conf 2005 non-null float64
p1_dog 2005 non-null bool
p2 2005 non-null object
p2_conf 2005 non-null float64
p2_dog 2005 non-null bool
p3 2005 non-null object
p3_conf 2005 non-null float64
p3_dog 2005 non-null bool
retweet_count 2005 non-null int64
favorite_count 2005 non-null int64
stage 317 non-null object
dtypes: bool(3), datetime64[ns](1), float64(7), int64(3), object(10)
memory usage: 350.5+ KB
存储清理后的主数据集
dog_clean. to_csv( 'twitter_archive_master.csv' , index= False )
分析和可视化
提出问题:
点赞数最多的狗狗是哪种? Twiiter上面根据图片预测出来的狗狗种类最多的是哪种? 哪个宠物名使用频率最高? 狗狗的评分是否与点赞数相关?
twitter_archive_master = pd. read_csv( 'twitter_archive_master.csv' )
twitter_archive_master. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2005 entries, 0 to 2004
Data columns (total 24 columns):
tweet_id 2005 non-null int64
in_reply_to_status_id 24 non-null float64
in_reply_to_user_id 24 non-null float64
timestamp 2005 non-null object
source 2005 non-null object
text 2005 non-null object
expanded_urls 2005 non-null object
rating_numerator 2005 non-null float64
rating_denominator 2005 non-null float64
name 1263 non-null object
jpg_url 2005 non-null object
img_num 2005 non-null int64
p1 2005 non-null object
p1_conf 2005 non-null float64
p1_dog 2005 non-null bool
p2 2005 non-null object
p2_conf 2005 non-null float64
p2_dog 2005 non-null bool
p3 2005 non-null object
p3_conf 2005 non-null float64
p3_dog 2005 non-null bool
retweet_count 2005 non-null int64
favorite_count 2005 non-null int64
stage 317 non-null object
dtypes: bool(3), float64(7), int64(4), object(10)
memory usage: 334.9+ KB
twitter_archive_master. head( 1 )
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text expanded_urls rating_numerator rating_denominator name jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog retweet_count favorite_count stage 0 892420643555336193 NaN NaN 2017-08-01 16:23:56 Twitter for iPhone This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU https://twitter.com/dog_rates/status/892420643555336193/photo/1 13.0 10.0 Phineas https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 Orange 0.097049 False Bagel 0.085851 False Banana 0.07611 False 8842 39492 NaN
点赞数最多的狗狗是哪种?
varieties = twitter_archive_master[ [ 'tweet_id' , 'p1' , 'p1_dog' , 'p2' , 'p2_dog' , 'p3' , 'p3_dog' , 'favorite_count' ] ]
which_kind= [ ]
for kind in varieties. index:
if varieties. p1_dog. loc[ kind] == True :
which_kind. append( varieties. p1. loc[ kind] )
elif varieties. p2_dog. loc[ kind] == True :
which_kind. append( varieties. p2. loc[ kind] )
elif varieties. p3_dog. loc[ kind] == True :
which_kind. append( varieties. p3. loc[ kind] )
else :
which_kind. append( np. nan)
varieties[ 'dog_kind' ] = which_kind
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
favorite = varieties. groupby( 'dog_kind' ) [ 'favorite_count' ] . sum ( ) . sort_values( ascending = False ) . head( 5 ) . reset_index( )
sb. barplot( x = favorite. dog_kind, y = favorite[ 'favorite_count' ] , color = '#48D3D3' )
plt. title( 'What kind of dog with the most favorite?' )
plt. xticks( rotation = 45 )
plt. show( )
Twiiter上面根据图片预测出来的狗狗种类最多的是哪种?
Count_most = varieties. groupby( 'dog_kind' ) [ 'tweet_id' ] . sum ( ) . sort_values( ascending = False ) . head( 5 ) . reset_index( )
sb. barplot( x = favorite. dog_kind, y = Count_most[ 'tweet_id' ] , color = '#48D3D3' )
plt. title( 'What kind of dog predict most?' )
plt. xticks( rotation = 45 )
plt. show( )
哪个宠物名使用频率最高?
dog_order = twitter_archive_master[ 'name' ] . value_counts( ) . head( 5 ) . index
sb. countplot( data = twitter_archive_master, x = 'name' , color = '#48D3D3' , order = dog_order)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7f0ced8d68>
Correlation = twitter_archive_master[ [ 'rating_numerator' , 'favorite_count' ] ]
Correlation = Correlation[ Correlation. rating_numerator < 15 ]
sb. set_style( "white" )
% matplotlib inline
sb. regplot( x = 'rating_numerator' , y = 'favorite_count' , data = Correlation, color = '#48D3D3' )
plt. show( )
结论
1.从第一个问题的点赞数和第二个问题的预测种类数来看,Golden_Retriver这种狗狗最受人欢迎,大部分网友也喜欢把这种狗狗的图片放到Twitter; 2.宠物名使用频率最高的是Charlie,但是Cooper、Oliver、Lucy的名字使用的频率也很高,如果不想重名可以避开这些名字。 3.从最后一个问题可以看出,狗狗的评分是点赞数是相关的,呈正相关。点赞数越多,评分越高。