import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime
import time
import seaborn as sns
from matplotlib import pyplot as plt
# Configure matplotlib once for the whole script: use the SimHei font for
# sans-serif text and disable the Unicode minus glyph on axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Locations of the JData CSV files; every file lives in the same directory,
# so each path is the shared base directory plus the file name.
_JDATA_DIR = r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData"

# Per-month action logs (February, March, April 2016 going by the file names).
path_Action2 = _JDATA_DIR + "\\JData_Action_201602.csv"
path_Action3 = _JDATA_DIR + "\\JData_Action_201603.csv"
path_Action4 = _JDATA_DIR + "\\JData_Action_201604.csv"

# Comment, product and user tables.
path_Comment = _JDATA_DIR + "\\JData_Comment.csv"
path_Product = _JDATA_DIR + "\\JData_Product.csv"
# NOTE(review): check_action_user reads "JData_User.csv" while this constant
# names "JData_User1.csv" -- confirm which file is intended.
path_User = _JDATA_DIR + "\\JData_User1.csv"
def get_from_action_data(fname, chunk_size=500000):
    """Read a large CSV file in chunks and return it as a single DataFrame.

    The file is parsed ``chunk_size`` rows at a time and the pieces are
    concatenated with a fresh 0..n-1 index. Progress information (a
    completion message, the elapsed whole seconds and the first 10 rows) is
    printed to stdout.

    Args:
        fname: Path of the CSV file to read; its first row is used as the
            header.
        chunk_size: Number of rows to parse per chunk.

    Returns:
        The concatenated ``pandas.DataFrame`` (empty if no chunk was read).
    """
    start = time.perf_counter()

    # iterator=True guarantees a reader object even if chunk_size is None.
    reader = pd.read_csv(fname, header=0, iterator=True, chunksize=chunk_size)
    try:
        # Iterating the reader yields one DataFrame per chunk until the file
        # is exhausted.
        chunks = list(reader)
    finally:
        # Release the underlying file handle even if parsing fails midway.
        reader.close()
    print("Iteration is stopped")

    # pd.concat raises on an empty list, so fall back to an empty frame.
    if chunks:
        df_ac = pd.concat(chunks, ignore_index=True)
    else:
        df_ac = pd.DataFrame()

    elapsed = time.perf_counter() - start
    print("累计耗时:{}s".format(int(elapsed)))
    print(df_ac.head(10))
    return df_ac
# Load the JData_Action_201603.csv action log in chunks; the call prints the
# elapsed time and the first 10 rows of the combined frame.
get_from_action_data( path_Action3)
# Recorded output of the call above:
# Iteration is stopped
# 累计耗时:19
# user_id sku_id time model_id type cate brand
# 0 280567.0 167208 2016-02-29 23:59:01 0.0 6 4 519
# 1 270248.0 35533 2016-02-29 23:59:02 111.0 6 4 306
# 2 203360.0 78694 2016-02-29 23:59:02 NaN 1 8 244
# 3 252369.0 90402 2016-02-29 23:59:03 0.0 6 7 38
# 4 279590.0 154208 2016-02-29 23:59:03 0.0 6 5 570
# 5 203360.0 78694 2016-02-29 23:59:03 0.0 6 8 244
# 6 279590.0 154208 2016-02-29 23:59:03 0.0 6 5 570
# 7 279590.0 154208 2016-02-29 23:59:03 NaN 1 5 570
# 8 252369.0 90402 2016-02-29 23:59:04 13.0 6 7 38
# 9 257109.0 95850 2016-02-29 23:59:04 0.0 6 8 800
def check_action_user ( ) :
user = pd. read_csv( r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData\JData_User.csv" )
user_ = user[ 'user_id' ]
df02 = pd. read_csv( r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData\JData_Action_201602.csv" )
print ( '2月数据是否完整:' , ( len ( df02) == len ( pd. merge( df02, user_) ) ) )
df03 = pd. read_csv( r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData\JData_Action_201603.csv" )
print ( '3月数据是否完整:' , ( len ( df03) == len ( pd. merge( df03, user_) ) ) )
df04 = pd