京东用户购买意向预测

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime
import time
import seaborn as sns
from matplotlib import pyplot as plt
# Select the SimHei font and turn off the Unicode minus sign so that
# figures can render Chinese text.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Paths of all raw data files; they are loaded below and the load time is printed.
_JDATA_DIR = r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData"
path_Action2 = _JDATA_DIR + r"\JData_Action_201602.csv"
path_Action3 = _JDATA_DIR + r"\JData_Action_201603.csv"
path_Action4 = _JDATA_DIR + r"\JData_Action_201604.csv"
path_Comment = _JDATA_DIR + r"\JData_Comment.csv"
path_Product = _JDATA_DIR + r"\JData_Product.csv"
path_User = _JDATA_DIR + r"\JData_User1.csv"

# [path_Action2,path_Action3,path_Action4,path_Comment,path_Product,path_User]
# 提取行为数据,方法一
def get_from_action_data(fname, chunk_size=500000):
    """Read a CSV file in chunks and return it as a single DataFrame.

    The file is read ``chunk_size`` rows at a time, the pieces are
    concatenated, and the elapsed time plus the first 10 rows are printed.

    Args:
        fname: Path or file-like object handed to ``pd.read_csv``.
        chunk_size: Maximum number of rows read per chunk.

    Returns:
        The concatenated ``DataFrame`` with a fresh 0..n-1 index.
    """
    start = time.perf_counter()
    reader = pd.read_csv(fname, header=0, iterator=True, chunksize=chunk_size)
    try:
        # Iterating the reader yields one DataFrame per chunk until exhausted.
        chunks = list(reader)
    finally:
        # Release the underlying file handle even if parsing fails midway.
        reader.close()
    print("Iteration is stopped")

    df_ac = pd.concat(chunks, ignore_index=True)
    elapsed = time.perf_counter() - start
    print("累计耗时:{}s".format(int(elapsed)))
    print(df_ac.head(10))
    # Hand the loaded data back so callers can use it, not just see the preview.
    return df_ac

# Load the JData_Action_201603.csv file; prints the elapsed time and the first 10 rows.
get_from_action_data(path_Action3)
Iteration is stopped
累计耗时:19s
    user_id  sku_id                 time  model_id  type  cate  brand
0  280567.0  167208  2016-02-29 23:59:01       0.0     6     4    519
1  270248.0   35533  2016-02-29 23:59:02     111.0     6     4    306
2  203360.0   78694  2016-02-29 23:59:02       NaN     1     8    244
3  252369.0   90402  2016-02-29 23:59:03       0.0     6     7     38
4  279590.0  154208  2016-02-29 23:59:03       0.0     6     5    570
5  203360.0   78694  2016-02-29 23:59:03       0.0     6     8    244
6  279590.0  154208  2016-02-29 23:59:03       0.0     6     5    570
7  279590.0  154208  2016-02-29 23:59:03       NaN     1     5    570
8  252369.0   90402  2016-02-29 23:59:04      13.0     6     7     38
9  257109.0   95850  2016-02-29 23:59:04       0.0     6     8    800
# 检查数据的完整性
def check_action_user():
    user = pd.read_csv(r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData\JData_User.csv")
#     为了减少数据处理量,只需要提取主键做内连接即可
    user_ = user['user_id']
    df02 = pd.read_csv(r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData\JData_Action_201602.csv")
    print('2月数据是否完整:',(len(df02) == len(pd.merge(df02,user_))))
    
    df03 = pd.read_csv(r"C:\Users\Jupyter_data\笔记相关\数据练习\data\JData\JData_Action_201603.csv")
    print('3月数据是否完整:',(len(df03) == len(pd.merge(df03,user_))))
    
    df04 = pd
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值