学习记录-0526

Python数据分析

1——来源:B站[小猿Python的主页]

  • numpy matplotlib pandas
  • RFM用户分层 用户生命周期(unactive,unreg,active,return,new)
# author:
# datetime:2022/5/25 9:13
# software: PyCharm
"""
文件说明:
"""
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from matplotlib import pyplot as plt

name = ['user_id', 'order_dt', 'order_product', 'order_amount']
df = pd.read_csv('./CDNOW_master.txt', header=None, sep='\s+', names=name)
df['order_dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
# print(df.head())
# print(df.info())
# print(df['order_product'].mean())
# print(df['order_amount'].mean())
df['month'] = df['order_dt'].astype('datetime64[M]')
# print(df.head())

# df.groupby(by='month')['order_amount'].sum().plot()
# df.groupby(by='month')['order_product'].sum().plot()
# df.groupby(by='month')['user_id'].nunique().plot()
# plt.show()

# df.groupby(by='user_id').count()['order_dt']
# df.groupby(by='user_id')['order_amount'].sum().plot()
# 先求和操作,再条件查询 2022-5-25 09:47:17
# df.groupby(by='user_id').sum().query('order_amount <= 1000')['order_amount'].hist()

# df.groupby('user_id')['month'].min()
# df.groupby('user_id')['month'].min().value_counts().plot()
# df.groupby('user_id')['month'].max().value_counts().plot()

# 新老客户
# old_new_user = df.groupby(by='user_id')['order_dt'].agg(['min', 'max'])
# old_new_user['min'] == old_new_user['max'].value_counts()
# print((old_new_user['min'] == old_new_user['max']).value_counts())


# 客户分层
# rfm = df.pivot_table(index='user_id', aggfunc={'order_amount': 'sum', 'order_product': 'sum',  'order_dt': 'max'})
# print(rfm.head())
# max_dt = df['order_dt'].max()  # 今天的日期
# rfm['R'] = (max_dt - df.groupby(by='user_id')['order_dt'].max()) / np.timedelta64(1, 'D')
# rfm.drop(labels='order_dt', inplace=True, axis=1)
# rfm.columns = ['M', 'F', 'R']
# print(rfm.head())
# print(type(rfm.loc[1]))
# aa = rfm.loc[1].map(lambda x: '1' if x>100 else '0')
# print(type(aa))
# print(aa)
# print(aa.R + aa.F + aa.M)

# df1 = DataFrame([[1,3], [4,7], [5,8]], columns=['A', 'B'])
# print(df1)
# print(df1.apply(lambda x: x.mean()))


# def rfm_func(series):
#     res = series.map(lambda x: '1' if x >= 0 else '0')
#     level = res.R + res.F + res.M
#     d = {
#         '000': '一般发展客户',
#         '001': '重要发展客户',
#         '010': '一般保持客户',
#         '011': '重要保持客户',
#         '100': '一般挽留客户',
#         '101': '重要挽留客户',
#         '110': '一般价值客户',
#         '111': '重要价值客户'
#     }
#     result = d[level]
#     return result
# rfm['label'] = rfm.apply(lambda x: x-x.mean()).apply(rfm_func, axis=1)
# print(rfm)

# 用户生命周期
ddd = df.pivot_table(index='user_id', values='order_dt', aggfunc='count', columns='month').fillna(0)
# print(ddd)
df_purchase = ddd.applymap(lambda x: 1 if x >= 1 else 0)
# print(df_purchase.head())
# 观望用户   unreg
# 非活跃用户 unactive
# 新用户     new
# 活跃用户   active
# 回头客     return

def active_purchase(data):
    status = [] # 某个用户每个月的活跃度
    for i in range(18):
        if data[i] == 0:  # 本月没有消费
            if len(status) > 0:
                if status[i-1] == 'unreg':
                    status.append('unreg')
                else:
                    status.append('unactive')
            else:
                status.append('unreg')
        else:  # 本月有消费
            if len(status) == 0:
                status.append('new')
            else:
                if status[i-1] == 'unactive':
                    status.append('return')
                elif status[i-1] == 'unreg':
                    status.append('new')
                else:
                    status.append('active')
    return status

pivot_status = df_purchase.apply(active_purchase, axis=1)
print(pivot_status.head())
data_active = pivot_status.values.tolist()
df_user_active = DataFrame(data=data_active, index=df_purchase.index, columns=df_purchase.columns)
# print(df_user_active.head(50).T)
print(df_user_active.apply(lambda x: pd.value_counts(x)).fillna(0).T)
  • Series DataFrame
  • axis的操作区分
  • 索引和切片 loc(), iloc(), query()

2——来源:B站(戴戴戴师兄)

2-1 sqlzoo习题

  • select * from nobel where subject not in (‘Chemistry’, ‘Medicine’) order by subject in (‘Chemistry’, ‘Physics’) 排在最后
  • select name from world where name like ‘%a%’ and name like '%o% and ’ name like ‘%e%’ and name like ‘%i%’ and name like ‘%u%’ and name not like ‘% %’ 包含所有元音字母,不包含空格
  • select winner, yr, subject from nobel order by yr desc, winner asc 降序与升序
  • select name, population from world limit 3, 4 查询第4行到第7行的数据[4567行]
  • select sum(population) from world where continent = ‘Africa’ 非洲总人口
  • select count(*) from world 表的总行数
  • select count(name) from world
  • select continent, count(name) from world group by continent group by 先分区在分组
  • select yr, subject, count(winner) from nobel where yr in (2013, 2014) group by yr, subject order by yr desc, count(winner) desc
  • select sum(population) from world where name in (‘Estonia’, ‘Latvia’ , ‘Lithuania’) 国家总人口
  • select continent, count(name) from world where population > 10000000 group by continent 国家个数
  • select continent, sum(population) ssm from world group by continent having ssm >= 100000000 总人口
  • select continent, avg(gdp) from worldwhere (gdp > 20000000000 and population > 60000000) or (gdp > 8000000000 and capital like ‘%a%a%a%’) group by continent having sum(population) >= 3000000000 order by count(name) desc limit 1
  • select continent, sum(population), sum(gdp) / sum(population) avg from world where gdp between 20000000000
    and 30000000000 group by continent having sum(gdp) / sum(population) > 3000 人均GDP
  • concat()、left()、right()、substring()、replace() 字符串处理
  • year()、datediff()、date_add()、date_sub()、date_format() 日期函数
  • if()、case when()、round()
  • select name, capital from world where substring(name,1,1) = substring(capital,1,1) and name != capital
  • select name, capital from world where left(name,1) = left(capital,1) and name != capital
  • select name, capital from world where length(name) < length(capital) and left(capital, length(name)) = name
  • select name, capital from world where name != capital and capital like concat(‘%’, name, ‘%’) 字段包含关系
  • lag(confirmed, 7) comnfirmed列,往上取值 [周同比]
  • select party, votes, rank() over(order by votes desc) as rank from ge where constituency = ‘S14000024’
    AND yr = 2017 order by party 窗口函数

3 来源——贪心科技(10周成为数据分析师)

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值