学习记录-0526

Onssd

已于 2022-08-05 19:29:06 修改

阅读量78

点赞数

分类专栏： MYSQL 文章标签：学习 python 数据分析

于 2022-05-26 17:48:57 首次发布

仅限个人学习使用

本文链接：https://blog.csdn.net/Onssd/article/details/124989305

版权

MYSQL 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

Python数据分析

1——来源：B站[小猿Python的主页]

numpy matplotlib pandas
RFM用户分层用户生命周期(unactive,unreg,active,return,new)

# author:
# datetime:2022/5/25 9:13
# software: PyCharm
"""
文件说明：
"""
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from matplotlib import pyplot as plt

name = ['user_id', 'order_dt', 'order_product', 'order_amount']
df = pd.read_csv('./CDNOW_master.txt', header=None, sep='\s+', names=name)
df['order_dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
# print(df.head())
# print(df.info())
# print(df['order_product'].mean())
# print(df['order_amount'].mean())
df['month'] = df['order_dt'].astype('datetime64[M]')
# print(df.head())

# df.groupby(by='month')['order_amount'].sum().plot()
# df.groupby(by='month')['order_product'].sum().plot()
# df.groupby(by='month')['user_id'].nunique().plot()
# plt.show()

# df.groupby(by='user_id').count()['order_dt']
# df.groupby(by='user_id')['order_amount'].sum().plot()
# 先求和操作,再条件查询 2022-5-25 09:47:17
# df.groupby(by='user_id').sum().query('order_amount <= 1000')['order_amount'].hist()

# df.groupby('user_id')['month'].min()
# df.groupby('user_id')['month'].min().value_counts().plot()
# df.groupby('user_id')['month'].max().value_counts().plot()

# 新老客户
# old_new_user = df.groupby(by='user_id')['order_dt'].agg(['min', 'max'])
# old_new_user['min'] == old_new_user['max'].value_counts()
# print((old_new_user['min'] == old_new_user['max']).value_counts())


# 客户分层
# rfm = df.pivot_table(index='user_id', aggfunc={'order_amount': 'sum', 'order_product': 'sum',  'order_dt': 'max'})
# print(rfm.head())
# max_dt = df['order_dt'].max()  # 今天的日期
# rfm['R'] = (max_dt - df.groupby(by='user_id')['order_dt'].max()) / np.timedelta64(1, 'D')
# rfm.drop(labels='order_dt', inplace=True, axis=1)
# rfm.columns = ['M', 'F', 'R']
# print(rfm.head())
# print(type(rfm.loc[1]))
# aa = rfm.loc[1].map(lambda x: '1' if x>100 else '0')
# print(type(aa))
# print(aa)
# print(aa.R + aa.F + aa.M)

# df1 = DataFrame([[1,3], [4,7], [5,8]], columns=['A', 'B'])
# print(df1)
# print(df1.apply(lambda x: x.mean()))


# def rfm_func(series):
#     res = series.map(lambda x: '1' if x >= 0 else '0')
#     level = res.R + res.F + res.M
#     d = {
#         '000': '一般发展客户',
#         '001': '重要发展客户',
#         '010': '一般保持客户',
#         '011': '重要保持客户',
#         '100': '一般挽留客户',
#         '101': '重要挽留客户',
#         '110': '一般价值客户',
#         '111': '重要价值客户'
#     }
#     result = d[level]
#     return result
# rfm['label'] = rfm.apply(lambda x: x-x.mean()).apply(rfm_func, axis=1)
# print(rfm)

# 用户生命周期
ddd = df.pivot_table(index='user_id', values='order_dt', aggfunc='count', columns='month').fillna(0)
# print(ddd)
df_purchase = ddd.applymap(lambda x: 1 if x >= 1 else 0)
# print(df_purchase.head())
# 观望用户   unreg
# 非活跃用户 unactive
# 新用户     new
# 活跃用户   active
# 回头客     return

def active_purchase(data):
    status = [] # 某个用户每个月的活跃度
    for i in range(18):
        if data[i] == 0:  # 本月没有消费
            if len(status) > 0:
                if status[i-1] == 'unreg':
                    status.append('unreg')
                else:
                    status.append('unactive')
            else:
                status.append('unreg')
        else:  # 本月有消费
            if len(status) == 0:
                status.append('new')
            else:
                if status[i-1] == 'unactive':
                    status.append('return')
                elif status[i-1] == 'unreg':
                    status.append('new')
                else:
                    status.append('active')
    return status

pivot_status = df_purchase.apply(active_purchase, axis=1)
print(pivot_status.head())
data_active = pivot_status.values.tolist()
df_user_active = DataFrame(data=data_active, index=df_purchase.index, columns=df_purchase.columns)
# print(df_user_active.head(50).T)
print(df_user_active.apply(lambda x: pd.value_counts(x)).fillna(0).T)

Series DataFrame
axis的操作区分
索引和切片 loc(), iloc(), query()

2——来源：B站(戴戴戴师兄)

2-1 sqlzoo习题

select * from nobel where subject not in (‘Chemistry’, ‘Medicine’) order by subject in (‘Chemistry’, ‘Physics’) 排在最后
select name from world where name like ‘%a%’ and name like '%o% and ’ name like ‘%e%’ and name like ‘%i%’ and name like ‘%u%’ and name not like ‘% %’ 包含所有元音字母，不包含空格
select winner, yr, subject from nobel order by yr desc, winner asc 降序与升序
select name, population from world limit 3, 4 查询第4行到第7行的数据[4567行]
select sum(population) from world where continent = ‘Africa’ 非洲总人口
select count(*) from world 表的总行数
select count(name) from world
select continent, count(name) from world group by continent group by 先分区在分组
select yr, subject, count(winner) from nobel where yr in (2013, 2014) group by yr, subject order by yr desc, count(winner) desc
select sum(population) from world where name in (‘Estonia’, ‘Latvia’ , ‘Lithuania’) 国家总人口
select continent, count(name) from world where population > 10000000 group by continent 国家个数
select continent, sum(population) ssm from world group by continent having ssm >= 100000000 总人口
select continent, avg(gdp) from worldwhere (gdp > 20000000000 and population > 60000000) or (gdp > 8000000000 and capital like ‘%a%a%a%’) group by continent having sum(population) >= 3000000000 order by count(name) desc limit 1
select continent, sum(population), sum(gdp) / sum(population) avg from world where gdp between 20000000000
and 30000000000 group by continent having sum(gdp) / sum(population) > 3000 人均GDP
concat()、left()、right()、substring()、replace() 字符串处理
year()、datediff()、date_add()、date_sub()、date_format() 日期函数
if()、case when()、round()
select name, capital from world where substring(name,1,1) = substring(capital,1,1) and name != capital
select name, capital from world where left(name,1) = left(capital,1) and name != capital
select name, capital from world where length(name) < length(capital) and left(capital, length(name)) = name
select name, capital from world where name != capital and capital like concat(‘%’, name, ‘%’) 字段包含关系
lag(confirmed, 7) comnfirmed列，往上取值 [周同比]
select party, votes, rank() over(order by votes desc) as rank from ge where constituency = ‘S14000024’
AND yr = 2017 order by party 窗口函数