使用英国某零售平台交易数据
# 查看特征和类型
# 发票编号、商品名称、简介、数量、日期、单价、用户ID、国家
DESC uk
# 查看前5行
SELECT * FROM uk LIMIT 5;
1 缺失值检测
# 总行数、各字段的行数(是否存在缺失值)
SELECT COUNT(*),COUNT(invoiceno),COUNT(stockcode),COUNT(description),
COUNT(quantity),COUNT(invoicedate),COUNT(unitprice),COUNT(customerid),
COUNT(country) FROM uk;
# 共541909行,其中发票编号、描述、customerid存在缺失
2 异常值检测
数值型的数量和单价可能存在异常
【数量和单价】
SELECT MAX(quantity),MIN(quantity),MAX(unitprice),MIN(unitprice)
FROM uk;
SELECT COUNT(*) FROM uk WHERE quantity<=0 # 10000多条
SELECT COUNT(*) FROM uk WHERE unitprice<=0 # 2条异常
# 去掉2条小于0的,将10000条的变为正数
【样本时间范围】-2010-12-01到2011-12-09
SELECT MAX(invoicedate),MIN(invoicedate) FROM uk;
【每月记录数】-没有很少的
SELECT YEAR(invoicedate),MONTH(invoicedate),COUNT(*)
FROM uk
GROUP BY YEAR(invoicedate),MONTH(invoicedate);
3 重复值检测-536641,541909存在重复值
SELECT COUNT(*) FROM (SELECT DISTINCT * FROM uk) t;
SELECT COUNT(*) FROM uk;
4 处理缺失、异常、重复
# quantity<0的变为正
UPDATE uk
SET quantity=quantity*(-1)
WHERE quantity<0
# 删除customerid为空,unitprice<0,quantity=0
DELETE FROM uk
WHERE CustomerID IS NULL
OR unitprice<=0 OR quantity=0
# 再次查看是否缺失,总记录406789,invoiceNo397884条
SELECT COUNT(invoiceno),COUNT(stockcode),COUNT(description),
COUNT(quantity),COUNT(invoicedate),COUNT(unitprice),COUNT(customerid),
COUNT(country) FROM uk
# 再次查看是否存在重复值-存在
SELECT COUNT(*) FROM (SELECT DISTINCT * FROM uk) t;
SELECT COUNT(*) FROM uk;
# 建新表,去重,删去原表
CREATE TABLE uk2 AS
SELECT DISTINCT * FROM uk;
DROP TABLE uk;
# invoiceNo缺失比例计算-各月比例大致相同,直接删除对订单月分布影响不大
SELECT YEAR(invoicedate) `年`,MONTH(invoicedate) `月`,
SUM(InvoiceNo IS NULL) `缺失数`,COUNT(*) `记录数`,ROUND(SUM(InvoiceNo IS NULL)/COUNT(*),2) `缺失比例`
FROM uk2
GROUP BY YEAR(invoicedate),MONTH(invoicedate)
# 删除缺失
DELETE FROM uk2 WHERE Invoiceno IS NULL;
交易总体状况
SELECT COUNT(DISTINCT invoiceno) `总订单量`,ROUND(SUM(unitprice*quantity),2) `总订单金额`,
COUNT(DISTINCT customerid) `消费者数量`,SUM(quantity) `商品总销量`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT customerid),2) `客单价`,
ROUND(SUM(quantity)/COUNT(DISTINCT invoiceno),2) `连带率`
FROM uk2;
# 每月订单数、订单金额、消费者数量、商品总量
SELECT DATE_FORMAT(invoicedate,'%Y年%m月') `月份`,
COUNT(DISTINCT invoiceno) `订单数`,
ROUND(SUM(unitprice*quantity),2) `订单金额`,
COUNT(DISTINCT customerid) `消费者数量`,
SUM(quantity) `出售商品总量`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT customerid),2) `客单价`,
ROUND(SUM(quantity)/COUNT(DISTINCT customerid),2) `连带率`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT invoiceno),2) `每单平均金额`,
ROUND(SUM(quantity)/COUNT(DISTINCT invoiceno)) `每单平均数量`
FROM uk2
GROUP BY DATE_FORMAT(invoicedate,'%Y年%m月')
# 10月、11月是订单高峰期
一、订单量分布
按交易地区
订单top5的国家
SELECT country,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),3) `比例`
FROM uk2 GROUP BY country ORDER BY `订单数` DESC LIMIT 5;
按交易时间
一个月内各天订单数及比例
SELECT DAY(invoicedate) `日`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),2) `比例`
FROM uk2
GROUP BY DAY(invoicedate)
ORDER BY `日`;
# 月末订单明显少
星期内订单数及比例
SELECT DAYOFWEEK(invoicedate) `星期`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoi