PS:如果各位有疑问的话,可以留下微信,我看到会第一时间加的,以后可以多多交流
CREATE TABLE data1(姓 varchar(10),户籍地城市编号 text CHECK (length(户籍地城市编号)=6));
\COPY data1 FROM ‘C:\Users\Administrator\Desktop\数据可视化课程\考核\考核项目2_数据清洗及筛选综合实践\data01.csv’ WITH CSV HEADER ENCODING ‘utf8’
–导入data1数据,设置约束
CREATE TABLE data2(LIKE data1);
\COPY data2 FROM 'C:\Users\Administrator\Desktop\数据可视化课程\考核\考核项目2_数据清洗及筛选综合实践\data02.csv' WITH CSV HEADER ENCODING 'utf8'
--导入data2数据
CREATE TABLE cities(
行政编码 int,
省 varchar(10),
市 varchar(10),
区县 text,
lng numeric,
lat numeric);
\COPY cities FROM 'C:\Users\Administrator\Desktop\数据可视化课程\考核\考核项目2_数据清洗及筛选综合实践\中国行政代码对照表.csv' WITH CSV HEADER ENCODING 'utf8'
--导入cities数据
SELECT * FROM data1 WHERE
SUBSTRING(户籍地城市编号,1,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,2,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,3,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,4,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,5,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,6,1) NOT IN ('1','2','3','4','5','6','7','8','9','0');
--查看data1问题数据
SELECT * FROM data2 WHERE
SUBSTRING(户籍地城市编号,1,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,2,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,3,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,4,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,5,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,6,1) NOT IN ('1','2','3','4','5','6','7','8','9','0');
--查看data2问题数据
DELETE FROM data1 WHERE
SUBSTRING(户籍地城市编号,1,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,2,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,3,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,4,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,5,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,6,1) NOT IN ('1','2','3','4','5','6','7','8','9','0');
--删除data1问题数据
DELETE FROM data2 WHERE
SUBSTRING(户籍地城市编号,1,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,2,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,3,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,4,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,5,1) NOT IN ('1','2','3','4','5','6','7','8','9','0') OR
SUBSTRING(户籍地城市编号,6,1) NOT IN ('1','2','3','4','5','6','7','8','9','0');
--删除data2问题数据
ALTER TABLE data1 DROP CONSTRAINT data1_户籍地城市编号_check;
--删除约束
ALTER TABLE data1 ALTER COLUMN 户籍地城市编号 TYPE int USING(户籍地城市编号::int);
ALTER TABLE data2 ALTER COLUMN 户籍地城市编号 TYPE int USING(户籍地城市编号::int);
--修改两个表内户籍地城市编号的字符类型
CREATE TABLE data AS
SELECT * FROM data1 UNION ALL SELECT * FROM data2;
--合并data1和data2为data
CREATE TABLE result_data AS
SELECT a.姓,a.户籍地城市编号,b.省,b.市,b.区县,b.lng,b.lat
FROM data a INNER JOIN cities b
ON a.户籍地城市编号 = b.行政编码;
--连接data和cities为result_data
SELECT 姓,COUNT(姓),
CAST(COUNT(姓) AS numeric)/ (SELECT COUNT(姓) FROM result_data) AS 姓氏占比
FROM result_data GROUP BY 姓 ORDER BY COUNT(姓) DESC LIMIT 20;
--转换COUNT姓的格式后求出占比