本文使用的数据集下载链接:
https://download.csdn.net/download/shangjg03/88478086
1.准备数据
该数据集包含有关英格兰和威尔士自1995年起到2023年的房地产价格的数据,超过2800万条记录,未压缩形式的数据集大小超过4GB,在ClickHouse中需要约306MB。
2.hive中建表
create table uk_price_paid(
id string,
price int,
trans_date date,
postcode string,
type string,
is_new string,
duration string,
addr1 string,
addr2 string,
street string,
locality string,
town string,
district string,
county string,
category string
)
row format delimited fields terminated by '#' lines terminated by '\n' stored as textfile;
查看表
3.数据预处理
由于数据中有部分字段包含逗号‘,’,所以数据要预处理一下。
# 查看文件中包含某个字符
cat pp-complete.csv|grep '#'
# 全文修改一个字符到另一个字符
sed 's/","/"#"/g' pp-complete.csv > pp-complete-ext.csv
sed 's/"#"/#/g' pp-complete.csv > pp-complete3.csv
4. 导入数据
load data local inpath '/home/datasets/pp-complete4.csv' into table uk_price_paid;
导入成功。
查看数据
select * from uk_price_paid limit 10;
5. 使用数据
5.1 统计数据总量
select count(*) from uk_price_paid;
5.2 统计每年的平均价格
SELECT year(trans_date) year1, round(avg(price)) price from uk_price_paid GROUP BY year(trans_date) ORDER BY year(trans_date);
5.3伦敦房产每年的平均价格
SELECT year(trans_date) as year, round(avg(price)) AS price FROM uk_price_paid WHERE town = 'LONDON' GROUP BY year(trans_date) ORDER BY year(trans_date);
5.4 2020年之后最昂贵的10个街区
SELECT town, district, count() as c, round(avg(price)) AS price
FROM uk_price_paid
WHERE date >= '2020-01-01'
GROUP BY town, district
HAVING c >= 100
ORDER BY price DESC
LIMIT 10;