招聘网站—Hive数据分析
第1关:统计最热门的十种职业(招聘人数最多)
#进入hive hive #在hive中创建数据库 mydb create database mydb; #使用数据库 mydb use mydb; #创建表 recruitcleaned 并使用"/t"分割字段 create table recruitcleaned(job_name string,city string, salary string, workingExp string, edu string, company_name string, company_size string, welfare string, skill string) row format delimited fields terminated by '\t' stored as textfile; #将本地清洗后的数据导入recruitcleaned中 load data local inpath '/data/workspace/myshixun/data/advertise.txt' into table recruitcleaned; #创建 recruitersnumbers 表存放最热门的十种职业的数据 create table recruitersnumbers(job_name string, count int) row format delimited fields terminated by '\t' stored as textfile; #将结果存入至 recruitersnumbers 表中 insert overwrite table recruitersnumbers select job_name,count(*) count from recruitcleaned group by job_name order by count desc limit 10;
第2关:分析学历相同的情况下每种职位的平均薪资
#创建 edulevelsalary 表 create table edulevelsalary(edu string, job_name string,salary double) row format delimited fields terminated by '\t' stored as textfile; #将结果存入至 edulevelsalary 表中 insert overwrite table edulevelsalary select edu,job_name,avg(salary)salary from recruitcleaned group by edu,job_name;
第3关:统计各个职业要求具备的技能及其出现的次数
#创建 jobskill 表 create table jobskill(job_name string, skill string,count int) row format delimited fields terminated by '\t' stored as textfile; #将结果存入至 edulevelsalary 表中 insert overwrite table jobskill select job_name,skill, count(*) count from (select job_name,subview.* from recruitcleaned lateral view explode(split(skill,'\\|')) subview as skill)t1 group by job_name,skill order by job_name,count;
第4关:分析每种职业在各个薪资等级需要招聘的人数
#创建 jobsalarylevel 表 create table jobsalarylevel(job_name string, salary string,count int) row format delimited fields terminated by '\t' stored as textfile; #将结果存入至 edulevelsalary 表中 insert overwrite table jobsalarylevel select job_name,salary,count(*) count from (select job_name,case when 0<=recruitcleaned.salary and recruitcleaned.salary< 10 then 'primary' when 10<=recruitcleaned.salary and recruitcleaned.salary< 20 then 'intermediate' when recruitcleaned.salary>=20 then 'senior' else 'other' END as salary from recruitcleaned ) t1 group by job_name,salary;