基于hive分析Flask为后端框架echarts为前端框架的招聘网站可视化大屏项目
1. 项目概述
项目目标是构建一个大数据分析系统,包含以下核心模块:
1、数据爬取:通过request请求获取猎聘网的就业数据。
2、数据存储和分析:使用 Hive 进行数据存储和分析。
3、数据迁移:使用sqoop将hive数据导入mysql。
4、后端服务:使用 Flask 搭建数据接口,将分析结果提供给前端。
5、数据可视化:使用 ECharts 制作大屏展示,实现数据的图形化呈现。
2. 项目环境准备
在开始之前,需要搭建如下环境:
Hive:作为数据仓库,用于存储和分析数据。
Flask:轻量级 Python Web 框架,用于构建后端 RESTful API。
ECharts:JavaScript 图表库,用于前端数据可视化。
MySQL:用于保存一些系统配置或小规模数据。
Sqoop:数据同步工具,将hive数据同步到mysql。
3、数据爬取
通过python获取猎聘网的照片信息,存储到csv文件里
import csv
import time
import requests
import execjs
from storage.csv2mysql import sync_data2db
f = open('../storage/data.csv', mode='a', encoding='utf-8')
csv_writer = csv.DictWriter(f,fieldnames=[
'职位',
'城市',
'薪资',
'经验',
'标签',
'公司',
'公司领域',
'公司规模'])
csv_writer.writeheader()
def read_js_code():
f= open('/Users/shareit/workspace/chart_show/demo.js',encoding='utf-8')
txt = f.read()
js_code = execjs.compile(txt)
ckId = js_code.call('r',32)
return ckId
def post_data():
read_js_code()
url = "https://api-c.liepin.com/api/com.liepin.searchfront4c.pc-search-job"
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Sec-Ch-Ua-Platform':'macOS',
'Content-Length': '398',
'Content-Type': 'application/json;charset=UTF-8;',
'Host': 'api-c.liepin.com',
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Origin': 'https://www.liepin.com',
'Referer': 'https://www.liepin.com/',
'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'X-Client-Type': 'web',
'X-Fscp-Bi-Stat': '{"location": "https://www.liepin.com/zhaopin"}',
'X-Fscp-Fe-Version': '',
'X-Fscp-Std-Info': '{"client_id": "40108"}',
'X-Fscp-Trace-Id': '52262313-e6ca-4cfd-bb67-41b4a32b8bb5',
'X-Fscp-Version': '1.1',
'X-Requested-With': 'XMLHttpRequest',
}
list = ["H01$H0001","H01$H0002",
"H01$H0003","H01$H0004","H01$H0005",
"H01$H0006","H01$H0007","H01$H0008",
"H01$H0009","H01$H00010","H02$H0018","H02$H0019","H03$H0022",
"H03$H0023","H03$H0024","H03$H0025","H04$H0030","H04$H0031",
"H04$H0032","H05$H05","H06$H06","H07$H07","H08$H08"]
for name in list:
print("-------{}---------".format(name))
for i in range(10):
print("------------第{}页-----------".format(i))
data = {
"data": {
"mainSearchPcConditionForm":
{
"city": "410", "dq": "410", "pubTime": "", "currentPage": i, "pageSize": 40, "key": "",
"suggestTag": "", "workYearCode": "1", "compId": "", "compName": "", "compTag": "",
"industry": name, "salary": "", "jobKind": "", "compScale": "", "compKind": "", "compStage": "",
"eduLevel": ""},
"passThroughForm":
{
"scene": "page", "skId": "z33lm3jhwza7k1xjvcyn8lb8e9ghxx1b",
"fkId": "z33lm3jhwza7k1xjvcyn8lb8e9ghxx1b",
"ckId": read_js_code(),
'sfrom': 'search_job_pc'}}}
response = requests.post(url=url, json=data, headers=headers)
time.sleep(2)
parse_data(response)
def parse_data(response):
try:
jobCardList = response.json()['data']['data']['jobCardList']
except Exception as e:
return
4、加载hive数据进行分析
1、将storage下的data.csv上传到虚拟机上
2、创建work_base表,并将data.csv数据加载到hive表里
CREATE TABLE work_base (
id INT COMMENT 'id',
title STRING COMMENT '标题',
city STRING COMMENT '城市',
salary STRING COMMENT '薪资',
campus_job_kind STRING COMMENT '经验',
labels STRING COMMENT '标签',
compName STRING COMMENT '公司',
compIndustry STRING COMMENT '公司领域',
compScale STRING COMMENT '公司规模'
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE;
LOAD local DATA INPATH './data.csv' OVERWRITE INTO TABLE flask_work.work_base;
3、创建hive ads层数仓表进行分析
-- 4. 热门公司分析
CREATE TABLE top_companies (
company_name STRING COMMENT '公司名称',
job_count INT COMMENT '职位数量'
) STORED AS TEXTFILE;
INSERT INTO top_companies
SELECT compName, COUNT(*) AS job_count
FROM work_base
GROUP BY compName
ORDER BY job_count DESC
LIMIT 10;
-- 5. 岗位分布情况分析
CREATE TABLE job_distribution (
job_title STRING COMMENT '岗位名称',
job_count INT COMMENT '职位数量'
) STORED AS TEXTFILE;
INSERT INTO job_distribution
SELECT title, COUNT(*) AS job_count
FROM work_base
GROUP BY title;
-- 6. 学历要求分析
CREATE TABLE education_requirements (
education_level STRING COMMENT '学历要求',
job_count INT COMMENT '职位数量'
) STORED AS TEXTFILE;
INSERT INTO education_requirements
SELECT
CASE
WHEN labels LIKE '%博士%' THEN '博士'
WHEN labels LIKE '%硕士%' THEN '硕士'
WHEN labels LIKE '%本科%' THEN '本科'
WHEN labels LIKE '%大专%' THEN '大专'
ELSE '其他'
END AS education_level,
COUNT(*) AS job_count
FROM work_base
GROUP BY education_level;
-- 7. 薪资待遇分析(各个城市的平均薪资)
CREATE TABLE city_salary_analysis (
city STRING COMMENT '城市',
avg_salary DOUBLE COMMENT '平均薪资'
) STORED AS TEXTFILE;
INSERT INTO city_salary_analysis
SELECT city, AVG(CAST(salary AS DOUBLE)) AS avg_salary
FROM work_base
WHERE salary RLIKE '^[0-9]+$'
GROUP BY city;
5、将hive分析的结果数据导入mysql
使用sqoop迁移数据
sqoop export \
--connect jdbc:mysql://localhost:3306/flask_work \
--username root --password '123456' \
--table city_job_count \
--export-dir /hive/warehouse/flask_work.db/flask_work.city_job_count \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n';
sqoop export \
--connect jdbc:mysql:// localhost:3306/flask_work \
--username root --password 123456 \
--table job_salary_analysis \
--export-dir /user/hive/warehouse/flask_work.db/flask_work.job_salary_analysis \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
sqoop export \
--connect jdbc:mysql:// localhost:3306/flask_work \
--username root --password 123456 \
--table top_companies \
--export-dir /user/hive/warehouse/flask_work.db/flask_work.top_companies \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
sqoop export \
--connect jdbc:mysql:// localhost:3306/flask_work \
--username root --password 123456 \
--table job_distribution \
--export-dir /user/hive/warehouse/flask_work.db/flask_work.job_distribution \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
6. 后端服务(Flask)
使用 Flask 构建后端服务,编写rest api,读取mysql数据提供给前端页面进行展示
app.py
from flask import Flask, render_template, request, flash, redirect, url_for
from data import *
from service.task_service import get_user, register_user
app = Flask(__name__)
app.secret_key = 'b6b52fae-5618-4805-b368-501c62c6d1df'
@app.after_request
def add_header(response):
response.cache_control.max_age = 0
return response
@app.route('/', methods=['GET', 'POST'])
def login():
if request.method == 'POST':
username = request.form['username']
password = request.form['password']
user = get_user(username, password)
# 检查用户是否存在
if user is not None:
data = SourceData()
return render_template('index.html', form=data, title=data.title)
else:
# 用户名或密码错误,显示错误消息
flash('用户名或密码错误')
return redirect(url_for('login')) # 重定向回登录页面
# 如果是 GET 请求,则直接返回登录页面
return render_template('login.html')
@app.route('/register', methods=['GET', 'POST'])
def register():
if request.method == 'POST':
username = request.form.get('username')
password = request.form.get('password')
if username and password:
register_user(username, password) # 确保此函数已定义
return "注册成功!"
flash('用户名和密码不能为空')
return redirect(url_for('register'))
return render_template('register.html')
if __name__ == "__main__":
app.run(host='127.0.0.1', debug=False)
task_service.py
import pymysql
db_config = {
'host': '127.0.0.1',
'user': 'root',
'password': '12345678',
'database': 'flask_work',
'charset': 'utf8mb4',
'cursorclass': pymysql.cursors.DictCursor
}
connection = pymysql.connect(**db_config)
def get_user(username,password):
try:
with connection.cursor() as cursor:
select_query = "select * from user where username = %s and password = %s"
cursor.execute(select_query,(username,password))
result = cursor.fetchall()
return result[0]
except Exception as e:
print(e)
return None
def get_title_count():
try:
with connection.cursor() as cursor:
select_query = "select count(distinct(city)) city,count(distinct(compName)) compName from work_base;"
cursor.execute(select_query)
result = cursor.fetchall()
a=result[0]['city']
b=result[0]['compName']
return a,b
except Exception as e:
print(e)
return None
def work_count_by_city():
try:
with connection.cursor() as cursor:
select_query = "select city,job_count from city_job_count order by job_count desc limit 10"
cursor.execute(select_query)
result = cursor.fetchall()
re_list = []
for re in result:
re_list.append({
"name": re['city'], "value": re['job_count']})
print(re_list)
return re_list
except Exception as e:
print(e)
return None
def work_avg_salary():
try:
with connection.cursor() as cursor:
select_query = "select job_title,avg_salary from job_salary_analysis order by avg_salary desc limit 10;"
cursor.execute(select_query)
result = cursor.fetchall()
re_list = []
for re in result:
re_list.append({
"name": re['job_title'][0:8], "value": int(re['avg_salary'])})
print(re_list)
return re_list
except Exception as e:
print(e)
return None
def top_companies():
try:
with connection.cursor() as cursor:
select_query = "select company_name,job_count from top_companies limit 3;"
cursor.execute(select_query)
result = cursor.fetchall()
re_list = []
for re in result:
re_list.append({
"name": re['company_name'], "value": re['job_count']})
print(re_list)
return re_list
except Exception as e:
print(e)
return None
def job_distribution_count():
try:
with connection.cursor() as cursor:
select_query = "select job_title,job_count from job_distribution order by job_count desc limit 10;"
cursor.execute(select_query)
result = cursor.fetchall()

最低0.47元/天 解锁文章
2952

被折叠的 条评论
为什么被折叠?



