技术栈:springboot+thymeleaf+mysql+echarts(数据爬取使用 python+selenium,数据分析使用 spark)
1. 项目要求与内容
利用python爬取数据并进行清洗和预处理,将清洗后的数据存到mysql数据库中,后端利用springboot框架,用Echarts实现数据可视化。
2.数据爬取
# coding=utf-8
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time
import csv
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.common.by import By
options = ChromeOptions()
# Run Chrome without a visible window.
# NOTE(review): the original passed '---headless' (three dashes), which Chrome
# does not recognise, so headless mode was silently never enabled even though
# the comment claimed it was; the correct flag is '--headless'.
options.add_argument('--headless')
# Spoof a regular desktop Chrome user agent.
options.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                     f'Chrome/108.0.0.0 Safari/537.36')
# Hide the "controlled by automated test software" banner and skip the
# automation extension — both are common bot-detection signals.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
# Launch the Chrome driver with the options above.
browser = webdriver.Chrome(options=options)
# Mask navigator.webdriver before any page script runs (another bot signal).
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined
    })
    """
})
def getData(url):
    """Scrape job postings from one BOSS直聘 search-result URL.

    Iterates over up to 10 result pages.  For every job card it extracts the
    location, title, company name/type/size, salary, experience, education,
    skill tags, benefits and the full job description (opened in a new tab),
    prints one comma-joined line and appends a row to 数据1.csv.

    Relies on the module-level ``browser`` Selenium driver.  The many
    ``time.sleep`` calls throttle the crawl to avoid anti-bot measures.
    """
    # Open the search-result page; the job keyword is encoded in the URL.
    browser.get(url)
    browser.implicitly_wait(10)
    # NOTE(review): the original comment said "full screen", but this call
    # actually minimises the window — confirm which is intended.
    browser.minimize_window()
    # Typing the keyword into the search box is disabled (kept for reference).
    '''
    browser.find_element(By.CSS_SELECTOR, ".home-body-wrapper .column-search-panel .ipt-search").send_keys(job)
    # 点击搜索
    browser.find_element(By.CSS_SELECTOR, ".search-panel-new .btn-search").click()
    '''
    # Give the page time to finish loading before touching any element.
    time.sleep(10)
    '''定义列表,分别为:地址、岗位名称、公司名称、薪资、经验要求、学历要求、待遇'''
    '''创建表头'''
    # One-off CSV header creation, kept disabled (header assumed to exist).
    # with open('数据.csv', 'a+', newline='', encoding='utf-8-sig') as csvfile:
    #     fieldnames = ['address', 'job_name', 'company_name', 'company_type', 'company_people', 'salary',
    #                   'experience', 'education', 'skills', 'benefits', 'job_desc']  # header
    #     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    #     writer.writeheader()
    for i in range(1, 11):
        browser.implicitly_wait(10)
        # All job cards on the current result page.
        ul = browser.find_elements(By.CSS_SELECTOR, '.job-card-wrapper')
        time.sleep(2)
        print("开始爬取第" + str(i) + "页信息")
        if len(ul) >= 1:
            for li in ul:
                try:
                    time.sleep(2)
                    '''获取岗位地址'''
                    address = li.find_element(By.CSS_SELECTOR,
                                              '.job-card-wrapper .job-card-left .job-area-wrapper .job-area').text
                    # address1 = address.split('·')[0]
                    '''获取岗位名称'''
                    job_name = li.find_element(By.CSS_SELECTOR,
                                               '.job-card-wrapper .job-card-left .job-name').text
                    '''获取公司名称'''
                    company = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .job-card-right .company-name a').text
                    '''公司类型'''
                    company_type = li.find_element(By.CSS_SELECTOR,
                                                   '#wrap > div.page-job-wrapper > div.page-job-inner > '
                                                   'div > div.job-list-wrapper > div.search-job-result > '
                                                   'ul > li > div.job-card-body.clearfix > div > '
                                                   'div.company-info > ul > li:nth-child(1)').text
                    '''公司规模'''
                    company_people = li.find_element(By.CSS_SELECTOR,
                                                     '#wrap > div.page-job-wrapper > div.page-job-inner '
                                                     '> div > div.job-list-wrapper > '
                                                     'div.search-job-result > ul > li> '
                                                     'div.job-card-body.clearfix > div > '
                                                     'div.company-info > ul > li:last-child').text
                    '''获取薪资水平'''
                    money = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .job-card-left .salary').text
                    '''经验'''
                    experience = li.find_element(By.CSS_SELECTOR,
                                                 '.job-card-wrapper .job-card-left .tag-list :first-child').text
                    experience = str(experience)
                    '''获取学历要求'''
                    education = li.find_element(By.CSS_SELECTOR,
                                                '.job-card-wrapper .job-card-left .tag-list li+li').text
                    # NOTE(review): when the tag text contains '月' it is
                    # treated as not being an education tag and is replaced
                    # with the fixed value '本科' — confirm this heuristic.
                    if '月' in education:
                        education = '本科'
                    else:
                        education = education.strip('\n')
                    '''技能要求'''
                    skill_list = li.find_elements(By.CSS_SELECTOR,
                                                  '#wrap > div.page-job-wrapper > div.page-job-inner > '
                                                  'div > div.job-list-wrapper > div.search-job-result > '
                                                  'ul > li > div.job-card-footer.clearfix > ul > li')
                    # Collect the non-empty skill tags into a stringified list.
                    skill = []
                    for skill_i in skill_list:
                        skill_i_text = skill_i.text
                        if len(skill_i_text) == 0:
                            continue
                        skill.append(skill_i_text)
                    skill = str(skill)
                    '''福利待遇'''
                    benefit = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .info-desc').text
                    try:
                        '''岗位描述'''
                        # Open the posting's detail page (opens a new tab).
                        li.find_element(By.CSS_SELECTOR,
                                        "#wrap > div.page-job-wrapper > div.page-job-inner > div > "
                                        "div.job-list-wrapper > div.search-job-result > ul > li > "
                                        "div.job-card-body.clearfix > a").click()
                    except ElementClickInterceptedException:
                        # A login dialog intercepted the click: close it, retry.
                        print("正在关闭弹窗")
                        browser.find_element(By.CSS_SELECTOR, ".boss-login-dialog-content .boss-login-dialog-header .boss-login-close").click()
                        print("关闭成功")
                        li.find_element(By.CSS_SELECTOR,
                                        "#wrap > div.page-job-wrapper > div.page-job-inner > div > "
                                        "div.job-list-wrapper > div.search-job-result > ul > li > "
                                        "div.job-card-body.clearfix > a").click()
                    # browser.execute_script('$(".login-dialog-wrapper").css("display","none")')
                    # Wait for the detail tab to load.
                    time.sleep(5)
                    # Switch to the newly opened (last) tab.
                    browser.switch_to.window(browser.window_handles[-1])
                    job_details = browser.find_element(By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]').text
                    # job_details = browser.find_element_by_xpath('//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]').text
                    # print(job_details)
                    time.sleep(1)
                    # Close the detail tab and return to the result list.
                    browser.close()
                    browser.switch_to.window(browser.window_handles[-1])
                    '''打印输出'''
                    print(
                        address + ',' + job_name + ',' + company + ',' + company_type + ',' + company_people + ','
                        + money + ',' + experience + ',' + education + ',' + skill + ',' + benefit + ',' + job_details.replace(
                            "\n", ""))
                    # Append the record to 数据1.csv (opened per row, append
                    # mode; the header row is assumed to exist already).
                    with open('数据1.csv', 'a+', newline='', encoding='utf-8-sig') as csvfile:
                        fieldnames = ['address', 'job_name', 'company_name', 'company_type', 'company_people', 'salary',
                                      'experience', 'education', 'skills', 'benefits', 'job_desc']
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                        writer.writerow({'address': address, 'job_name': job_name, 'company_name': company,
                                         'company_type': company_type, 'company_people': company_people, 'salary': money,
                                         'experience': experience, 'education': education, 'skills': skill,
                                         'benefits': benefit,
                                         'job_desc': job_details.replace("\n", "")
                                         })
                    # Throttle between cards to look less like a bot.
                    time.sleep(10)
                except UnicodeEncodeError:
                    # Skip cards whose text cannot be encoded for output.
                    continue
            time.sleep(10)
            '''利用滑块,使得页面得以跳动,模拟人工'''
            # Scroll down to mimic a human reader before paging.
            js = 'window.scrollTo(0,2000)'
            browser.execute_script(js)
            time.sleep(3)
            # Click the "next page" link in the pager.
            browser.find_element(By.CSS_SELECTOR, "#wrap > div.page-job-wrapper > div.page-job-inner > div > "
                                                  "div.job-list-wrapper > div.search-job-result > div > div > div > "
                                                  "a:last-child").click()
            time.sleep(8)
        else:
            # No job cards found — end of results (or blocked); stop.
            print('没有内容,停止运行')
            break
if __name__ == '__main__':
    # Job keywords to crawl.  Previously also used:
    # 数据开发、数据分析、ETL、数据仓库、数据挖掘、"ETL工程师","数据仓库",
    jobs = ["数据挖掘"]
    # BOSS直聘 city codes.  Previously also used (北京、上海、广州、深圳、
    # 杭州、天津、西安、苏州): "101010100","101020100","101280100",
    # "101280600","101210100","101030100","101110100","101190400",
    # The active list covers 武汉、厦门、长沙、成都、郑州、重庆.
    city_codes = ["101200100","101230200","101250100","101270100","101180100","101040100"]
    base_url = "https://www.zhipin.com/web/geek/job?query="
    for job in jobs:
        print("开始爬取"+str(job)+"的岗位信息")
        # Crawl each city in turn for this keyword.
        for idx, code in enumerate(city_codes, start=1):
            getData(base_url + job + "&city=" + code)
            print(str(job)+"的第" + str(idx) + "所城市爬取完成")
        print(str(job)+"岗位爬取完成")
结果展示:
3. 数据分析
package Job.DataProcess
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, desc, round}
/**
* 数据开发岗位
*/
/**
 * Analysis of "数据开发" (data development) job postings.
 *
 * Reads the scraped CSV and prints several aggregate views: demand by
 * industry, experience vs. salary, education vs. salary, demand by company
 * size, and postings/average salary per city.
 *
 * FIX(review): in the original code several pipelines called
 * `.select("x")` *before* `.filter(col("job_name")...)`, which prunes the
 * `job_name` (and other filter) columns away, so the filters could not
 * resolve and failed at runtime with an AnalysisException.  Filtering now
 * happens before any projection.  Also removed `val` bindings of the
 * Unit-valued `.show()` results.
 */
object DataDev {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[4]")
      .appName("Test")
      .getOrCreate()

    // Scraped postings, with header row and inferred column types.
    val data = spark.read
      .option("header", value = true)
      .option("delimiter", ",")
      .option("inferSchema", value = true)
      .csv("file:\\D:\\桌面文件\\毕设\\数据\\招聘数据.csv")

    // De-duplicated postings restricted to the target job; reused below.
    val dev = data.distinct().filter(col("job_name").like("%数据开发%"))

    /**
     * Top-10 industries by hiring demand.
     */
    dev.filter(col("company_type") =!= "")
      .groupBy("company_type")
      .count()
      .orderBy(desc("count"))
      .limit(10)
      .show()

    /**
     * Experience demand and average salary.
     */
    val byExperience = dev.filter(col("experience") =!= "")
    val experienceCount = byExperience.groupBy("experience").count()
    val experienceSalary = byExperience
      .groupBy("experience")
      .agg("salary" -> "avg")
      .select(col("experience"), round(col("avg(salary)")) as "avg_salary")
    experienceCount.join(experienceSalary, "experience")
      .orderBy(desc("count"))
      .limit(6)
      .show()

    /**
     * Education demand and average salary.
     */
    val byEducation = dev.filter(col("education") =!= "")
    val educationCount = byEducation.groupBy("education").count()
    val educationSalary = byEducation
      .groupBy("education")
      .agg("salary" -> "avg")
      .select(col("education"), round(col("avg(salary)")) as "avg_salary")
    educationCount.join(educationSalary, "education")
      .orderBy(desc("count"))
      .limit(4)
      .show()

    /**
     * Hiring volume by company size (top 10).
     */
    dev.filter(col("company_people") =!= "")
      .groupBy("company_people")
      .count()
      .orderBy(desc("count"))
      .limit(10)
      .show()

    /**
     * Posting count and average salary per city.
     */
    val cityCount = dev.groupBy("address").count()
    val citySalary = dev
      .groupBy("address")
      .agg("salary" -> "avg")
      .select(col("address"), round(col("avg(salary)")) as "avg_salary")
    cityCount.join(citySalary, "address")
      .orderBy(desc("count"))
      .show()
  }
}