from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver import ActionChains
import gc
from selenium.webdriver.common.keys import Keys
import csv
from matplotlib import rcParams ## run command settings for plotting
# Matplotlib rc ("run command") settings applied to every figure drawn afterwards.
config = {
    'mathtext.fontset': 'stix',
    'font.family': 'serif',
    'font.serif': ['SimHei'],
    'font.size': 10,  # font size
    'axes.unicode_minus': False,  # controls how the minus sign ("-") is handled
}
# Push the settings into matplotlib's global configuration.
rcParams.update(config)
# Build the Chrome options used for the scraping session.
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Suppress the "save password" prompt.
prefs = {'credentials_enable_service': False, 'profile.password_manager_enabled': False}
options.add_experimental_option('prefs', prefs)
# Anti-bot-detection handling.
options.add_argument('--disable-blink-features=AutomationControlled')
web = webdriver.Chrome(options=options)
# Configure the implicit wait BEFORE the first element lookup: it applies to
# every later find_element call, so the login-form lookups below poll for up
# to 10 s instead of failing immediately while the page is still rendering.
web.implicitly_wait(10)
# Open the site, fill in the account details and log in.
web.get('https://passport.lagou.com/login/login.html?msg=needlogin&clientIp=106.109.67.203')
web.maximize_window()
# First input of the login form receives the account number.
web.find_element('css selector','#lg-passport-box > div > div.sc-khQegj.dccqlI > div > div.sc-ksdxgE.gICVrK > div > div.sc-fotOHu.fLmipN > div:nth-child(1) > input').send_keys('15974880964')
# Second input receives the password.
# NOTE(review): the literal looks like a placeholder - confirm before running.
web.find_element('css selector','#lg-passport-box > div > div.sc-khQegj.dccqlI > div > div.sc-ksdxgE.gICVrK > div > div.sc-fotOHu.fLmipN > div:nth-child(2) > input').send_keys('输入密码')
# Two clicks complete the form (presumably an agreement toggle followed by
# the login button - TODO confirm against the live page).
web.find_element('css selector','#lg-passport-box > div > div.sc-khQegj.dccqlI > div > div.sc-gKclnd.hrZfTH > div.sc-iCfMLu.eKQdwl > div').click()
web.find_element('css selector','#lg-passport-box > div > div.sc-khQegj.dccqlI > div > div.sc-fFeiMQ.hmmuaS > button > span').click()
# Fixed pause after submitting (presumably leaves time for the login or a
# manual verification step to finish - confirm).
time.sleep(15)
# Type the keyword into the search box; the lookup also blocks until the
# search box exists on the page reached after login.
web.find_element('css selector','#search_input').send_keys('人工智能')
# Navigate directly to the job-list URL for the search.
web.get('https://www.lagou.com/wn/zhaopin?fromSearch=true&kd=%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD&city=%E5%85%A8%E5%9B%BD')
# Scraped job postings: one dict per result card, written to CSV later on.
data_list = []
for _page in range(1, 2):  # range(1, 2) yields exactly one iteration
    job_cards = web.find_elements('class name', "item__10RTO")
    for card in job_cards:
        # Lookups run in the order company, job title, salary, extra info.
        posting = {
            '公司': card.find_element('class name', 'company__2EsC8').text,
            '工作': card.find_element('tag name', 'a').text,
            '工资': card.find_element('class name', "money__3Lkgq").text,
            '其他': card.find_element('css selector', '.item-bom__cTJhu').text,
        }
        data_list.append(posting)
        print(posting)
    # Click the pagination "next" link once the current page has been read.
    next_link = web.find_element('css selector', '#jobList > div.pagination__1L2PP > ul > li.lg-pagination-next > a')
    next_link.click()
    print('下一页')
# Dump the scraped rows to a CSV file with a header row.
# NOTE(review): the file name has no ".csv" extension and differs from the
# path that is read back later in this file - confirm this is intended.
csv_file = 'work_csv'
csv_headers = ['公司', '工作', '工资', '其他']
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_headers)
    writer.writeheader()
    # One row per scraped posting, in collection order.
    writer.writerows(data_list)
print('数据已保存')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams ## run command settings for plotting
# Matplotlib rc settings for the charts below; identical to the values
# applied near the top of this file.
config = {
    'mathtext.fontset': 'stix',
    'font.family': 'serif',
    'font.serif': ['SimHei'],
    'font.size': 10,  # font size; adjust to taste
    'axes.unicode_minus': False,  # controls how the minus sign ("-") is handled
}
# Push the settings into matplotlib's global configuration.
rcParams.update(config)
# Path of the CSV file to analyse.
csv_file_path = r'C:\Users\14774\Documents\output_table.csv'
# Load the CSV file into a DataFrame.
df = pd.read_csv(csv_file_path)
# Take the salary column as a plain Python list (the original note assumes
# it is the third column; the lookup itself is by column name).
salary_column = df['工资']
salary_range = salary_column.tolist()
def convert_salary_range(salary_str):
    """Convert a salary-range string such as ``"15k-25k"`` into numeric bounds.

    A ``k``/``K`` suffix on a bound multiplies that bound by 1000, so
    ``"1.5k-2k"`` yields ``(1500.0, 2000.0)``. Surrounding double quotes are
    removed, and any text that follows the number in a bound (for example a
    bonus note after the upper bound) is ignored.

    Args:
        salary_str: Raw cell value. May be NaN/None for an empty cell;
            non-string values are converted with ``str`` before parsing.

    Returns:
        A ``(min_salary, max_salary)`` tuple of floats, or ``None`` when the
        value is missing, is not a two-part ``low-high`` range, or a part
        does not start with a number.
    """
    import re  # local import keeps the function self-contained

    if pd.isna(salary_str):  # empty cell
        return None

    # Drop surrounding whitespace and double quotes, then split into bounds.
    parts = str(salary_str).strip().strip('"').split('-')
    if len(parts) != 2:
        return None

    bounds = []
    for part in parts:
        # Leading number with an optional thousands suffix. Scaling the
        # parsed number (rather than textually replacing "k" with "000")
        # keeps decimal bounds such as "1.5k" correct.
        match = re.match(r'\s*(\d+(?:\.\d+)?)\s*([kK]?)', part)
        if match is None:
            return None
        value = float(match.group(1))
        if match.group(2):
            value *= 1000
        bounds.append(value)
    return bounds[0], bounds[1]
# Parse every salary string exactly once and keep the lower bound of each
# range that could be parsed.
parsed_ranges = (convert_salary_range(salary) for salary in salary_range)
salary_min = [bounds[0] for bounds in parsed_ranges if bounds is not None]
# Count how many lower bounds fall into each left-closed bin. The final
# open-ended bin keeps salaries of 25000 and above in the chart; without it
# they fall outside every bin and are silently dropped.
salary_bins = pd.cut(salary_min, bins=[0, 5000, 10000, 15000, 20000, 25000, float('inf')], right=False)
salary_count = salary_bins.value_counts()
# Plain array of the per-bin counts.
salary_count_values = salary_count.values
# Pie (ring) chart of the share of each salary bin.
plt.figure(figsize=(6, 6))
plt.pie(salary_count_values, labels=salary_count.index, autopct='%1.1f%%', startangle=140, wedgeprops=dict(width=0.4))
plt.title('工资范围组成比例',color='red')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
# Histogram of the lower salary bounds.
plt.figure(figsize=(6, 4))
sns.histplot(data=salary_min, bins=20)
plt.title('工资范围分布',color='red')
plt.xlabel('最低工资 (元)',color='red')
plt.ylabel('频数',color='red')
plt.show()