Python对爬取51job详情进行数据清洗(2)

数据分析与可视化地址:https://blog.csdn.net/weixin_43746433/article/details/91349199

1.查看数据,并对数据进行去重,去除任意的缺失值,提取关键字等操作

import pandas as pd
import re
def clean_data_quchong_key():
    data=pd.read_csv('test_datasets.csv',delimiter='#',header=0)
    df=pd.DataFrame(data)
    print(df.shape)
    print(df.columns)
	print(df.head())
    df=df.dropna()##去除有任意的缺失值
    print(df.shape)
    #print(df.head())
    print('去重之前形状',df.shape)
    df=df.drop_duplicates(subset='companyname')#去重
    print('去重之后形状',df.shape)
    df=df[df[r'describe'].str.contains(r'.*?数据.*?|.*?分析.*?|.*?python.*?')]#提取包含数据或者分析的岗位
    print('包含关键字',df.shape)
    #df=df.iloc[:,1]#提取包含数据或者分析的岗位
    df.to_csv('test_datasets_cleaned1.csv',sep='#')
#clean_data_quchong_key()

在这里插入图片描述
在这里插入图片描述

2.对文本内容(薪资,工作地点等)进行清洗

2.1查看数据

import pandas as pd
data=pd.read_csv('test_datasets.csv',delimiter='#',header=0)
df=pd.DataFrame(data)
print(df.shape)
df=df.loc[:,['salary','area']]
print(df)

在这里插入图片描述
2.2对薪资(统一格式),地区(提取关键地址)数据清洗

def get_salary(salary):
    if '-' in salary:  # 针对1-2万/月或者10-20万/年的情况,包含-
        low_salary = re.findall(re.compile('(\d*\.?\d+)'), salary)[0]
        high_salary = re.findall(re.compile('(\d?\.?\d+)'), salary)[1]
        if u'万' in salary and u'年' in salary:  # 单位统一成千/月的形式
            low_salary = float(low_salary) / 12 * 10
            high_salary = float(high_salary) / 12 * 10
        elif u'万' in salary and u'月' in salary:
            low_salary = float(low_salary) * 10
            high_salary = float(high_salary) * 10
    else:  # 针对20万以上/年和100元/天这种情况,不包含-,取最低工资,没有最高工资
        low_salary = re.findall(re.compile('(\d*\.?\d+)'), salary)[0]
        high_salary = ""
        if u'万' in salary and u'年' in salary:  # 单位统一成千/月的形式
            low_salary = float(low_salary) / 12 * 10
        elif u'万' in salary and u'月' in salary:
            low_salary = float(low_salary) * 10
        elif u'元' in salary and u'天' in salary:
            low_salary = float(low_salary) / 1000 * 21  # 每月工作日21天
    return low_salary, high_salary

def clean_data_salary():
    data = pd.read_csv('test_datasets_cleaned1.csv', delimiter='#', header=0)
    df = pd.DataFrame(data)
    datasets = pd.DataFrame()
    for index, row in df.iterrows():
        area = row["area"][:2]
        job = row["job"]
        companyname=row['companyname']
        salary = row["salary"]
        workyear = row["workyear"]
        if salary:#如果待遇这栏不为空,计算最低最高待遇
            getsalary=get_salary(salary)
            low_salary=getsalary[0]
            high_salary=getsalary[1]
        else:
            low_salary=high_salary=""
        try:
            salary = str(int(float(low_salary)) / 10) + '-' + str(int(float(high_salary)) / 10) + '万/月'   #统一薪资格式
            print(salary)
            print('正在写入第{}条,工资{},最低工资是{}k,最高工资是{}k'.format(index, salary, low_salary, high_salary))
        except Exception as e:
            print(e)
            continue
        education= row["education"]
        welfare= row["welfare"]
        companytype= row["companytype"]
        companyscale = row["companyscale"]
        describe = row["describe"]
        link = row["link"]
        datalist = [
            str(area),
            str(job),
            str(companyname),
            str(education),
            str(salary),
            str(low_salary),
            str(high_salary),
            str(workyear),
            str(welfare),
            str(companytype),
            str(companyscale),
            str(describe),
            str(link)]
        series = pd.Series(datalist, index=[
            'area',
            'job',
            'companyname',
            'education',
            'salary',
            'low_salary',
            'high_salary',
            'workyear',
            'welfare',
            'companytype',
            'companyscale',
            'describe',
            'link'
        ])
        series = pd.Series(series)
        datasets = datasets.append(series, ignore_index=True)
    print(datasets)
    datasets.to_csv('test_datasets_finally.csv', mode='a+', sep='#', index=False, index_label=False, encoding='utf-8')

#clean_data_salary()

在这里插入图片描述
2.3查看清洗过的数据

data=pd.read_csv('test_datasets_finally.csv',delimiter='#',header=0)
df=pd.DataFrame(data)
print(df.shape)
df=df.loc[:,['salary','low_salary','high_salary','area']]
print(df)

在这里插入图片描述
完成!

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值