简单的R语言KNN运用小项目

R语言KNN小项目

数据挖掘

用的是基础的 Scrapy 框架。这里爬取的是猎聘网的 Python 相关职位信息,只爬取了 800 多条。

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from lagouwang.items import LagouwangItem
import re

class LgwSpider(scrapy.Spider):
    """Crawl Python job listings from liepin.com search-result pages.

    Each page yields one item per job card (salary, experience, education,
    post title, address), then the spider follows the next page up to
    page 199.
    """
    name = 'lgw'
    #allowed_domains = ['lgw.oi']
    start_urls = ['https://www.liepin.cn/zhaopin/?init=-1&headckid=6dc4dc9bb030cfe9&fromSearchBtn=2&ckid=6dc4dc9bb030cfe9&degradeFlag=0&key=python&siTag=I-7rQ0e90mv8a37po7dV3Q%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=4df0c063e2b6375f63373645d3c082fd&d_curPage=12&d_pageSize=40&d_headId=4df0c063e2b6375f63373645d3c082fd&curPage=12']

    def parse(self, response):
        """Parse one result page: yield job items, then request the next page.

        :param response: scrapy Response for a search-result page.
        :yields: LagouwangItem per job card, then a Request for the next page.
        """
        soup = BeautifulSoup(response.body, "html.parser")
        job_cards = soup.find("ul", class_="sojob-list").find_all("div", "job-info")
        for card in job_cards:
            links = card.find_all('a')
            spans = card.find_all('span')
            # BUG FIX: build a fresh item per card. The original reused one
            # LagouwangItem instance, so every yielded item aliased the same
            # mutated object.
            item = LagouwangItem()
            item['Salary'] = spans[0].string
            item['Experience'] = spans[2].string
            item['Education'] = spans[1].string
            try:
                # .string is None when the tag has nested markup; skip such cards.
                item['Post'] = links[0].string.strip()
                item['Address'] = links[1].string
            except (AttributeError, IndexError):
                continue
            yield item

        # BUG FIX: the original pattern r"curPage=(\d)" captured a single
        # digit, so page "12" was read as "1" and the crawl restarted at
        # page 2. \d+ captures the full page number.
        match = re.search(r"curPage=(\d+)", response.url)
        if match is None:
            return
        next_page = int(match.group(1)) + 1
        if next_page < 200:
            # Bump both d_curPage= and curPage= in place instead of
            # re-typing the 400-character URL.
            next_url = re.sub(r"(?<=curPage=)\d+", str(next_page), response.url)
            yield scrapy.Request(next_url, callback=self.parse)

这里附上爬取的原始数据截图

数据清洗

# -*- coding: utf-8 -*-
import pymysql
import re
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
from numpy.core._multiarray_umath import ndarray
from pandas import DataFrame,Series
import matplotlib.pyplot as plt
results= pd.read_excel("C:/Users/asus/Desktop/lgw.xlsx",encoding='unicode_escape')
results2=np.array(results)
results3=results2
k=0
for i in np.arange(0,816,1):
    pattern = re.compile(r'面议')
    if pattern.findall(np.str(results2[i][2:3])) ==['面议']:
        continue
    pattern = re.compile(r'经验不限')
    if pattern.findall(np.str(results2[i][3:4])) == ['经验不限']:
        continue
    pattern = re.compile(r'不限')
    if pattern.findall(np.str(results2[i][4:5])) == ['不限']:
        continue
    pattern = re.compile(r'北京|上海|广州|深圳|成都|杭州|重庆|武汉|苏州|西安|天津|南京|郑州|长沙|沈阳|青岛|宁波|东莞|无锡|厦门')
    if pattern.findall(np.str(results2[i][1:2])) == []:
        continue
    results3[k][1:2]=pattern.findall(np.str(results2[i][1:2]))[0]
    pattern = re.compile(r'\d{1,2}')
    results3[k][2:3] = np.str(((int(pattern.findall(np.str(results2[i][2:3]))[0])+int((pattern.findall(np.str(results2[i][2:3]))[1])))/2)*int(pattern.findall(np.str(results2[i][2:3]))[2]))
    pattern = re.compile(r'\d')
    results3[k][3:4] = pattern.findall(np.str(results2[i][3:4]))
    pattern = re.compile(r'博士|硕士|本科|大专')
    results3[k][4:5] = pattern.findall(np.str(results2[i][4:5]))
    print(results3[k])
    print(k)
    k=k+1
results3.tolist()
import csv
f = open('data.csv','w',encoding='utf-8')
csv_writer = csv.writer(f)
csv_writer.writerow(["工作地点","薪水","经验","学历"])
k=0
for i in results3:
   if k >574:
       break
   csv_writer.writerow([i[1],i[2],i[3],i[4]])
   k+=1
f.close()
#

清理后的数据

R语言KNN分析

没法附代码,就放个截图吧
在这里插入图片描述

  • 5
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值