R语言KNN小项目
数据挖掘
用的是基础的 Scrapy。这里爬取的是猎聘网的 Python 相关职位信息,只爬取了 800 多条信息。
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from lagouwang.items import LagouwangItem
import re
class LgwSpider(scrapy.Spider):
    """Crawl python-related job postings from liepin.com (~800 items scraped).

    Yields one LagouwangItem per job card with Salary / Experience /
    Education / Post / Address fields, then follows pagination up to
    (but not including) page 200.
    """
    name = 'lgw'
    # allowed_domains = ['lgw.oi']

    # Search-URL template with the page number parameterised.
    # NOTE(review): the original source contained the mojibake '°radeFlag' —
    # an HTML-escaped '&degradeFlag' rendered as the degree sign; restored here.
    _URL_TEMPLATE = (
        'https://www.liepin.cn/zhaopin/?init=-1&headckid=6dc4dc9bb030cfe9'
        '&fromSearchBtn=2&ckid=6dc4dc9bb030cfe9&degradeFlag=0&key=python'
        '&siTag=I-7rQ0e90mv8a37po7dV3Q%7EfA9rXquZc5IkJpXC-Ycixw'
        '&d_sfrom=search_unknown&d_ckId=4df0c063e2b6375f63373645d3c082fd'
        '&d_curPage={page}&d_pageSize=40'
        '&d_headId=4df0c063e2b6375f63373645d3c082fd&curPage={page}'
    )

    start_urls = [_URL_TEMPLATE.format(page=12)]

    def parse(self, response):
        """Extract one item per job card on the page, then request the next page."""
        soup = BeautifulSoup(response.body, "html.parser")
        cards = soup.find("ul", class_="sojob-list").find_all("div", "job-info")
        for card in cards:
            links = card.find_all('a')
            spans = card.find_all('span')
            # Fresh item per row — the original reused one instance across yields.
            item = LagouwangItem()
            item['Salary'] = spans[0].string
            item['Experience'] = spans[2].string
            item['Education'] = spans[1].string
            try:
                # .string may be None on malformed cards -> AttributeError;
                # short anchor lists -> IndexError. Skip such cards.
                item['Post'] = links[0].string.strip()
                item['Address'] = links[1].string
            except (AttributeError, IndexError):
                continue
            yield item
        # BUG FIX: the original pattern r"curPage=(\d)" captured only ONE digit,
        # so page 12 parsed as 1 and all pages >= 10 were truncated.
        match = re.search(r"curPage=(\d+)", response.url)
        page = int(match.group(1)) + 1
        if page < 200:
            yield scrapy.Request(self._URL_TEMPLATE.format(page=page),
                                 callback=self.parse)
数据清洗
# -*- coding: utf-8 -*-
import pymysql
import re
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
from numpy.core._multiarray_umath import ndarray
from pandas import DataFrame,Series
import matplotlib.pyplot as plt
import csv

# -- Data cleaning -------------------------------------------------------------
# Keep only rows with a concrete salary, experience requirement, education
# requirement and an address in a known major city; normalise each kept row
# and export the result to data.csv.

# Cities we keep; rows whose address matches none of these are discarded.
_CITY_RE = re.compile(r'北京|上海|广州|深圳|成都|杭州|重庆|武汉|苏州|西安|天津|南京|郑州|长沙|沈阳|青岛|宁波|东莞|无锡|厦门')
_NUM2_RE = re.compile(r'\d{1,2}')   # salary numbers, e.g. "10-20万·12薪" -> 10, 20, 12
_DIGIT_RE = re.compile(r'\d')       # years of experience, e.g. "3年经验" -> 3
_EDU_RE = re.compile(r'博士|硕士|本科|大专')


def clean_rows(raw):
    """Normalise scraped job rows.

    ``raw`` is a 2-D object array whose columns are
    [post, address, salary, experience, education] — TODO confirm against
    the spider's item field order.

    Returns ``(cleaned, count)``: ``cleaned`` is a copy of ``raw`` whose
    first ``count`` rows hold the normalised records (rows beyond ``count``
    are untouched leftovers of the copy).
    """
    # BUG FIX: the original aliased output to input (results3 = results2);
    # use an explicit copy so the source rows are never clobbered.
    cleaned = raw.copy()
    k = 0
    for i in range(len(raw)):          # original hard-coded 816 rows
        salary = str(raw[i][2:3])
        exp = str(raw[i][3:4])
        edu = str(raw[i][4:5])
        addr = str(raw[i][1:2])
        # Drop rows without usable numbers: "面议" = salary negotiable,
        # "经验不限"/"不限" = no experience / education requirement.
        if '面议' in salary or '经验不限' in exp or '不限' in edu:
            continue
        cities = _CITY_RE.findall(addr)
        if not cities:
            continue
        cleaned[k][1:2] = cities[0]
        # Salary "10-20万·12薪" -> mean of the range times months: (10+20)/2*12.
        # NOTE: np.str was removed in NumPy 1.24; plain str() is equivalent.
        nums = _NUM2_RE.findall(salary)
        cleaned[k][2:3] = str(((int(nums[0]) + int(nums[1])) / 2) * int(nums[2]))
        cleaned[k][3:4] = _DIGIT_RE.findall(exp)
        cleaned[k][4:5] = _EDU_RE.findall(edu)
        print(cleaned[k])
        print(k)
        k += 1
    return cleaned, k


def write_csv(rows, limit=575, path='data.csv'):
    """Write the first ``limit`` rows (address, salary, experience, education)
    of ``rows`` to ``path`` as UTF-8 CSV with a Chinese header row.

    ``limit`` defaults to 575, matching the original hard-coded cut-off
    (``k > 574``) for this particular dataset.
    """
    # newline='' stops the csv module emitting blank lines on Windows.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["工作地点", "薪水", "经验", "学历"])
        for row in rows[:limit]:
            writer.writerow([row[1], row[2], row[3], row[4]])


def main():
    """Load the scraped spreadsheet, clean it, and export data.csv."""
    # BUG FIX: read_excel() has no ``encoding`` kwarg in modern pandas;
    # the original call raises TypeError there.
    results = pd.read_excel("C:/Users/asus/Desktop/lgw.xlsx")
    cleaned, count = clean_rows(np.array(results))
    write_csv(cleaned)


if __name__ == "__main__":
    main()
#
R语言KNN分析