Data Analysis of Job Postings Crawled from a Recruitment Website
Background: understand the market demand for the target positions and the skills they require, and profile each position's regional distribution, education requirements, experience requirements, salary level, and so on, in order to give job seekers and newcomers to the field some practical guidance and a rough learning direction.
Analysis workflow:
1. Data acquisition (crawl postings for big-data-related positions)
2. Data cleaning
3. Data analysis
4. Data visualization
Flow chart: data acquisition → data cleaning → data analysis → data visualization
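In code, steps 1 and 2 chain together roughly as shown below. This is only a sketch: crawl_and_store is a hypothetical driver name, and it reuses the get_url, get_zhiwei, etl_data and save_sql functions defined in the main code later in this section.

def crawl_and_store(page_html):
    # page_html: rendered HTML of one listing page (obtained by the Selenium crawler)
    rows = []
    for name, web, city, salary, date in get_url(page_html):   # step 1: parse the listing page
        desc = get_zhiwei(web) or ["null"]                      # step 1: fetch the job-detail page
        rows.append((name, web, city, salary, date, etl_data(desc[0])))  # step 2: clean the description
    save_sql(rows)                                              # store the cleaned records in MySQL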
Data acquisition (Python crawler)
Python + Selenium is used to crawl the job-posting information from the website automatically.
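Below is a minimal sketch of the Selenium driving loop, assuming Chrome and the 51job search page; the element locators ("kwdselectid", "next") and the page count are hypothetical and need to be adapted to the actual page structure. The rendered HTML of each result page is then handed to the parsing code that follows.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

def fetch_listing_pages(keyword="大数据", pages=3):
    # Open the search site, submit the keyword and yield each rendered result page.
    driver = webdriver.Chrome()
    try:
        driver.get("https://search.51job.com/")                 # assumed target site
        box = driver.find_element(By.ID, "kwdselectid")          # hypothetical search-box locator
        box.send_keys(keyword)
        box.send_keys(Keys.ENTER)
        for _ in range(pages):
            time.sleep(2)                                        # crude wait for the page to render
            yield driver.page_source                             # hand the HTML to get_url() below
            driver.find_element(By.CLASS_NAME, "next").click()   # hypothetical "next page" button
    finally:
        driver.quit()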
Main code:
from selenium import webdriver
import requests
import re
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import time
from fake_useragent import UserAgent
import requests.exceptions
def get_url(html):
    # Parse a listing page: extract job title, detail-page URL, city, salary and posting date.
    r = '<a target="_blank" title="(.*?)" href="(.*?)".*?>.*?class="t3">(.*?)<.*?class="t4">(.*?)<.*?class="t5">(.*?)</span>'
    list1 = re.findall(re.compile(r, re.S), html)
    return list1
proxy = None
proxy_url = "http://127.0.0.1:5000/get"   # local proxy-pool API that returns one proxy per request
max_count = 5

def get_proxy():
    # Fetch a fresh proxy address from the local proxy pool.
    global proxy
    try:
        response = requests.get(proxy_url)
        if response.status_code == 200:
            return response.text
        return None
    except requests.exceptions.ConnectionError:
        # The proxy pool is temporarily unreachable: wait and retry.
        time.sleep(20)
        return get_proxy()
def get_zhiwei(url):
    # Fetch a job-detail page and extract the job-description block,
    # switching to a proxy from the pool when the request is blocked.
    global proxy
    # Load fake_useragent data from a local JSON cache instead of its online endpoint.
    ua = UserAgent(path=r"F:\pycharm\JOB/fake_useragent_0.1.11.json")
    headers = {
        "User-Agent": ua.random
    }
    try:
        if proxy:
            print("Using proxy")
            proxies = {
                "http": "http://" + proxy
            }
            html = requests.get(url, headers=headers, proxies=proxies)
        else:
            html = requests.get(url, headers=headers)
        html.encoding = html.apparent_encoding
        if html.status_code == 200:
            r = '<div class="bmsg job_msg inbox">(.*?)<div class="mt10"'
            zhiwei_list = re.findall(re.compile(r, re.S), html.text)
            if zhiwei_list:
                return zhiwei_list
            else:
                return ["null"]
        else:
            # Blocked or failed: ask the pool for a new proxy and retry.
            proxy = get_proxy()
            if proxy:
                print("Switched to proxy:", proxy)
                return get_zhiwei(url)
            else:
                return ["null"]
    except requests.exceptions.ConnectionError:
        time.sleep(10)
        return ["null"]
def ord_zhiwei(url):
    # Fetch a job-detail page without a proxy and extract the job-description block.
    ua = UserAgent(path=r"F:\pycharm\JOB/fake_useragent_0.1.11.json")
    headers = {
        "User-Agent": ua.random
    }
    try:
        html = requests.get(url, headers=headers)
        html.encoding = html.apparent_encoding
        r = '<div class="bmsg job_msg inbox">(.*?)<div class="mt10"'
        zhiwei_list = re.findall(re.compile(r, re.S), html.text)
        if zhiwei_list:
            return zhiwei_list
        else:
            return ["null"]
    except requests.exceptions.RequestException as e:
        print(e)
        return ["null"]
def save_sql(list1):
    # Persist the crawled records into MySQL (database "zhiwei", table "job_2").
    connect = pymysql.connect(host="localhost", user="root", password="123456",
                              database="zhiwei", charset="utf8")
    cursor = connect.cursor()
    sql1 = """CREATE TABLE IF NOT EXISTS job_2(
        name varchar(100),
        web varchar(100),
        city varchar(20),
        salary varchar(20),
        date varchar(20),
        job_description varchar(10000)) CHARSET=utf8"""
    sql2 = """INSERT INTO job_2(name,web,city,salary,date,job_description) VALUES(%s,%s,%s,%s,%s,%s)"""
    try:
        cursor.execute(sql1)
        cursor.executemany(sql2, list1)
        connect.commit()
    except pymysql.MySQLError:
        connect.rollback()
    connect.close()
def etl_data(str):
    # Strip HTML tags and whitespace noise from the raw job-description text.
    str = str.replace("\r","").replace("\n","").replace("\t","").replace("<p>","") \
        .replace("</p>","").replace("<span>","").replace("</span>","").replace("<br>","") \
        .replace("<br/>","").replace(" ",