利用python爬取在前程无忧网搜索python关键字出现的最新的招聘数据,保存到本地Excel,进行数据查看和预处理,然后利用matplotlib进行数据分析和可视化。
1. 爬取数据
目标url:https://www.51job.com/
在前程无忧网输入关键字python,搜索有关的岗位数据。翻页查看这些招聘岗位信息,可以发现url翻页的规律。
检查网页源代码,可以找到想要提取的数据。
爬虫代码如下:
import asyncio
import aiohttp
import logging
import datetime
import re
import pandas as pd
# Log INFO and above with a timestamp and level prefix, e.g. "2021-01-01 12:00:00 - INFO: ...".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
# Record the start time; presumably used later to report total crawl duration — the
# rest of the script is outside this view, so confirm against the file's tail.
start = datetime.datetime.now()
# Async crawler for 51job ("前程无忧") search-result pages for the keyword "python".
class Spider(object):
def __init__(self):
self.semaphore = asyncio.Semaphore(6)
self.headers = {
'Connection': 'Keep-Alive',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Host': 'search.51job.com',
'Referer': 'https://search.51job.com/list/000000,000000,0000,00,9,99,Python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24'
}
async def scrape(self, url):
    """Fetch *url* under the concurrency semaphore and return the body as text.

    FIX: the original created the ClientSession and closed it manually; any
    exception between ``session.get`` and ``session.close`` leaked the session
    and its connections. Context managers guarantee both the session and the
    response are released on every path. Behavior is otherwise unchanged,
    including the 1-second politeness pause before reading the body.
    """
    async with self.semaphore:
        async with aiohttp.ClientSession(headers=self.headers) as session:
            async with session.get(url) as response:
                # Pause to space requests out and avoid tripping rate limits.
                await asyncio.sleep(1)
                return await response.text()
async def scrape_index(self, page):
    """Download one search-results page (1-based *page*) and hand it to parse().

    FIX: the original URL contained "°reefrom=99" — a copy/paste mangling of
    "&degreefrom=99" (the "&deg" fragment rendered as "°"). Restored so the
    query string carries the intended, well-formed parameter.
    """
    url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    text = await self.scrape(url)
    await self.parse(text)
    # Space out successive page fetches.
    await asyncio.sleep(1)
async def parse(self, text):
# 正则匹配提取数据
try:
job_name = re.findall('"job_name":"(.*?)",', text) # 职位
company_name = re.findall('"company_name":"(.*?)",', text) # 公司名称
salary = re.findall('"providesalary_text":"(.*?)",', text)
salary = [i.replace('\\', '') for i in salary] # 薪酬 去掉 \ 符号
city = re.findall('"workarea_text":"(.*?)",', text) # 城市
job_welfare = re.findall('"jobwelf":"(.*?)",', text) # 职位福利
attribute_text = re.findall('"attribute_text":(.*?),"companysize_text"', text)
attribute_text = ['|'.join(eval(i)) for i in attribute_text]
companysize = re.findall('"companysize_text":"(.*?)",', text) # 公司规模
category = re.findall(