# Crawl up to 2000 result pages and append one CSV row per job posting.
# Relies on `url_pattern`, `headers`, `time`, `requests`, and `BeautifulSoup`
# being defined/imported earlier in the file.
for page in range(1, 2001):
    # Delay between requests to reduce the chance of anti-crawler blocking.
    time.sleep(5)
    url = url_pattern.format(page)
    response = requests.get(url=url, headers=headers)
    # Declare the page's encoding explicitly; adjust per the actual response
    # (this site serves GBK-encoded pages).
    response.encoding = 'gbk'
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Parse: each posting is a <div class="el">. The first four matches are
    # presumably header/filter rows, not postings — TODO confirm the skip count.
    for row in soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['el'])[4:]:
        job = row.find('p', class_='t1').a['title']
        company = row.find('span', class_='t2').a['title']
        place = row.find('span', class_='t3').get_text()
        salary = row.find('span', class_='t4').get_text()
        date = row.find('span', class_='t5').get_text()
        detail_url = row.find('p', class_='t1').a['href']
        # Append mode so rows accumulate across pages; utf-8-sig keeps Excel happy.
        with open('intro_job.csv', 'a+', encoding='utf-8-sig') as f:
            f.write(job + ',' + company + ',' + place + ',' + salary + ',' + date + ',' + detail_url + '\n')
爬取数据结果：以下展示部分爬取结果。
数据预处理阶段主要为了去除不完整的数据,例如有些职务的薪资未明确标出,可以采用丢弃此条数据的方式,或者使用全局平均值之类的处理方法,这里采用直接丢弃的方法。
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import math
import re
# Load the scraped listings. The scraper above writes raw data rows with NO
# header line, so supply the column names explicitly (header=None + names)
# rather than expecting a header row in the file.
# NOTE(review): assumes intro_job.csv was produced solely by the scraper in
# this file — confirm no header was added manually.
df = pd.read_csv(
    'intro_job.csv',
    encoding='utf-8-sig',
    header=None,
    names=['job', 'company', 'place', 'salary', 'date', 'detail_url'],
    usecols=['job', 'company', 'place', 'salary', 'date'],
)
# Extract each field as a NumPy array for the processing steps below.
job_array = df['job'].values
company_array = df['company'].values
place_array = df['place'].values
salary_array = df['salary'].values
date_array = df[‘date’