I spent a few days tinkering with this code, and it scrapes basically all of the job-related data from 51job; you can add or drop fields as you need. The code is a bit rough, but it ran for me without throwing any notable errors, and it should be suitable for beginners to learn from. Without further ado, here it is:
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from time import sleep
import pymysql
import re
class Crawler:
    def __init__(self):
        self.wd = webdriver.Chrome()
        self.wd.implicitly_wait(20)
        self.DBHOST = "localhost"
        self.DBUSER = "root"
        self.DBPASS = "123456"
        self.DBNAME = "51job"

    # Scrape the job listings on the current page
    def getData(self, len_Css):
        rows = []
        for i in range(1, len_Css):
            # Job title
            job_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.jname.at'.format(i)).text
            # Company name
            company_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).text
            # City / work experience / education / number of openings
            al = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.d.at'.format(i)).text.split('|')
            # Handle the different cases: some postings omit education, some omit work experience
            if len(al) == 4:
                city = al[0]
                experience = al[1]
                education = al[2]
                recruits_Number = al[3]
            elif len(al) == 3:
                city = al[0]
                experience = al[1]
                education = None
                recruits_Number = al[2]
            elif len(al) == 2:
                city = al[0]
                experience = None
                education = None
                recruits_Number = al[1]
            else:
                city = None
                experience = None
                education = None
                recruits_Number = None
            # Posting date
            release_Date = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.time'.format(i)).text
            # Company benefits
            # Some postings have no benefits element; the custom NoExists method checks whether it can be located
            # if self.NoExists('div.j_joblist > div:nth-child({0}) p.tags'.format(i)):
            #     welfare = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.tags'.format(i)).get_attribute("title")
            # else:
            #     welfare = None
            # Salary
            # Some postings have a salary element whose text is an empty string; guard against that
            if bool(self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text):
                salary = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text
            else:
                salary = None
            # Company type
            company_type = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.int.at'.format(i)).text
            # URL of the job detail page
            job_ex_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.el[target=_blank]'.format(i)).get_attribute("href")
            # URL of the company page
            company_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).get_attribute("href")
            rows.append([job_name, company_name, city, experience, education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url])
        return rows

    # Save the scraped rows into the database
    def saveData(self, rows):
        db = pymysql.connect(host=self.DBHOST, user=self.DBUSER, password=self.DBPASS, database=self.DBNAME)
        cur = db.cursor()
        sql = "INSERT INTO ods_51job_job(job_name, company_name, job_city, job_experience, job_education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url) " \
              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            for row in rows:
                cur.execute(sql, row)
            db.commit()
        except pymysql.Error as e:
            print(e)
        finally:
            cur.close()
            db.close()

    # Scrape and store one page at a time, advancing automatically until the last page
    def scrapingData(self, City, keyWord, start_Page):
        wait = WebDriverWait(self.wd, 20, 0.5)
        # Work out the total number of pages
        isNextpage = self.wd.find_element(By.CSS_SELECTOR,
            'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_page > div > div > div > span:nth-child(1)').text
        result = re.findall(r'\d+', isNextpage)
        condition = int(result[0])
        sleep(2)
        print('City code: %s  Keyword: %s  Total pages: %d' % (City, keyWord, condition))
        while start_Page <= condition:
            # Number of job postings on the current page (usually 50)
            pubCss = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_joblist > div.e')))
            # Scrape the current page and save it to the database
            rows1 = self.getData(len(pubCss) + 1)
            self.saveData(rows1)
            print('\tPage %d scraped;' % start_Page)
            # Check whether this is the last page
            if start_Page < condition:
                nextpage = self.wd.find_element(By.CSS_SELECTOR, 'li.next a[style="cursor: pointer;"]')
                nextpage.click()
                self.wd.refresh()
                start_Page += 1
            else:
                print('Finished all pages for this city/keyword!')
                break
            sleep(2)

    # Return True if the element identified by the CSS selector can be located, False otherwise
    def NoExists(self, Css):
        try:
            self.wd.find_element(By.CSS_SELECTOR, Css)
            return True
        except NoSuchElementException:
            return False

    # Loop over every city and keyword automatically
    def getUrl(self, workCity, startPage, keywords):
        # If the crawl is interrupted, change the starting indexes of i and j plus start_page, then resume
        for i in range(0, len(workCity)):
            for j in range(0, len(keywords)):
                suffix = str(
                    startPage) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
                url = 'https://search.51job.com/list/' + str(
                    workCity[i]) + ',000000,0000,00,9,99,' + keywords[j] + ',2,' + suffix
                self.wd.get(url)
                self.scrapingData(workCity[i], keywords[j], startPage)
                # A changed start_page only applies to the first combination; reset it to 1 so the next keyword starts from page 1 again
                if startPage > 1:
                    startPage = 1

# Codes of popular cities
# {"北京", "010000"}, {"上海", "020000"}, {"广州", "030200"}, {"深圳", "040000"}, {"武汉", "180200"},
# {"西安", "200200"}, {"杭州", "080200"}, {"南京", "070200"}, {"成都", "090200"}, {"重庆", "060000"},
# {"东莞", "030800"}, {"大连", "230300"}, {"沈阳", "230200"}, {"苏州", "070300"}, {"昆明", "250200"},
# {"长沙", "190200"}, {"合肥", "150200"}, {"宁波", "080300"}, {"郑州", "170200"}, {"天津", "050000"},
# {"青岛", "120300"}, {"哈尔滨", "220200"}, {"长春", "240200"}, {"福州", "110200"}, {"珠三角", "01"};
if __name__ == '__main__':
    # Put the city codes and keywords to crawl into the lists below; start_page is the page to start crawling from
    cities = ['040000', '080200', '070200', '190200', '090200', '180200']
    keyword = ['大数据', 'python', '爬虫', 'Hadoop', '数据分析师', 'Hadoop']
    start_page = 1
    a = Crawler()
    a.getUrl(cities, start_page, keyword)
In the code above I commented out the company-benefits field: almost every page has a few postings with no benefits element, handling those misses takes too long, and on large crawls it became painful, so I simply dropped it. Also, the CSS selectors were copied straight from the browser, so many of them could be trimmed and optimized (I was lazy about that); you could switch to XPath for something more concise. Finally, you need to create the database table yourself; when connecting, just update the connection parameters in the code and the column names in the SQL, which is fairly straightforward.
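For reference, here is a minimal sketch of a table that matches the INSERT statement in saveData(). The column names come from that statement; the VARCHAR lengths and charset are my own assumptions (everything is stored as plain text), so adjust them to your needs along with the connection parameters:

import pymysql

# Sketch only: column types are assumed, not taken from the original post.
CREATE_SQL = """
CREATE TABLE IF NOT EXISTS ods_51job_job (
    job_name        VARCHAR(255),
    company_name    VARCHAR(255),
    job_city        VARCHAR(100),
    job_experience  VARCHAR(100),
    job_education   VARCHAR(100),
    recruits_Number VARCHAR(100),
    release_Date    VARCHAR(50),
    salary          VARCHAR(100),
    company_type    VARCHAR(255),
    job_ex_url      VARCHAR(500),
    company_url     VARCHAR(500)
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
try:
    with db.cursor() as cur:
        cur.execute(CREATE_SQL)
    db.commit()
finally:
    db.close()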
When I ran the code myself, the errors I hit were usually timeout errors after crawling for a long while. You can lengthen the wait times a bit, but it will probably still error out eventually on a large crawl: 51job is fairly permissive, but it does apply anti-crawling measures if you scrape too much, just not as harshly as Boss直聘, which banned my IP for two days after only a few thousand records (that happened to me more than once 😤). Finally, after an error you have to manually change the parameters and resume the crawl, which is a bit of a hassle and could certainly be improved, but I'm too lazy to do it; probably not many people will read this anyway, and it works well enough for me.
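If you do want to avoid restarting by hand, one option is to wrap each city/keyword combination in a try/except that catches TimeoutException and retries. The sketch below is only a rough idea under my own assumptions: crawl_with_retry() and max_retries are hypothetical additions, not part of the original script, and a real resume would need scrapingData() to report which page it stopped on (here a failed combination is simply retried from its starting page):

from selenium.common.exceptions import TimeoutException, WebDriverException

# Hypothetical wrapper around the Crawler class above; not from the original post.
def crawl_with_retry(crawler, cities, keywords, start_page=1, max_retries=3):
    for city in cities:
        for keyword in keywords:
            page = start_page
            for attempt in range(max_retries):
                try:
                    url = ('https://search.51job.com/list/' + city + ',000000,0000,00,9,99,'
                           + keyword + ',2,' + str(page)
                           + '.html?lang=c&postchannel=0000&workyear=99&cotype=99'
                           + '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
                           + '&dibiaoid=0&line=&welfare=')
                    crawler.wd.get(url)
                    crawler.scrapingData(city, keyword, page)
                    break  # this combination finished without a timeout
                except (TimeoutException, WebDriverException) as e:
                    # Naive handling: log the failure and retry the whole combination.
                    print('Attempt %d failed for %s / %s: %s' % (attempt + 1, city, keyword, e))
            start_page = 1  # later combinations always start from the first page
        start_page = 1

# Usage, reusing the lists from the __main__ block:
# crawl_with_retry(Crawler(), cities, keyword, start_page)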