1. Introduction
Before writing the crawler I tried a plain requests GET against the Lagou site and found that the returned HTML does not contain the job listings I wanted, because the page renders them dynamically. There are several ways around this: you can open the browser's developer tools, capture the AJAX request behind the page, and parse the JSON it returns; or you can drive a real (or headless) browser such as PhantomJS or Chrome through Selenium. I won't cover installing PhantomJS here; installation guides are easy to find online. In this post I drive Chrome via chromedriver (the PhantomJS lines are left commented out in the code). Below is how I crawled Lagou.
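For reference, the JSON route looks roughly like the sketch below. Everything in it (the endpoint, the form fields, the JSON structure) is an assumption you would confirm by capturing the real request in the browser's developer tools (Network tab), cookies included:

import requests

# Sketch of the JSON route. The endpoint, form fields and headers below are
# assumptions: copy the real ones from the captured request in your devtools,
# including any cookies the site requires.
ajax_url = 'https://www.lagou.com/jobs/positionAjax.json'
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90',
}
form = {'first': 'true', 'pn': 1, 'kd': '数据分析'}  # page number and search keyword
resp = requests.post(ajax_url, headers=headers, data=form)
data = resp.json()
# the job list would then sit somewhere inside the JSON, e.g.
# data['content']['positionResult']['result']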
2. Locating the job detail links
First, open the Lagou home page and search for a position. I typed in 数据分析 (data analysis), which brings up a results page like the one shown below:
The results page carries most of the job information, but not all of it; what we really want to scrape sits behind each posting's link, on the detail page.
So the first task is to find those links. That is easy: inspect the page elements and the link markup shows up right away, as below.
Locating the links with bs4 takes only a few lines:
Note that the links are pushed onto a queue; the queue setup appears in the full code at the end of the post.
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul', class_='item_con_list', style='display: block;')
urllinks = req.find_all('a', class_='position_link')
for i in urllinks:
    print(i.get('href'))
    que.put(i.get('href'))
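For reference, que is a plain FIFO queue from the standard library, created earlier in the full code:

import queue
que = queue.Queue()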
3. Scraping the details
Once we have the links, the rest is simple: pop each one off the queue, load the detail page, and parse out the fields. The code is below.
Note: I store the data in MySQL; you could of course just write it to a local file instead.
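Before running it, the lagou table has to exist. A minimal schema matching the INSERT below could look like this; the column types and lengths are my assumptions, not part of the original setup:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='*********',
                     db='test', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS lagou (
        job_name VARCHAR(100),
        com_name VARCHAR(100),
        com_addr VARCHAR(100),
        com_cat  VARCHAR(100),
        com_qua  VARCHAR(100),
        com_peo  VARCHAR(100),
        exp1     VARCHAR(50),
        edu      VARCHAR(50),
        salary   VARCHAR(50),
        com_resp TEXT
    ) DEFAULT CHARSET=utf8
""")
db.commit()
cursor.close()
db.close()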
driver2 = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
tag_re = re.compile('<[^>]*>')  # strips HTML tags from prettify() output
slash_re = re.compile('/')
while not que.empty():  # a Queue object is always truthy, so test empty() instead
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')
        job_info = bs2.find('div', class_='job-name')
        ### department / company line above the job title
        company = job_info.find('div', class_='company')
        company = tag_re.sub('', company.prettify())
        ### job title
        job = job_info.find('span', class_='name')
        job = tag_re.sub('', job.prettify()).strip('\n')
        ### salary, location, experience, education
        job_req = bs2.find('dd', class_='job_request')
        all_info = []
        for i in job_req.find_all('span'):
            all_info.append(tag_re.sub('', i.prettify()))
        salary = slash_re.sub('', all_info[0]).strip('\n')
        address = slash_re.sub('', all_info[1]).strip('\n')
        exp = slash_re.sub('', all_info[2]).strip('\n')
        edu = slash_re.sub('', all_info[3]).strip('\n')
        ### job detail block
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        ### job perks
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        job_lu = tag_re.sub('', job_lu.prettify())
        ### responsibilities and requirements
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        job_res = str(tag_re.sub('', job_res.prettify()).strip('\n').strip())
        ### work address
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        job_ad = tag_re.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        ### company name
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        ### company profile: field, stage, size
        com_cat = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = []
        for i in com_cat:
            all_info2.append(tag_re.sub('', i.prettify()))
        # strip the page labels: '领域' (field), '发展阶段' (stage), '规模' (size)
        com_cat = re.sub('领域', '', all_info2[0].strip('\n')).strip()
        com_qua = re.sub('发展阶段', '', all_info2[1].strip('\n')).strip()
        com_peo = re.sub('规模', '', all_info2[2].strip('\n')).strip()
        print(company, job, salary, address, exp, edu, job_lu, job_res, job_ad,
              com_name, com_cat, com_qua, com_peo)
        db = pymysql.connect(host='localhost', user='root', password='*********',
                             db='test', charset='utf8')
        cursor = db.cursor()
        # parameterized query: descriptions can contain quotes, so never
        # splice the values into the SQL string directly
        sql = ("INSERT INTO lagou (job_name, com_name, com_addr, com_cat, com_qua, "
               "com_peo, exp1, edu, salary, com_resp) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo,
                             exp, edu, salary, job_res))
        db.commit()
        cursor.close()
        db.close()
    except Exception as e:
        print('Could not scrape this page:', e)
4. Results
The scraped records look like this:
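To spot-check what actually landed in the database, a quick query does the job (a sketch, assuming the lagou table from section 3):

import pymysql

db = pymysql.connect(host='localhost', user='root', password='*********',
                     db='test', charset='utf8')
cursor = db.cursor()
cursor.execute('SELECT job_name, com_name, salary, edu FROM lagou LIMIT 5')
for row in cursor.fetchall():
    print(row)
cursor.close()
db.close()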
5. Full code
from bs4 import BeautifulSoup
import requests
import urllib
import queue
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
# search keyword, job type and city, URL-encoded for the query string
what1 = '数据分析'
what2 = '全职'
what3 = '广州'
what1 = urllib.parse.quote(what1)
what2 = urllib.parse.quote(what2)
what3 = urllib.parse.quote(what3)

# driver = webdriver.PhantomJS()
driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
url = 'https://www.lagou.com/jobs/list_%s?px=default&gx=%s&city=%s#order' % (what1, what2, what3)
url2 = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'
driver.implicitly_wait(100)
driver.get(url)

# collect the detail links on the first results page
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul', class_='item_con_list', style='display: block;')
urllinks = req.find_all('a', class_='position_link')
que = queue.Queue()
for i in urllinks:
    print(i.get('href'))
    que.put(i.get('href'))
link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
link_next.click()
# walk through the remaining result pages (10 pages in total here)
times = 0
while True:
    times += 1
    driver.implicitly_wait(10)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for i in urllinks:
        print(i.get('href'))
        que.put(i.get('href'))
    print(times)
    if times == 9:
        break
    link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
    link_next.click()
    sleep(3)
# driver2 = webdriver.PhantomJS()
driver2 = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
tag_re = re.compile('<[^>]*>')  # strips HTML tags from prettify() output
slash_re = re.compile('/')
while not que.empty():  # a Queue object is always truthy, so test empty() instead
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')
        job_info = bs2.find('div', class_='job-name')
        ### department / company line above the job title
        company = job_info.find('div', class_='company')
        company = tag_re.sub('', company.prettify())
        ### job title
        job = job_info.find('span', class_='name')
        job = tag_re.sub('', job.prettify()).strip('\n')
        ### salary, location, experience, education
        job_req = bs2.find('dd', class_='job_request')
        all_info = []
        for i in job_req.find_all('span'):
            all_info.append(tag_re.sub('', i.prettify()))
        salary = slash_re.sub('', all_info[0]).strip('\n')
        address = slash_re.sub('', all_info[1]).strip('\n')
        exp = slash_re.sub('', all_info[2]).strip('\n')
        edu = slash_re.sub('', all_info[3]).strip('\n')
        ### job detail block
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        ### job perks
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        job_lu = tag_re.sub('', job_lu.prettify())
        ### responsibilities and requirements
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        job_res = str(tag_re.sub('', job_res.prettify()).strip('\n').strip())
        ### work address
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        job_ad = tag_re.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        ### company name
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        ### company profile: field, stage, size
        com_cat = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = []
        for i in com_cat:
            all_info2.append(tag_re.sub('', i.prettify()))
        # strip the page labels: '领域' (field), '发展阶段' (stage), '规模' (size)
        com_cat = re.sub('领域', '', all_info2[0].strip('\n')).strip()
        com_qua = re.sub('发展阶段', '', all_info2[1].strip('\n')).strip()
        com_peo = re.sub('规模', '', all_info2[2].strip('\n')).strip()
        print(company, job, salary, address, exp, edu, job_lu, job_res, job_ad,
              com_name, com_cat, com_qua, com_peo)
        db = pymysql.connect(host='localhost', user='root', password='*********',
                             db='test', charset='utf8')
        cursor = db.cursor()
        # parameterized query: descriptions can contain quotes, so never
        # splice the values into the SQL string directly
        sql = ("INSERT INTO lagou (job_name, com_name, com_addr, com_cat, com_qua, "
               "com_peo, exp1, edu, salary, com_resp) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo,
                             exp, edu, salary, job_res))
        db.commit()
        cursor.close()
        db.close()
    except Exception as e:
        print('Could not scrape this page:', e)
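One design note: implicitly_wait combined with sleep(3) is a blunt way to wait for pages to render. A more targeted alternative, sketched below using the ui module already imported above, is an explicit wait that blocks until the listing element actually exists:

# sketch: wait up to 10 seconds for the listing <ul> instead of sleeping blindly
wait = ui.WebDriverWait(driver, 10)
wait.until(lambda d: d.find_element_by_xpath("//ul[@class='item_con_list']"))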