这次爬取拉勾网的职位数据。进入拉勾官网,在搜索框中输入 python 或 java,不难发现跳转之后的 url 有固定的形式,
可以构造出如下的url
# Read the job keyword from the user and build the Lagou search URL.
# city=%E5%85%A8%E5%9B%BD is the URL-encoded "全国" (nationwide).
word=input("请输入职位:")
url="https://www.lagou.com/jobs/list_{}?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=".format(word)
搜索之后点击下一页页面url没变化,不难发现数据是由ajax请求获取的,当然我试过直接访问其接口数据,但失败了,所以我使用selenium模拟点击下一页获取数据
当点击到第30页时,下一页无法继续点击,右键检查元素,发现class属性多了个pager_next_disabled,通过判断节点class属性是否包含pager_next_disabled来判断节点是否可以继续点击
相关代码如下
def get_url(driver, word):
    """Page through the search results, scraping each page.

    Calls get_detail() on every page, then clicks the "next page" button
    until it is tagged with the pager_next_disabled class (last page) or
    cannot be found at all.

    :param driver: selenium WebDriver already showing the first results page
    :param word: search keyword, passed through to get_detail()
    """
    while True:
        get_detail(driver, word)  # print and save this page's data
        time.sleep(1)  # let the ajax-rendered pager settle
        # find_element_* raises NoSuchElementException rather than returning
        # a falsy value, so a plain `if next_pager:` check can never take its
        # else branch and a missing button would crash the scraper.
        try:
            next_pager = driver.find_element_by_class_name("pager_next")
        except NoSuchElementException:
            return
        # The last page marks the button with pager_next_disabled.
        if "pager_next_disabled" in next_pager.get_attribute("class"):
            return
        next_pager.click()
剩下的用 BeautifulSoup 和正则表达式提取数据,以及简单的数据清洗,这里就不再赘述了。
完整代码如下
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# Use selenium to click "next page" until the button can no longer be clicked.
def get_url(driver, word):
    """Page through the search results, scraping each page.

    Calls get_detail() on every page, then clicks the "next page" button
    until it is tagged with the pager_next_disabled class (last page) or
    cannot be found at all.

    :param driver: selenium WebDriver already showing the first results page
    :param word: search keyword, passed through to get_detail()
    """
    while True:
        get_detail(driver, word)  # print and save this page's data
        time.sleep(1)  # let the ajax-rendered pager settle
        # find_element_* raises NoSuchElementException rather than returning
        # a falsy value, so a plain `if next_pager:` check can never take its
        # else branch and a missing button would crash the scraper.
        try:
            next_pager = driver.find_element_by_class_name("pager_next")
        except NoSuchElementException:
            return
        # The last page marks the button with pager_next_disabled.
        if "pager_next_disabled" in next_pager.get_attribute("class"):
            return
        next_pager.click()
# Print each page's rows and append them to a txt file.
def get_detail(driver, word):
    """Extract the job rows from the currently displayed results page.

    Parses driver.page_source with BeautifulSoup, pulls each field of every
    .con_list_item entry with a regex, prints the fields and appends one
    tab-separated line per job to "<word>.txt".

    :param driver: selenium WebDriver currently showing a results page
    :param word: search keyword; also used as the output file name
    """
    time.sleep(1)  # let the ajax-rendered list settle before parsing
    soup = BeautifulSoup(driver.page_source, 'lxml')
    lis = soup.select(".con_list_item")
    with open("{}.txt".format(word), 'a', encoding="utf-8") as f:
        for li in lis:
            html = str(li)

            def first(pattern):
                # re.findall comes back empty when a row is missing a field;
                # fall back to "" instead of crashing on [0].
                found = re.findall(pattern, html, re.S)
                return found[0] if found else ""

            job_name = first(r'<h3.*?>(.*?)</h3>')
            city = first(r'class="add".*?<em>(.*?)</em>')
            start_time = first(r'class="format-time">(.*?)</span>')
            money = first(r'class="money">(.*?)</span>')
            experience = first(r'money.*?-->(.*?)</div>').strip()
            company = first(r'company_name">.*?>(.*?)</a>')
            industry = first(r'class="industry">(.*?)</div>').strip()
            welfare = first(r'class="li_b_r">(.*?)</div>')
            print("工作名称:", job_name)
            print("工作地点:", city)
            print("发布时间:", start_time)
            print("工资:", money)
            print("经验:", experience)
            print("公司:", company)
            print("公司规模:", industry)
            print("福利:", welfare)
            print('............................')
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(
                job_name, city, start_time, money,
                experience, company, industry, welfare))
if __name__ == '__main__':
    # Ask the user for a job keyword and build the matching search URL.
    keyword = input("请输入职位:")
    search_url = "https://www.lagou.com/jobs/list_{}?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=".format(keyword)
    # Open the page in Chrome and walk through every results page.
    browser = webdriver.Chrome()
    browser.get(search_url)
    browser.maximize_window()
    get_url(browser, keyword)
数据截图如下: