爬取拉勾网职位信息

最新推荐文章于 2024-04-08 08:44:13 发布

在梦里翻山越岭

最新推荐文章于 2024-04-08 08:44:13 发布

阅读量147

点赞数 1

分类专栏：爬虫

本文链接：https://blog.csdn.net/qq_42285296/article/details/93937025

版权

爬虫专栏收录该内容

0 篇文章 0 订阅

订阅专栏

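话不多说，直接上代码：下面的脚本使用 Selenium 驱动 Chrome 打开拉勾网的 Python 职位列表页并登录，逐页在新窗口中打开每个职位的详情页，用 lxml 解析出职位名称、薪资、地区、经验要求、职位诱惑、职位描述和公司地址，最后以 JSON 文本追加写入本地文件。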

# coding=utf-8
from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.by import  By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import re
import json
class LaGou(object):
	"""Selenium-driven crawler for the lagou.com Python job listing.

	The crawler opens the listing URL in Chrome, goes through the login form,
	then walks the paginated result list. Every posting link found on a list
	page is opened in a second browser window, parsed with lxml and appended
	to ``OUTPUT_FILE`` as JSON text.
	"""

	# Path the scraped postings are appended to. The content is JSON text
	# (one indented object per posting) even though the name ends in ".csv".
	OUTPUT_FILE = '拉钩职位信息2.csv'

	def __init__(self):
		# Creating the instance launches a Chrome browser immediately.
		self.driver = webdriver.Chrome()
		self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
		# Every posting parsed so far, in crawl order.
		self.poistion_li = []

	@staticmethod
	def _clean_text(text):
		"""Return *text* with all whitespace and '/' characters removed."""
		return re.sub(r'[\s/]', '', text)

	def parse_page_list(self, page_souce):
		"""Visit every posting linked from one list page.

		:param page_souce: HTML of a result-list page.
		:return: None. Each posting is handled by ``request_detail_page``.
		"""
		html = etree.HTML(page_souce)
		detail_links = html.xpath("//a[@class='position_link']/@href")
		for link in detail_links:
			self.request_detail_page(link)
			# Pause between detail pages.
			time.sleep(1)

	def request_detail_page(self, url):
		"""Open *url* in a new window, parse it, then return to the list window.

		:param url: address of a posting detail page.
		:raises: whatever the explicit wait or ``parse_detail_page`` raises;
			the detail window is closed and focus is restored first.
		"""
		list_handle = self.driver.current_window_handle
		# Pass the URL as a script argument rather than formatting it into the
		# JavaScript source, so quotes in the href cannot break the script.
		self.driver.execute_script('window.open(arguments[0])', url)
		# Switch to the window that was just opened.
		self.driver.switch_to.window(self.driver.window_handles[-1])
		try:
			# Wait for the job title block BEFORE reading page_source;
			# reading it first may capture a page that has not rendered yet.
			WebDriverWait(self.driver, timeout=10).until(
				EC.presence_of_element_located((By.XPATH, '//div[@class="job-name"]'))
			)
			self.parse_detail_page(self.driver.page_source)
		finally:
			# Always close the detail window and go back to the list window,
			# otherwise a failure here would leave stray windows behind and
			# the driver focused on the wrong one.
			self.driver.close()
			self.driver.switch_to.window(list_handle)

	def parse_detail_page(self, souce):
		"""Extract one posting from detail-page HTML, record it and save it.

		:param souce: HTML of a posting detail page.
		:raises IndexError: if one of the indexed XPath lookups matches nothing.
		"""
		html = etree.HTML(souce)
		# xpath() always returns a list, so [0] picks the single match.
		job_name = html.xpath('//div[@class="job-name"]/@title')[0].strip()
		job_request = html.xpath('//dd[@class="job_request"]/p')[0]
		salary = self._clean_text(job_request.xpath('./span[1]//text()')[0])
		area = self._clean_text(job_request.xpath('./span[2]//text()')[0])
		expirence = self._clean_text(job_request.xpath('./span[3]//text()')[0])
		poistion_y = html.xpath('//dd[@class="job-advantage"]/p//text()')[0].strip()
		job_detail = self._clean_text(
			''.join(html.xpath('//div[@class="job-detail"]//text()')))
		company_addr = '-'.join(html.xpath("//div[@class='work_addr']/a//text()"))
		item = {
			"job_name": job_name,
			"salary": salary,
			"area": area,
			"exprience": expirence,
			"position_y": poistion_y,
			"job_detail": job_detail,
			"company_addr": company_addr,
		}
		self.poistion_li.append(item)
		# Persist only the new posting. The file is opened in append mode, so
		# writing the whole accumulated list each time would duplicate every
		# earlier posting on every call.
		self.save_content(item)

	def save_content(self, item):
		"""Append *item* to ``OUTPUT_FILE`` as indented JSON followed by a newline.

		:param item: any JSON-serialisable object.
		"""
		json_content = json.dumps(item, ensure_ascii=False, indent=3)
		with open(self.OUTPUT_FILE, 'a', encoding='utf-8') as f:
			f.write(json_content + '\n')

	def get_index(self):
		"""Load the listing URL and step through the login form.

		The values typed into the form are placeholders and must be replaced
		with real ones (presumably account and password, in that order;
		confirm against the page).
		"""
		self.driver.get(self.url)
		# find_element(By...) replaces the find_element_by_* helpers, which
		# were removed in Selenium 4.3.
		self.driver.find_element(By.CLASS_NAME, 'login').click()
		self.driver.find_element(By.CLASS_NAME, 'input').send_keys('xxxxxxx')
		self.driver.find_element(
			By.XPATH, '//div[@class="input_item clearfix"]/input').send_keys('xxxx')
		self.driver.find_element(
			By.XPATH, '//div[contains(@class,"sense_login_password")]').click()
		# Fixed pause after clicking before the crawl continues.
		time.sleep(5)

	def run(self):
		"""Crawl every list page until the pager's last button is disabled.

		The browser is shut down when the crawl ends, whether it finished
		normally or raised.
		"""
		pager_xpath = '//div[@class="pager_container"]/span[last()]'
		try:
			# Start by requesting the first page.
			self.get_index()
			while True:
				# parse_page_list visits every posting itself and returns
				# nothing, so no further detail request is made here.
				self.parse_page_list(self.driver.page_source)
				# Explicit wait: the pager must exist before it is inspected.
				WebDriverWait(self.driver, timeout=10).until(
					EC.presence_of_element_located((By.XPATH, pager_xpath))
				)
				try:
					next_btn = self.driver.find_element(By.XPATH, pager_xpath)
					if "pager_next_disabled" in next_btn.get_attribute('class'):
						break
					# Clicking the button loads the next list page.
					next_btn.click()
				except Exception as e:
					# Stop instead of looping: carrying on would scrape the
					# same list page again and again.
					print('failed to move to the next page: %s' % e)
					break
				time.sleep(1)
		finally:
			self.driver.quit()




# Script entry point: constructing LaGou launches the browser, run() crawls.
if __name__ == '__main__':
	LaGou().run()