from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class LagouSpider(object):
    driver_path = r"/home/charging/chromedriver"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.positions = []
    def run(self):
        self.driver.get(self.url)
        while True:
            # Wait until the "next page" button is present, then parse the current list page
            WebDriverWait(driver=self.driver, timeout=10).until(EC.presence_of_all_elements_located((By.XPATH,
                "//div[@class='pager_container']/span[@action='next']")))
            source = self.driver.page_source
            self.parse_list_page(source)
            # Locate the "next page" button on the current page
            next_button = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[@action='next']")
            # Stop if this is the last page
            if 'pager_next pager_next_disabled' in next_button.get_attribute("class"):
                break
            else:
                # next_button.click() does not work here, so trigger the click via JavaScript
                self.driver.execute_script("arguments[0].click();", next_button)
            time.sleep(1)  # slow down between pages to avoid crawling too fast
    # Collect the links to every position's detail page on the current list page
    def parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:  # visit each position link in turn
            self.request_detail_page(link)
            time.sleep(1)
    def request_detail_page(self, url):
        # self.driver.get(url)  # navigating away would lose the list page, so open a new tab instead
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/h1[@class='name']")))
        source = self.driver.page_source  # grab the detail page's source
        self.parse_detail_page(source)
        self.driver.close()  # close the detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the list page
    # Parse the detailed information of a single position
    def parse_detail_page(self, source):
        html = etree.HTML(source)  # with requests, response.text would be passed in here instead
        # Position name
        position_name = html.xpath("//h1[@class='name']//text()")[0]
        # City / salary / experience / education live in the job_request spans
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        # Salary
        salary = job_request_spans[0].xpath('.//text()')[0].strip()  # strip() removes surrounding whitespace
        # City, e.g. "/深圳 /": remove the slashes and spaces with a regular expression
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)  # replace whitespace and slashes with an empty string
        # Years of experience
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        # Education
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        # Company name
        company = html.xpath("//h3[@class='fl']//em/text()")[0].strip()
        # Job description: xpath returns a list of text nodes, join them into a single string
        job_desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        # Combine all the fields of this position into a dictionary
        position = {
            'name': position_name,
            'company': company,
            'salary': salary,
            'city': city,
            'work_year': work_years,
            'education': education,
            'job_desc': job_desc
        }
        # Append each dictionary to the list of positions
        self.positions.append(position)
        print(position)
        print('-' * 20)
def main():
    s = LagouSpider()
    s.run()

if __name__ == '__main__':
    main()
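
The scraped positions are only printed and kept in memory in self.positions. As a minimal sketch (the helper name, file name and encoding below are my own additions, not part of the original spider), they could be written out to a CSV file with the standard csv module:

import csv

def save_positions(positions, filename="lagou_positions.csv"):
    # positions is the list of dicts built in parse_detail_page
    fieldnames = ['name', 'company', 'salary', 'city', 'work_year', 'education', 'job_desc']
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(positions)

A call such as save_positions(s.positions) at the end of main() would then persist whatever was collected before the run stopped.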