之前使用requests库爬取拉勾网站招聘信息时,老是出现访问频繁或者因检测到存在爬虫行为被禁止访问。从开始学习爬虫到工作,爬取过一些网站。拉勾算是反爬虫措施做的最变态的,没有之一。
为了获取更加完整的数据信息,今天介绍使用selenium+chromedriver,借助工具进行爬取,以招聘python岗位为关键字。
爬取思路可分为以下几个步骤:
- 爬取第一页页面中所有职位的链接:每个职位有个id。
- 然后根据每个职位的链接,获取职位的详细信息。
- 右键——检查,根据html,选取元素。见代码。
- 第一页职位信息爬取完成后,让浏览器跳转到第二页,继续爬取每一个职位的详细信息,以此类推,直到最后一页。
- 爬取过程如下:
- 完整代码如下:
# project: seleniumChromedriver
# author: liulei
# time: 2019/8/6 9:39
# file: lagouSpider.py
import random
import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class LagouSpider(object):
    """Scrape python job postings from lagou.com with a real Chrome browser.

    Selenium drives Chrome so the site's anti-crawler checks see normal
    browser traffic; lxml parses the rendered page source. Results are
    accumulated as dicts in ``self.positions``.
    """

    # Path to the local chromedriver binary.
    driver_path = r'C:\chromedriver\chromedriver.exe'

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # Search listing URL: keyword "python", city Beijing (URL-encoded).
        self.url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC#filterBox'
        self.positions = []  # one dict per scraped posting

    def run(self):
        """Walk every result page, scraping each posting, until the last page."""
        page_num = 1
        self.driver.get(self.url)
        while True:
            # Wait for the pager to render BEFORE grabbing the page source;
            # grabbing page_source first (as the original did) can capture a
            # half-loaded listing page.
            WebDriverWait(driver=self.driver, timeout=300).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]")
                )
            )
            source = self.driver.page_source
            self.parse_list_page(source, page_num)
            # Selenium 4 style; find_element_by_xpath is deprecated/removed
            # and the file already imports By for this purpose.
            next_btn = self.driver.find_element(
                By.XPATH, "//div[@class='pager_container']/span[last()]"
            )
            # The "next" button carries this class pair only on the last page.
            if 'pager_next pager_next_disabled' in next_btn.get_attribute('class'):
                print("已是最后一页了")
                break
            next_btn.click()
            page_num += 1
            time.sleep(random.randint(1, 3))  # pacing to reduce ban risk

    def parse_list_page(self, source, page_num):
        """Extract every posting link from one result page and scrape each.

        :param source: rendered HTML of the listing page
        :param page_num: 1-based page index, used only for progress output
        """
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for position_num, link in enumerate(links, start=1):
            # Original format string used "% s" by accident (space flag),
            # which printed a stray space in the message.
            print("正在解析第%s页的第%s条数据" % (page_num, position_num))
            self.request_detail_page(link)
            time.sleep(random.randint(1, 3))

    def request_detail_page(self, url):
        """Open *url* in a new tab, scrape it, then return to the list tab."""
        self.driver.execute_script("window.open('%s')" % url)
        # [-1] is the newest tab; more robust than a hard-coded index 1.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        WebDriverWait(self.driver, timeout=300).until(
            EC.presence_of_element_located((By.XPATH, "//h2[@class='name']"))
        )
        source = self.driver.page_source
        try:
            self.parse_detail_page(source)
        finally:
            # Always close the detail tab and switch back, even if parsing
            # raises, so the driver is never stranded on a dead tab.
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one posting's detail page and append a dict to self.positions.

        Assumes the lagou detail-page layout: the h3 spans are, in order,
        salary / city / required experience / education — TODO confirm
        against the live page if the site markup changes.
        """
        html = etree.HTML(source)
        position_name = html.xpath("//h2[@class = 'name']/text()")[0]
        job_request_spans = html.xpath("//h3/span")
        salary = job_request_spans[0].xpath(".//text()")[0]
        # Strip whitespace and the decorative "/" separators around values.
        city = re.sub(r"[\s/]", "", job_request_spans[1].xpath(".//text()")[0])
        work_years = re.sub(r"[\s/]", "", job_request_spans[2].xpath(".//text()")[0])
        education = re.sub(r"[\s/]", "", job_request_spans[3].xpath(".//text()")[0])
        job_desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        job_desc = re.sub(r"\n", "", job_desc)
        company = html.xpath("//em[@class='fl-cn']/text()")[0].strip()
        # NOTE: the misspelled key "postion_name" is kept byte-identical for
        # backward compatibility with any consumer of self.positions.
        position = {
            "company": company,
            "postion_name": position_name,
            "salary": salary,
            "city": city,
            "work_years": work_years,
            "education": education,
            "job_desc": job_desc,
        }
        print(position)
        print('-' * 50)
        self.positions.append(position)


def main():
    """Entry point: build a spider and crawl all result pages."""
    lagou = LagouSpider()
    lagou.run()


if __name__ == '__main__':
    main()