使用 Selenium 爬取拉勾网

from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class LagouSpider(object):
    """Selenium-driven crawler for the Python job listings on lagou.com.

    The spider walks the paginated search result list, opens every job
    posting in a separate browser window, extracts its fields with lxml and
    accumulates one dict per posting in ``self.positions``.
    """

    driver_path = r"/home/charging/chromedriver"
    # XPath of the pager's "next page" control on the list page.
    next_button_xpath = "//div[@class='pager_container']/span[@action='next']"

    def __init__(self):
        # NOTE(review): newer Selenium 4 releases reportedly drop the
        # `executable_path` keyword in favour of a Service object -- confirm
        # against the installed Selenium version.
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.positions = []

    def run(self):
        """Crawl every list page, following the pager until it is disabled."""
        self.driver.get(self.url)
        while True:
            # Wait for the pager first so the snapshot below is taken only
            # after that element is present in the DOM.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_all_elements_located((By.XPATH, self.next_button_xpath))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            # Locate the "next page" control of the page just parsed.
            next_button = self.driver.find_element(By.XPATH, self.next_button_xpath)
            # The last page marks the control with the disabled class.
            button_classes = next_button.get_attribute("class") or ""
            if 'pager_next_disabled' in button_classes:
                break
            # The original author noted that a native next_button.click()
            # did not work here, hence the JavaScript click.
            self.driver.execute_script("arguments[0].click();", next_button)
            time.sleep(1)  # throttle: give the next page a moment and avoid crawling too fast

    def parse_list_page(self, source):
        """Visit every job posting linked from one list page.

        :param source: HTML source of a search result list page.
        """
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # throttle between detail pages

    def request_detail_page(self, url):
        """Open *url* in a new window, parse it, then return to the list window.

        The detail window is always closed and focus is always handed back to
        the list window, even when waiting for or parsing the page fails.

        :param url: absolute URL of a job detail page.
        :raises RuntimeError: if the browser did not open a new window.
        """
        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)
        # Pass the URL as a script argument instead of formatting it into the
        # JavaScript source, so quotes inside the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0]);", url)
        new_handles = [h for h in self.driver.window_handles if h not in known_handles]
        if not new_handles:
            raise RuntimeError("detail window did not open for %s" % url)
        self.driver.switch_to.window(new_handles[0])
        try:
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/h1[@class='name']"))
            )
            self.parse_detail_page(self.driver.page_source)
        finally:
            self.driver.close()  # close the detail window
            self.driver.switch_to.window(list_handle)  # back to the list page

    @staticmethod
    def _first_text(node, xpath):
        """Return the first result of *xpath* evaluated on *node*, or ''."""
        texts = node.xpath(xpath)
        return texts[0] if texts else ""

    @staticmethod
    def _strip_separators(text):
        """Remove all whitespace and '/' separators, e.g. ' /深圳 /' -> '深圳'."""
        return re.sub(r"[\s/]", "", text)

    def parse_detail_page(self, source):
        """Extract one posting from a detail page and append it to ``self.positions``.

        Fields whose element is missing from the page are recorded as empty
        strings instead of aborting the crawl.

        :param source: HTML source of a job detail page.
        """
        html = etree.HTML(source)
        # Job title.
        position_name = self._first_text(html, "//h1[@class='name']//text()")
        # The job_request spans hold salary, city, experience and education,
        # in that order; pad with '' when the page has fewer than four.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        fields = [
            self._first_text(span, './/text()').strip()
            for span in job_request_spans[:4]
        ]
        fields += [""] * (4 - len(fields))
        salary, city, work_years, education = fields
        # These three are rendered like "/深圳 /"; drop slashes and whitespace.
        city = self._strip_separators(city)
        work_years = self._strip_separators(work_years)
        education = self._strip_separators(education)
        # Company name.
        company = self._first_text(html, "//h3[@class='fl']//em/text()").strip()
        # Job description: join all text nodes into a single string.
        job_desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()

        # Bundle the posting into one dict.
        position = {
            'name': position_name,
            'company': company,
            'salary': salary,
            'city': city,
            'work_year': work_years,
            'education': education,
            'job_desc': job_desc
        }
        self.positions.append(position)
        print(position)
        print('-'*20)

def main():
    """Run the Lagou crawl and always shut the browser down afterwards."""
    s = LagouSpider()
    try:
        s.run()
    finally:
        # Quit the driver even when the crawl fails, so the browser it
        # started is not left running.
        s.driver.quit()


if __name__ == '__main__':
    main()
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
WebMagic是Java语言编写的一款爬虫框架,它基于Jsoup解析器实现,可以轻松地爬取各种网站上的数据。下面是使用WebMagic爬取拉勾网招聘数据的方法:

1. 首先,需要导入WebMagic相关的依赖包,可以在pom.xml文件中添加以下代码:

```
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-selenium</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-firefox-driver</artifactId>
    <version>2.53.1</version>
</dependency>
```

2. 创建一个Java类,用于定义需要爬取的数据结构。例如,我们可以定义一个Job类,用于存储招聘信息:

```
public class Job {
    private String name; //职位名称
    private String salary; //薪资范围
    private String company; //公司名称
    private String location; //工作地点
    private String experience; //工作经验要求
    private String education; //学历要求
    private String label; //职位标签
    //省略setter和getter方法
}
```

3. 创建一个实现PageProcessor接口的Java类,用于定义爬虫的逻辑。例如,我们可以定义一个LaGouProcessor类,用于爬取拉勾网的招聘信息:

```
public class LaGouProcessor implements PageProcessor {
    private Site site = Site.me()
            .setRetryTimes(3) //重试次数
            .setSleepTime(1000) //访问间隔
            .setUserAgent(UserAgentUtil.getRandomUserAgent()); //随机User-Agent

    @Override
    public void process(Page page) {
        List<Job> jobList = new ArrayList<>();
        List<String> jobUrls = page.getHtml().links().regex("https://www.lagou.com/jobs/\\d+.html").all();
        for (String jobUrl : jobUrls) {
            page.addTargetRequest(jobUrl); //添加详情页链接到抓取队列
        }
        if (page.getUrl().regex("https://www.lagou.com/jobs/\\d+.html").match()) { //详情页
            Job job = new Job();
            job.setName(page.getHtml().xpath("//div[@class='job-name']/span/text()").get());
            job.setSalary(page.getHtml().xpath("//dd[@class='job_request']/h3/span[1]/text()").get());
            job.setCompany(page.getHtml().xpath("//div[@class='company']/text()").get());
            job.setLocation(page.getHtml().xpath("//dd[@class='job_request']/p/span[2]/text()").get());
            job.setExperience(page.getHtml().xpath("//dd[@class='job_request']/p/span[3]/text()").get());
            job.setEducation(page.getHtml().xpath("//dd[@class='job_request']/p/span[4]/text()").get());
            job.setLabel(StringUtils.join(page.getHtml().xpath("//dd[@class='job_request']/ul/li/span/text()")
                    .all(), ",")); //多个标签以逗号分隔
            jobList.add(job);
        }
        page.putField("jobList", jobList);
    }

    @Override
    public Site getSite() {
        return site;
    }
}
```

4. 在main方法中,创建一个Spider对象,并设置需要爬取的初始链接、定义的PageProcessor对象和线程数等参数。例如,我们可以设置以下参数:

```
public static void main(String[] args) {
    Spider.create(new LaGouProcessor())
            .addUrl("https://www.lagou.com/zhaopin/Java/?labelWords=label")
            .thread(5)
            .run();
}
```

5. 运行程序,即可爬取拉勾网的招聘信息。可以通过page.putField方法将爬取到的数据存储到Map中,然后可以将Map写入文件或者数据库中。

```
public static void main(String[] args) {
    Spider.create(new LaGouProcessor())
            .addUrl("https://www.lagou.com/zhaopin/Java/?labelWords=label")
            .thread(5)
            .run();
}
```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值