selenium 爬取拉勾网

因为是用 selenium 爬取的,这里就不多说了;想说的都写在代码里面的注释中。
新手上路,不懂的地方还希望各位大佬多多给建议。
from selenium import webdriver
import time
from lxml import etree
import re
import pymysql
class LagouSpider(object):
    """Crawl Python job postings from lagou.com with Selenium and save them to MySQL.

    Flow: open the listing page, visit each job's detail page in a second
    browser tab, extract the fields with lxml/XPath, and insert one row per
    job into the pre-created ``lianxi.lagou_job`` table.
    """

    # Path to the local ChromeDriver binary used to launch Chrome.
    driver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'

    # Characters stripped from every scraped text fragment: space, '.', '/',
    # ',' and newline. The original duplicated this character class inline
    # six times (with '\n' only for the company field); hoisted and compiled
    # once here.
    _JUNK_RE = re.compile(r'[ ./,\n]')

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # Listing page of Python jobs to start crawling from.
        self.url = 'https://www.lagou.com/zhaopin/Python/?labelWords=label'

    def run(self):
        """Open the listing page and crawl two pages of results."""
        self.driver.get(self.url)
        # Close the advertising popup that covers the listing page.
        self.driver.find_element_by_xpath('/html/body/div[8]/div/div[2]').click()
        time.sleep(2)
        for _ in range(2):
            # page_source returns the rendered HTML of the current page.
            source = self.driver.page_source
            self.parse_list_page(source)
            time.sleep(1)
            # Scroll down so the "next page" button is rendered and clickable.
            self.driver.execute_script('window.scrollBy(0,2500)')
            time.sleep(2)
            # Advance to the next listing page.
            next_page = self.driver.find_element_by_partial_link_text('下一页')
            next_page.click()
            time.sleep(5)

    def parse_list_page(self, source):
        """Extract each job's detail-page URL from a listing page and visit it.

        (The original wrapped this in a no-op ``for page in range(1)`` loop,
        removed here.)
        """
        html = etree.HTML(source)
        links = html.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
        for link in links:
            self.request_detail_page(link)

    def request_detail_page(self, url):
        """Open *url* in a new tab, scrape it, then return to the listing tab.

        At most two tabs exist at any time: handle [0] is the listing page,
        handle [1] the detail page.
        """
        self.driver.execute_script('window.open("%s")' % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        time.sleep(2)
        self.parse_detail_page(self.driver.page_source)
        time.sleep(2)
        # Close the detail tab and switch back to the listing tab.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    @classmethod
    def _clean(cls, fragments):
        """Join XPath text fragments and strip the junk characters."""
        return cls._JUNK_RE.sub('', ''.join(fragments))

    def parse_detail_page(self, source):
        """Extract the job fields from a detail page and persist them."""
        html = etree.HTML(source)
        # Job title
        job_title = self._clean(html.xpath('//div[@class="job-name"]/h1/text()'))
        print(job_title)
        # Salary
        salary = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[1]/text()'))
        # Work location
        workplace = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[2]/text()'))
        # Required experience
        work_experience = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[3]/text()'))
        # Required education
        education_background = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[4]/text()'))
        # Company name
        company = self._clean(html.xpath('//div[@class="content_r"]/dl/dt/a/div/h3/em/text()'))
        self._save_row((job_title, salary, workplace, work_experience,
                        education_background, company))

    def _save_row(self, row):
        """Insert one job record into the ``lagou_job`` table.

        Rolls back on database errors and always closes the connection
        (the original leaked the connection on failure and used a bare
        ``except:`` that swallowed every exception).
        """
        conn = pymysql.connect(
            user='root',
            passwd='root',
            # The 'lianxi' database must be created beforehand.
            database='lianxi',
            charset='utf8')
        print('已链接上MySQL')
        try:
            cursor = conn.cursor()
            # The lagou_job table must also be created beforehand.
            sql = ("INSERT INTO lagou_job (岗位,工资,工作地点,工作经验,学历,公司)" "VALUES(%s,%s,%s,%s,%s,%s)")
            print("数据开始写入..............")
            cursor.execute(sql, row)
            conn.commit()
            print('写入成功')
        except pymysql.Error:
            print("写入数据失败")
            conn.rollback()
        finally:
            conn.close()
if __name__ == '__main__':
    # Script entry point: build the crawler and start it.
    LagouSpider().run()