简易爬虫，selenium的webdriver爬取上海python岗位信息

最新推荐文章于 2022-05-29 15:51:18 发布

baibai_-

最新推荐文章于 2022-05-29 15:51:18 发布

阅读量251

点赞数 2

本文链接：https://blog.csdn.net/qq_42952532/article/details/84872254

版权

python实现
使用 selenium 模拟人的操作：用谷歌浏览器打开拉勾网，输入 python 并点击“上海”按钮，爬取前 25 页的简要数据，存储在 txt 文本里。（新手，请见谅）
（2018/12/19更新实现保存在数据库里）

import time
from selenium import webdriver
from scrapy.selector import Selector
from selenium.webdriver.support.wait import WebDriverWait

# Crawl the first result pages of lagou.com for "python" jobs in Shanghai and
# append one line per posting to a text file.
#
# NOTE: executable_path= and find_element(s)_by_css_selector are Selenium 3
# APIs that later Selenium 4 releases removed; run this with selenium<4 or
# port the calls to Service(...) / find_element(By.CSS_SELECTOR, ...).

# Location of the chromedriver binary (downloaded separately); adjust per machine.
CHROMEDRIVER_PATH = 'D:/Users/bai_scrapy/chromedriver_win32/chromedriver.exe'
START_URL = 'https://www.lagou.com/zhaopin/'
OUTPUT_FILE = 'lagou_job111.txt'
PAGE_COUNT = 25        # number of result pages to crawl
JOBS_PER_PAGE = 15     # a full result page lists 15 postings
ELEMENT_TIMEOUT = 10   # max seconds to wait for an element to appear in the DOM
PAGE_PAUSE = 1.5       # seconds to let the result list reload after a click
FIELD_SEPARATOR = '-.-'
# CSS prefix selecting the j-th posting (<li>) of the result list.
ROW_SELECTOR = '#s_position_list ul li:nth-child(%s)'


def read_job(driver, index):
    """Return the text fields of the posting at 1-based position *index*.

    The result is a tuple ``(job, area, company, requirements)``, or ``None``
    when the current page has no posting at that position (a short page).
    """
    row = ROW_SELECTOR % index
    # find_elements_* returns an empty list instead of raising when nothing
    # matches, which lets a short page end cleanly.
    titles = driver.find_elements_by_css_selector(row + ' a h3')
    if not titles:
        return None
    return (
        titles[0].text,                                                    # job title
        driver.find_element_by_css_selector(row + ' .add em').text,         # district, e.g. 浦东新区
        driver.find_element_by_css_selector(row + ' .company_name a').text, # company name
        driver.find_element_by_css_selector(row + ' .p_bot div').text,      # experience / salary / education
    )


web = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
try:
    web.get(START_URL)

    # Constructing WebDriverWait alone never blocks; the waiting happens in
    # .until(), which polls until the callable returns a truthy value or the
    # timeout expires.
    wait = WebDriverWait(web, ELEMENT_TIMEOUT)

    # Type the keyword into the search box.
    web_search = wait.until(
        lambda drv: drv.find_element_by_css_selector('#keyword'))
    web_search.send_keys('python')

    # Second link of the city list: the "上海" (Shanghai) button.
    web_search_button = wait.until(
        lambda drv: drv.find_element_by_css_selector('.city-wrapper.dn a:nth-child(2)'))
    web_search_button.click()
    time.sleep(PAGE_PAUSE)

    # Open the output once for the whole crawl. newline='' keeps the explicit
    # '\r' / '\r\n' below from being translated, so the bytes on disk match
    # what earlier runs of this script produced.
    with open(OUTPUT_FILE, 'a', encoding='utf-8', newline='') as f:
        for i in range(1, PAGE_COUNT + 1):
            print('正在爬取第%s页数据'%i)
            for j in range(1, JOBS_PER_PAGE + 1):
                time.sleep(0.05)
                job = read_job(web, j)
                if job is None:
                    break
                # One posting per line, fields separated by '-.-'.
                f.write('\r' + FIELD_SEPARATOR.join(job) + '\r\n')

            # No need to page forward once the last wanted page is stored.
            if i < PAGE_COUNT:
                # The last <span> of the pager is the "next page" button.
                next_page_button = web.find_element_by_css_selector(
                    '.pager_container span:last-child')
                next_page_button.click()
                time.sleep(PAGE_PAUSE)
finally:
    # Always shut down the browser and the chromedriver process, even on errors.
    web.quit()

在这里插入图片描述
运行成功后，txt 文本内的内容如图所示。下一步可以把 txt 转成 csv 再做数据分析，也可以在爬取时直接写入 csv 文件。暂时先写到这里。

↓2018/12/19↓更新

# -*- coding: utf-8 -*-
import time
import pymysql
from selenium import webdriver
from scrapy.selector import Selector
from selenium.webdriver.support.wait import WebDriverWait


# Crawl the first result pages of lagou.com for "python" jobs in Shanghai and
# insert one row per posting into the MySQL table python_job_lagou.
#
# NOTE: executable_path= and find_element(s)_by_css_selector are Selenium 3
# APIs that later Selenium 4 releases removed; run this with selenium<4 or
# port the calls to Service(...) / find_element(By.CSS_SELECTOR, ...).

# Location of the chromedriver binary (downloaded separately); adjust per machine.
CHROMEDRIVER_PATH = 'D:/Users/bai_scrapy/chromedriver_win32/chromedriver.exe'
START_URL = 'https://www.lagou.com/zhaopin/'
PAGE_COUNT = 25        # number of result pages to crawl
JOBS_PER_PAGE = 15     # a full result page lists 15 postings
ELEMENT_TIMEOUT = 10   # max seconds to wait for an element to appear in the DOM
PAGE_PAUSE = 1.5       # seconds to let the result list reload after a click
# CSS prefix selecting the j-th posting (<li>) of the result list.
ROW_SELECTOR = '#s_position_list ul li:nth-child(%s)'
# Parameterized insert; the driver escapes the values bound to each %s.
INSERT_SQL = ('insert into python_job_lagou(web_job,web_job_area,web_job_company,web_job_suffer) '
              'values(%s,%s,%s,%s)')


def read_job(driver, index):
    """Return the text fields of the posting at 1-based position *index*.

    The result is a tuple ``(job, area, company, requirements)``, or ``None``
    when the current page has no posting at that position (a short page).
    """
    row = ROW_SELECTOR % index
    # find_elements_* returns an empty list instead of raising when nothing
    # matches, which lets a short page end cleanly.
    titles = driver.find_elements_by_css_selector(row + ' a h3')
    if not titles:
        return None
    return (
        titles[0].text,                                                    # job title
        driver.find_element_by_css_selector(row + ' .add em').text,         # district, e.g. 浦东新区
        driver.find_element_by_css_selector(row + ' .company_name a').text, # company name
        driver.find_element_by_css_selector(row + ' .p_bot div').text,      # experience / salary / education
    )


def crawl_pages(driver, conn, cursor):
    """Store every posting of the first PAGE_COUNT result pages in the database."""
    for i in range(1, PAGE_COUNT + 1):
        print('正在爬取第%s页数据'%i)
        for j in range(1, JOBS_PER_PAGE + 1):
            time.sleep(0.05)
            value_str = read_job(driver, j)
            if value_str is None:
                break
            cursor.execute(INSERT_SQL, value_str)
            # Commit per row so postings stored so far survive a later failure.
            conn.commit()

        # No need to page forward once the last wanted page is stored.
        if i < PAGE_COUNT:
            # The last <span> of the pager is the "next page" button.
            next_page_button = driver.find_element_by_css_selector(
                '.pager_container span:last-child')
            next_page_button.click()
            time.sleep(PAGE_PAUSE)


web = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
bai_conn = None
bai_cursor = None
try:
    # Connection settings: host, port, database name, credentials, charset.
    # Replace the placeholder values with your own.
    bai_conn = pymysql.connect(host="你的ip", port=3306, db="你的db名",
                               user="你的账号", passwd="你的密码", charset="utf8")
    bai_cursor = bai_conn.cursor()

    web.get(START_URL)

    # Constructing WebDriverWait alone never blocks; the waiting happens in
    # .until(), which polls until the callable returns a truthy value or the
    # timeout expires.
    wait = WebDriverWait(web, ELEMENT_TIMEOUT)

    # Type the keyword into the search box.
    web_search = wait.until(
        lambda drv: drv.find_element_by_css_selector('#keyword'))
    web_search.send_keys('python')

    # Second link of the city list: the "上海" (Shanghai) button.
    web_search_button = wait.until(
        lambda drv: drv.find_element_by_css_selector('.city-wrapper.dn a:nth-child(2)'))
    web_search_button.click()
    time.sleep(PAGE_PAUSE)

    try:
        crawl_pages(web, bai_conn, bai_cursor)
    except Exception as log:
        # Best-effort crawl: report the failure and keep the rows already committed.
        print('log',log)
    finally:
        print("程序执行完毕-------------------------------")
finally:
    # Release the database handles and the browser on every exit path,
    # not only after a fully successful crawl.
    try:
        if bai_cursor is not None:
            bai_cursor.close()
        if bai_conn is not None:
            bai_conn.close()
    finally:
        web.quit()

在这里插入图片描述
数据库内容如图所示

baibai_-

关注

2
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
简易爬虫，selenium的webdriver爬取上海python岗位信息

python实现selenium模拟人的行为使用谷歌浏览器打开拉勾网，输入python并点击上海按钮，爬取前25页简易数据，存储在txt文本里，（新手，请见谅）import timefrom selenium import webdriverfrom scrapy.selector import Selectorfrom selenium.webdriver.support.wait i...
复制链接

扫一扫