selenium 爬取拉勾网

因为是用 selenium 爬取的,这里就不多说了;想说的都写在代码里面的注释中。
新手上路,不懂的地方还希望各位大佬多多给建议。
from selenium import webdriver
import time
from lxml import etree
import re
import pymysql
class LagouSpider(object):
    """Crawl Python job postings from lagou.com with Selenium and save them to MySQL.

    Flow: open the listing page, visit each job's detail page in a second
    browser tab, extract the fields with lxml/XPath, and insert one row per
    job into the pre-created ``lianxi.lagou_job`` table.
    """

    # Path to the local ChromeDriver binary used to launch Chrome.
    driver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'

    # Characters stripped from every scraped text fragment: space, '.', '/',
    # ',' and newline. The original duplicated this character class inline
    # six times (with '\n' only for the company field); hoisted and compiled
    # once here.
    _JUNK_RE = re.compile(r'[ ./,\n]')

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # Listing page of Python jobs to start crawling from.
        self.url = 'https://www.lagou.com/zhaopin/Python/?labelWords=label'

    def run(self):
        """Open the listing page and crawl two pages of results."""
        self.driver.get(self.url)
        # Close the advertising popup that covers the listing page.
        self.driver.find_element_by_xpath('/html/body/div[8]/div/div[2]').click()
        time.sleep(2)
        for _ in range(2):
            # page_source returns the rendered HTML of the current page.
            source = self.driver.page_source
            self.parse_list_page(source)
            time.sleep(1)
            # Scroll down so the "next page" button is rendered and clickable.
            self.driver.execute_script('window.scrollBy(0,2500)')
            time.sleep(2)
            # Advance to the next listing page.
            next_page = self.driver.find_element_by_partial_link_text('下一页')
            next_page.click()
            time.sleep(5)

    def parse_list_page(self, source):
        """Extract each job's detail-page URL from a listing page and visit it.

        (The original wrapped this in a no-op ``for page in range(1)`` loop,
        removed here.)
        """
        html = etree.HTML(source)
        links = html.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
        for link in links:
            self.request_detail_page(link)

    def request_detail_page(self, url):
        """Open *url* in a new tab, scrape it, then return to the listing tab.

        At most two tabs exist at any time: handle [0] is the listing page,
        handle [1] the detail page.
        """
        self.driver.execute_script('window.open("%s")' % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        time.sleep(2)
        self.parse_detail_page(self.driver.page_source)
        time.sleep(2)
        # Close the detail tab and switch back to the listing tab.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    @classmethod
    def _clean(cls, fragments):
        """Join XPath text fragments and strip the junk characters."""
        return cls._JUNK_RE.sub('', ''.join(fragments))

    def parse_detail_page(self, source):
        """Extract the job fields from a detail page and persist them."""
        html = etree.HTML(source)
        # Job title
        job_title = self._clean(html.xpath('//div[@class="job-name"]/h1/text()'))
        print(job_title)
        # Salary
        salary = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[1]/text()'))
        # Work location
        workplace = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[2]/text()'))
        # Required experience
        work_experience = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[3]/text()'))
        # Required education
        education_background = self._clean(html.xpath('//dd[@class="job_request"]/h3/span[4]/text()'))
        # Company name
        company = self._clean(html.xpath('//div[@class="content_r"]/dl/dt/a/div/h3/em/text()'))
        self._save_row((job_title, salary, workplace, work_experience,
                        education_background, company))

    def _save_row(self, row):
        """Insert one job record into the ``lagou_job`` table.

        Rolls back on database errors and always closes the connection
        (the original leaked the connection on failure and used a bare
        ``except:`` that swallowed every exception).
        """
        conn = pymysql.connect(
            user='root',
            passwd='root',
            # The 'lianxi' database must be created beforehand.
            database='lianxi',
            charset='utf8')
        print('已链接上MySQL')
        try:
            cursor = conn.cursor()
            # The lagou_job table must also be created beforehand.
            sql = ("INSERT INTO lagou_job (岗位,工资,工作地点,工作经验,学历,公司)" "VALUES(%s,%s,%s,%s,%s,%s)")
            print("数据开始写入..............")
            cursor.execute(sql, row)
            conn.commit()
            print('写入成功')
        except pymysql.Error:
            print("写入数据失败")
            conn.rollback()
        finally:
            conn.close()
if __name__ == '__main__':
    # Script entry point: build the crawler and start it.
    LagouSpider().run()