Python 使用 Selenium 爬取拉勾网 Python 职位信息(爬虫)

class LaGoSpider(object):
    """Scrape Python job postings from lagou.com with a headless Chrome driver.

    For each city, the spider walks through the paginated listing, opens every
    position's detail page in a second tab, extracts the fields into a dict,
    and rewrites ``lagou_quanguo.csv`` with everything collected so far.
    """

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # Disable image loading to speed up page rendering.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2})
        # NOTE(review): hard-coded local chromedriver path — adjust per machine.
        self.driver = webdriver.Chrome(
            r'D:\外安装软件\selenium1\chromedriver_win32\chromedriver.exe',
            options=options)
        # Accumulates one dict per position; flushed to CSV after every parse.
        self.data_list = []

    def address_url(self):
        """Build the listing URL for each city and paginate through results."""
        self.citys = ['全国', '北京', '深圳', '广州', '杭州', '成都',
                      '南京', '上海', '厦门', '西安', '长沙']
        self.baseurl = 'https://www.lagou.com/jobs/list_python?px=default&city={}'
        for self.city in self.citys:
            self.url = self.baseurl.format(quote(self.city))
            self.driver.get(self.url)
            print('正在爬取<%s>' % self.city)
            while True:
                source = self.driver.page_source
                self.position_url_parse(source)
                # BUG FIX: the original exact-match XPath @class="pager_next "
                # cannot match the button once the site appends the disabled
                # class on the last page; match by substring instead.
                next_page = self.driver.find_element_by_xpath(
                    '//span[contains(@class, "pager_next")]')
                # BUG FIX: the original tested whether the literal string
                # 'contains(class, "pager_next")' occurred in the class
                # attribute — impossible, so the last-page check never fired
                # and the crawler clicked "next" forever.  Lagou marks the
                # disabled next-button with the pager_next_disabled class.
                if 'pager_next_disabled' in next_page.get_attribute('class'):
                    print('<%s爬取完毕>' % self.city)
                    break
                else:
                    # Click via JS: the element may be covered by overlays.
                    self.driver.execute_script("arguments[0].click()", next_page)
                    print('----------------爬取下一页--------------')
                    time.sleep(random.randint(3, 5))

    def position_url_parse(self, source):
        """Extract each position's detail-page URL from one listing page."""
        html = etree.HTML(source)
        lis = html.xpath('//ul[@class="item_con_list"]//li')
        for li in lis:
            position_url = li.xpath('.//a[@class="position_link"]//@href')[0]
            self.request_urls(position_url)
            # Random delay between detail requests to avoid anti-crawl bans.
            time.sleep(random.randint(1, 3))

    def request_urls(self, list_url):
        """Open a detail page in a new tab, parse it, then close the tab.

        :param list_url: absolute URL of one position's detail page.
        """
        self.driver.execute_script('window.open("%s")' % list_url)
        # FIX: switch_to_window() was removed in Selenium 4;
        # driver.switch_to.window() works in both Selenium 3 and 4.
        self.driver.switch_to.window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.parse_position(source)
        time.sleep(random.randint(1, 3))
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        time.sleep(random.randint(1, 3))

    def parse_position(self, source):
        """Extract the detail fields of one position and persist the data.

        :param source: HTML source of the detail page.
        """
        self.data = {}
        html = etree.HTML(source)
        company = html.xpath('//dl[@class="job_company"]/dt/a/img/@alt')[0]
        print(company)
        self.data['公司'] = company
        name = html.xpath(
            '//div[@class="position-content-l"]//span[@class="name"]/text()')[0]
        self.data['名称'] = name
        salary = html.xpath(
            '//dd[@class="job_request"]/p[1]/span[1][@class="salary"]/text()')[0]
        self.data['薪资'] = salary
        city = ''.join(html.xpath(
            '//dd[@class="job_request"]/p[1]/span[2]/text()')[0]).replace('/', '')
        self.data['城市'] = city
        jinyan = ''.join(html.xpath(
            '//dd[@class="job_request"]/p[1]/span[3]/text()')[0]).replace('/', '')
        self.data['经验'] = jinyan
        xueli = ''.join(html.xpath(
            '//dd[@class="job_request"]/p[1]/span[4]/text()')[0]).replace('/', '')
        self.data['学历'] = xueli
        zhihuo = html.xpath('//*[@id="job_detail"]/dd[1]/p/text()')[0]
        self.data['职位诱惑'] = zhihuo
        # Join all description paragraphs, then strip boilerplate headings.
        zhimiao = ''.join(
            html.xpath('//div[@class="job-detail"]//p//text()')
        ).replace('岗位职责: ', '').replace('岗位要求:', '') \
         .replace('岗位职责:', '').replace('工作职责:', '') \
         .replace('项目背景:', '').replace('-', '').strip()
        self.data['职位描述'] = zhimiao
        self.data_list.append(self.data)
        self.csv_()

    def csv_(self):
        """Rewrite the CSV with all rows collected so far.

        The whole file is rewritten on every call so a crash mid-crawl
        still leaves the data gathered up to that point on disk.
        """
        header = ['公司', '名称', '薪资', '城市', '经验', '学历', '职位诱惑', '职位描述']
        with open('lagou_quanguo.csv', 'w', encoding='utf-8', newline='') as fb:
            writer = csv.DictWriter(fb, header)
            writer.writeheader()
            writer.writerows(self.data_list)

if __name__ == '__main__':
    # Script entry point: create the spider and start crawling all cities.
    spider = LaGoSpider()
    spider.address_url()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值