实战-selenium完美实现拉勾信息爬取-1(Requests)

Requests页面解析(练习)

点击查看Selenium实现拉勾网信息爬取

import requests
from lxml import etree
from h_selenium.lagou_headers import HEADERS
import re
from time import sleep
from random import randint


def request_list_url(keyword='python', page=1, timeout=10):
    """Fetch one page of Lagou search results and return the position ids.

    The request targets the ``positionAjax.json`` endpoint with the city
    fixed in the query string (``%E5%8C%97%E4%BA%AC`` is the URL-encoded
    form of 北京).

    Args:
        keyword: Search keyword, sent as the ``kd`` form field.
        page: Result page number, sent as the ``pn`` form field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        A list holding the ``positionId`` of every entry found under
        ``content.positionResult.result`` in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout``.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON payload lacks the expected structure.
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
    data = {'first': 'false', 'pn': str(page), 'kd': keyword}
    resp = requests.post(url=url, data=data, headers=HEADERS, timeout=timeout)
    # Fail here with a clear HTTP error instead of a confusing JSON/KeyError
    # further down when the server returned an error page.
    resp.raise_for_status()
    result_dict = resp.json()
    positions = result_dict['content']['positionResult']['result']
    return [position['positionId'] for position in positions]


def positions_html(url_id, timeout=10):
    """Download a position detail page and parse it into an lxml tree.

    Args:
        url_id: Position id interpolated into the
            ``https://www.lagou.com/jobs/<id>.html`` URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        The root element produced by ``etree.HTML`` for the page text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    url = 'https://www.lagou.com/jobs/{}.html'.format(url_id)
    resp = requests.get(url=url, headers=HEADERS, timeout=timeout)
    # Stop on HTTP errors rather than parsing an error page that would only
    # fail later with missing elements.
    resp.raise_for_status()
    return etree.HTML(resp.text)


def data_massage(html):
    """Extract the main fields of a position page and print them.

    ``html`` is queried through its ``cssselect`` and ``xpath`` methods.
    The function writes the title, the four summary fields, the perks, the
    description and the address to stdout and returns nothing.

    Raises:
        IndexError: If the title or perks selector matches no element.
        ValueError: If the summary block does not yield exactly four fields
            after its last span is discarded.
    """
    thin_rule = '-' * 30

    def squash(raw):
        # Remove every whitespace character and every '/' separator.
        return re.sub(r'[\s /]', '', raw).strip()

    # Job title.
    title = html.cssselect('span.name')[0].text

    # Summary spans of the job request block; the last span is dropped and
    # the remaining four are salary, city, experience and education.
    summary = []
    for span in html.cssselect('dd.job_request p span'):
        summary.append(squash(span.text))
    salary, city, job_years, edu = summary[:-1]

    print(title)
    print(salary, city, job_years, edu)
    print(thin_rule)

    # Job perks.
    advantage = html.cssselect('dd.job-advantage p')[0].text

    # Job description: all paragraph text nodes glued together, then squashed.
    description = squash(
        ''.join(html.xpath('//dd[@class="job_bt"]//div//p/text()'))
    )

    # Work address: the trailing two text nodes are discarded before joining.
    address_parts = html.xpath('//div[@class="work_addr"]//text()')[:-2]
    address = squash(''.join(address_parts))

    for chunk in (advantage, thin_rule, description, thin_rule, address,
                  '=' * 60 + '\n'):
        print(chunk)


def main():
    """Crawl every position returned by the search and print its details.

    For each position id the detail page is downloaded, parsed and printed,
    followed by a random pause of 3 to 9 seconds before the next request.
    """
    for position_id in request_list_url():
        page_tree = positions_html(position_id)
        data_massage(page_tree)
        # Random delay between requests.
        pause_seconds = randint(3, 9)
        sleep(pause_seconds)


# Start the crawl only when this file is executed directly as a script;
# importing it as a module does not trigger any request.
if __name__ == '__main__':
    main()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

pylemon

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值