Web Scraping in Practice: Crawling Lagou with Plain Methods (1)

The script below queries Lagou's positionAjax.json interface page by page, using a requests.Session to pick up fresh cookies from the job-list page before each POST, and then parses each job's detail page with lxml XPath.

import requests
import time
from lxml import etree
import re
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
             'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
             'Accept': 'application/json, text/javascript, */*; q=0.01'
    }
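
# Quick sanity check (a hypothetical helper I added, not part of the original post):
# with a realistic User-Agent and the list-page Referer, the job-list page should come
# back as ordinary HTML. Call check_headers() by hand if requests start failing.
def check_headers():
    resp = requests.get(
        'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        headers=headers, timeout=3)
    print(resp.status_code, len(resp.text))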

def get_url():
    url_start = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"


    # An earlier attempt hard-coded the browser's Cookie string here (now removed);
    # instead, fresh cookies are fetched on every round via requests.Session() below.
    data={'first':'true',
          'pn':'1',
          'kd':'python'}
    for i in range(1,5):  # pages 1-4 of the search results
        data['pn']=str(i)
        s = requests.Session()
        s.get(url_start, headers=headers, timeout=3)  # hit the list page first to pick up fresh cookies
        cookie = s.cookies  # the cookies issued for this round
        response = s.post(url_parse, data=data, headers=headers, cookies=cookie, timeout=3)  # query the Ajax endpoint with them
        time.sleep(3)
        result=response.json()
        positions=result['content']['positionResult']['result']
        for position in positions:
            position_id=position['positionId']
            url='https://www.lagou.com/jobs/%d.html?show=d9ba2bf5a5c14b838db35a714a40f123'%position_id
            parse_url(url)
            break  # debug: stop after the first position on this page
        break  # debug: stop after the first page
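
# Defensive variation (an assumption from experience, not guaranteed by Lagou's API):
# when the session cookies go stale, the endpoint tends to return a JSON body without a
# 'content' key (just a 'msg' explaining the block). get_positions_safely() is a
# hypothetical helper, not part of the original script, that keeps the loop from crashing.
def get_positions_safely(response):
    """Return the list of positions, or an empty list if the request was rejected."""
    result = response.json()
    content = result.get('content')
    if not content:
        print('request rejected:', result.get('msg'))
        return []
    return content.get('positionResult', {}).get('result', [])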


def parse_url(url):
    #print(url)
    response=requests.get(url,headers=headers)
    text =response.text
    #print(text)
    html=etree.HTML(text)
    position_name=html.xpath("//h1[@class='name']/text()")[0]
    job_request_spans=html.xpath("//dd[@class='job_request']//span")  # salary / city / experience / education spans
    salary=job_request_spans[0].xpath('.//text()')[0].strip()
    city=job_request_spans[1].xpath('.//text()')[0].strip()
    city=re.sub(r"[\s/]",'',city)
    experience = job_request_spans[2].xpath('.//text()')[0].strip()
    experience = re.sub(r"[\s/]", '', experience)
    education = job_request_spans[3].xpath('.//text()')[0].strip()
    education = re.sub(r"[\s/]", '', education)
    desc="".join(html.xpath("//dd[@class='job_bt']//text()")).strip()

    print(desc)
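
# Variation (my own sketch, not in the original post): return the parsed fields as a dict
# instead of printing them, so callers can collect and store every position.
# parse_position() and its field names are illustrative choices only.
def parse_position(url):
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    spans = html.xpath("//dd[@class='job_request']//span")
    clean = lambda s: re.sub(r"[\s/]", '', s.strip())
    return {
        'name': html.xpath("//h1[@class='name']/text()")[0],
        'salary': spans[0].xpath('.//text()')[0].strip(),
        'city': clean(spans[1].xpath('.//text()')[0]),
        'experience': clean(spans[2].xpath('.//text()')[0]),
        'education': clean(spans[3].xpath('.//text()')[0]),
        'desc': "".join(html.xpath("//dd[@class='job_bt']//text()")).strip(),
    }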




def main():
    pass  # placeholder; the script calls get_url() directly below

if __name__=='__main__':
    get_url()
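
As a follow-up, one way to persist the results (my own addition, assuming the parse_position() variation above; the file name and field list are arbitrary choices) is the standard csv module:

import csv

def save_positions(positions, path='lagou_python.csv'):
    """Write a list of position dicts to a UTF-8 CSV file."""
    fields = ['name', 'salary', 'city', 'experience', 'education', 'desc']
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(positions)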

 
