Python使用正则爬取51job

为了更快捷,可以使用多线程;下面的示例代码为单线程版本,按页码顺序逐页爬取。

import requests
import re


def get_request(page, timeout=10):
    """Fetch one page of 51job search results and return its HTML text.

    Args:
        page: Page number; it is converted with ``str()`` and spliced into
            the search URL just before ``.html``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text, decoded with the encoding that requests
        detects from the content (``apparent_encoding``).

    Raises:
        requests.RequestException: On connection errors, timeouts, or when
            the server answers with a 4xx/5xx status.
    """
    url = "https://search.51job.com/list/190200,000000,0000,00,9,99,%25E5%25B0%258F%25E7%25A8%258B%25E5%25BA%258F,2,"+str(page)+".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    headers = {
        # NOTE(review): this value looks like a placeholder; replace it with
        # a real browser User-Agent string. A non-Latin-1 header value is
        # likely to be rejected when the request is sent -- confirm.
        "User-Agent":"头部信息"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as results.
    res.raise_for_status()
    # Trust the encoding sniffed from the body rather than the HTTP header.
    res.encoding = res.apparent_encoding
    return res.text

def get_content(respon):
    """Extract job listings from a result page's HTML and print each one.

    Every match of the listing pattern yields six captured fields, in order:
    link, position title, company, address, wage and publish date. Only the
    position title is stripped of surrounding whitespace before printing.

    Args:
        respon: HTML text of one search-result page.

    Returns:
        None. One line is printed to stdout per listing found.
    """
    pattern = re.compile(r'<div class="el">.*?<a.*?href="(.*?)".*?onmousedown="">(.*?)</a>.*?<span class="t2">.*?">(.*?)</a>.*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>',re.S)
    # findall returns one 6-tuple per listing; unpack it straight into names.
    for link, raw_title, company, address, wage, posted in pattern.findall(respon):
        title = raw_title.strip()
        print("职位链接:",link,"职位名:",title,"公司名:",company,"工作地址:",address,"工资:",wage,"发布时间:",posted)

def main(page):
    """Crawl one result page and print a success or failure banner.

    The page is fetched with ``get_request`` and its listings are printed by
    ``get_content``. Any exception raised by either step is caught here so
    that a single bad page does not stop the caller; the exception is
    reported in the failure banner rather than discarded.

    Args:
        page: Page number to crawl.
    """
    banner = "*"*50
    try:
        respon = get_request(page)
        get_content(respon)
    except Exception as e:
        # Per-page boundary: report the cause, then let the caller continue.
        print(banner,"\n\n")
        print("第",page,"页爬取失败","\n\n")
        print("失败原因:",repr(e))
        print(banner)
    else:
        print(banner,"\n\n")
        print("第",page,"页爬取成功","\n\n")
        print(banner)

if __name__ == '__main__':
    # Crawl result pages 1 through 7 in order, one page per main() call.
    for page in range(1, 7 + 1):
        main(page)

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值