# urllib案例实战二:boss职位信息简单获取(re模块使用)

import re
from urllib import request,error
import time

# Site root that the relative company links found in the page are joined onto.
BOSS_BASE_URL = "https://www.zhipin.com"

# Search-results URL requested by the script.
SEARCH_URL = "https://www.zhipin.com/job_detail/?query=%E9%85%8D%E7%BD%AE%E7%AE%A1%E7%90%86&city=101270100&industry=&position="

# Browser-like User-Agent sent with the request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}

# Local copy of the results page that is actually parsed.
SAVED_PAGE_PATH = 'a.html'

# (relative company link, company name)
COMPANY_LINK_RE = re.compile(
    r'<h3 class="name"><a href="(.*?)" title=".*?" ka=".*?" target="_blank">(.*?)</a></h3>'
)
# The two "vline"-separated fields that follow the company's false-link anchor;
# the script labels them "是否上市" and "公司人数".
COMPANY_FACTS_RE = re.compile(
    r'<p><a href=".*?" class="false-link" target="_blank" ka=".*?" title=".*?">.*?</a>'
    r'<em class="vline"></em>(.*?)<em class="vline"></em>(.*?)</p>'
)
# Text of every <span class="red">; the script labels it "xinzhi".
SALARY_RE = re.compile(r'<span class="red">(.*?)</span>')
# NOTE(review): without re.MULTILINE the leading "^" anchors only at the very
# start of the page, so findall() can return at most one match here. The
# pattern is kept unchanged because the right fix depends on the layout of
# a.html -- confirm against the saved page before adding the flag.
REQUIREMENTS_RE = re.compile(r'^<p>(.*?)<em class="vline"></em>(.*?)</p>')
# Text of every <span class="job-area">.
JOB_AREA_RE = re.compile(r'<span class="job-area">(.*?)</span>')


def describe_fetch_error(exc):
    """Return the one-line message printed when the page request fails.

    ``HTTPError.code`` is an int and ``URLError.reason`` may be an exception
    object rather than a string, so the parts are formatted instead of being
    concatenated with ``+`` (which raised ``TypeError`` inside the handler).
    """
    # HTTPError is a subclass of URLError, so it has to be tested first.
    if isinstance(exc, error.HTTPError):
        return f"HTTPError {exc.code}  {exc.reason}"
    if isinstance(exc, error.URLError):
        return f"URLError  {exc.reason}"
    # Anything else used to be swallowed silently; report it instead.
    return f"Error  {exc!r}"


def probe_search_page(url, headers):
    """Request *url* once, best-effort, and discard the response.

    The body is not read: the data that gets parsed comes from the saved
    page on disk. Failures are printed, never raised, so the rest of the
    script still runs when the site is unreachable.
    """
    try:
        req = request.Request(url, headers=headers)
        # The context manager closes the connection the script used to leak.
        with request.urlopen(req):
            pass
    except Exception as exc:  # top-level script boundary: report and carry on
        print(describe_fetch_error(exc))


def load_saved_page(path):
    """Return the UTF-8 decoded contents of the saved results page at *path*."""
    # Binary read + explicit decode keeps the bytes-to-text step identical to
    # the original (no newline translation); ``with`` closes the handle.
    with open(path, 'rb') as page_file:
        return page_file.read().decode('utf-8')


def report_listing(html):
    """Print every field scraped from *html* and return the company index.

    Returns:
        dict mapping company name to ``{"companyinfo": absolute company URL}``.
        A company name that appears more than once keeps its last link.
    """
    company_total = {}
    for path, name in COMPANY_LINK_RE.findall(html):
        info_url = BOSS_BASE_URL + path
        print("companyinfo:" + info_url + "   companyname:" + name)
        company_total[name] = {"companyinfo": info_url}
    print(company_total)

    for listed, headcount in COMPANY_FACTS_RE.findall(html):
        print("是否上市:" + listed + "  公司人数:" + headcount)
    # These fields are only printed, not stored, so the index is unchanged.
    print(company_total)

    for salary in SALARY_RE.findall(html):
        print("xinzhi:" + salary)

    for experience, education in REQUIREMENTS_RE.findall(html):
        print("工作年限:" + experience + "  学历要求:" + education)

    for area in JOB_AREA_RE.findall(html):
        print("工作地点:" + area)

    for name in company_total:
        print("公司名:" + name)
    return company_total


def main():
    """Probe the live search page, then report on the locally saved copy."""
    probe_search_page(SEARCH_URL, REQUEST_HEADERS)
    report_listing(load_saved_page(SAVED_PAGE_PATH))


if __name__ == '__main__':
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值