urllib 案例实战二：BOSS 直聘职位信息简单获取（re 模块的使用）

最新推荐文章于 2024-04-15 21:48:59 发布

总想转行

最新推荐文章于 2024-04-15 21:48:59 发布

阅读量131

点赞数

分类专栏： python实战 urllib

本文链接：https://blog.csdn.net/qq_46020608/article/details/113120642

版权

python实战同时被 2 个专栏收录

7 篇文章 0 订阅

订阅专栏

urllib

4 篇文章 0 订阅

订阅专栏

import re
from urllib import request,error
import time

# Root that the site-relative links found in the page are joined onto.
SITE_ROOT = "https://www.zhipin.com"

# Company header: group 1 is the site-relative link, group 2 the company name.
COMPANY_PAT = re.compile(
    r'<h3 class="name"><a href="(.*?)" title=".*?" ka=".*?" target="_blank">(.*?)</a></h3>'
)
# Company facts paragraph: the two captured groups are the fields that follow
# the first and second "vline" separators (printed below as listing status and
# headcount).
COMPANY_FACTS_PAT = re.compile(
    r'<p><a href=".*?" class="false-link" target="_blank" ka=".*?" title=".*?">.*?</a>'
    r'<em class="vline"></em>(.*?)<em class="vline"></em>(.*?)</p>'
)
# Content of every <span class="red"> (printed below under the "xinzhi" label;
# presumably the salary range -- confirm against the saved page).
SALARY_PAT = re.compile(r'<span class="red">(.*?)</span>')
# Requirement paragraph that starts a line. re.MULTILINE is required: without
# it "^" only anchors at offset 0 of the whole document, so the pattern could
# never match a paragraph inside the page.
# NOTE(review): a <p> preceded by indentation is still not matched, and a
# company-facts <p> that starts a line is matched too -- verify against the
# real markup of a.html.
REQUIREMENT_PAT = re.compile(
    r'^<p>(.*?)<em class="vline"></em>(.*?)</p>', re.MULTILINE
)
# Content of every <span class="job-area">.
JOB_AREA_PAT = re.compile(r'<span class="job-area">(.*?)</span>')


def probe_listing(url, headers):
    """Request *url* once and report any failure on stdout.

    The response body is not read; the page that gets parsed comes from a
    local file. The response is closed as soon as the request succeeds.

    Args:
        url: Address of the listing page.
        headers: Mapping of HTTP request headers to send.
    """
    try:
        req = request.Request(url, headers=headers)
        # The context manager closes the connection instead of leaking it.
        with request.urlopen(req):
            pass
    except error.HTTPError as exc:
        # exc.code is an int; format it instead of concatenating it to a str.
        print(f"HTTPError {exc.code}  {exc.reason}")
    except error.URLError as exc:
        # exc.reason may be an exception object rather than a str.
        print(f"URLError  {exc.reason}")
    except Exception as exc:
        # Stay best-effort at the script boundary, but do not fail silently.
        print(f"request failed: {exc!r}")


def parse_listing(page):
    """Extract the raw field groups from the text of a listing page.

    Args:
        page: Decoded HTML text.

    Returns:
        A dict of ``re.findall`` results:
        ``companies`` -- list of (relative link, company name) tuples;
        ``company_facts`` -- list of 2-tuples from the company facts paragraph;
        ``salaries`` -- list of strings from the "red" spans;
        ``requirements`` -- list of 2-tuples from the requirement paragraph;
        ``areas`` -- list of strings from the "job-area" spans.
    """
    return {
        "companies": COMPANY_PAT.findall(page),
        "company_facts": COMPANY_FACTS_PAT.findall(page),
        "salaries": SALARY_PAT.findall(page),
        "requirements": REQUIREMENT_PAT.findall(page),
        "areas": JOB_AREA_PAT.findall(page),
    }


if __name__ == '__main__':
    url = "https://www.zhipin.com/job_detail/?query=%E9%85%8D%E7%BD%AE%E7%AE%A1%E7%90%86&city=101270100&industry=&position="
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    probe_listing(url, headers)

    # The page that is actually parsed is a copy saved next to the script.
    # Read as bytes and decode explicitly so line endings stay untouched.
    with open('a.html', 'rb') as saved_page:
        htmls = saved_page.read().decode('utf-8')
    fields = parse_listing(htmls)

    # Company name -> {"companyinfo": absolute link to the company page}.
    company_total = {}
    for href, company_name in fields["companies"]:
        info_url = SITE_ROOT + href
        print("companyinfo:" + info_url + "   companyname:" + company_name)
        company_total[company_name] = {"companyinfo": info_url}
    print(company_total)

    for listing_status, headcount in fields["company_facts"]:
        print("是否上市:" + listing_status + "  公司人数:" + headcount)
    print(company_total)

    for salary in fields["salaries"]:
        print("xinzhi:" + salary)

    for experience, education in fields["requirements"]:
        print("工作年限:" + experience + "  学历要求：" + education)

    for area in fields["areas"]:
        print("工作地点:" + area)

    for key in company_total:
        print("公司名：" + key)