import re
from urllib import request,error
import time
# Site root; the listing markup contains site-relative hrefs.
BASE_URL = "https://www.zhipin.com"

# Search-results URL requested at start-up (the query parameter is the
# percent-encoded UTF-8 text "配置管理").
SEARCH_URL = "https://www.zhipin.com/job_detail/?query=%E9%85%8D%E7%BD%AE%E7%AE%A1%E7%90%86&city=101270100&industry=&position="

# Browser-like User-Agent sent with the request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}

# Locally saved results page that is actually parsed.
LISTING_FILE = 'a.html'

# (href, company name) from each company heading.
COMPANY_RE = re.compile(
    r'<h3 class="name"><a href="(.*?)" title=".*?" ka=".*?" target="_blank">(.*?)</a></h3>'
)
# The two fields that follow the company's category link; they are printed
# under the labels "是否上市" and "公司人数".
COMPANY_FACTS_RE = re.compile(
    r'<p><a href=".*?" class="false-link" target="_blank" ka=".*?" title=".*?">.*?</a>'
    r'<em class="vline"></em>(.*?)<em class="vline"></em>(.*?)</p>'
)
# Text of every <span class="red">, printed under the label "xinzhi".
PAY_RE = re.compile(r'<span class="red">(.*?)</span>')
# The two fields printed as "工作年限" / "学历要求".  re.MULTILINE is required:
# without it "^" anchors only at the very first character of the page, so the
# pattern could never match a <p> line further down the document.
# NOTE(review): this assumes the wanted <p> lines start at column 0 in the
# saved page -- confirm against a.html.
REQUIREMENTS_RE = re.compile(
    r'^<p>(.*?)<em class="vline"></em>(.*?)</p>', re.MULTILINE
)
# Text of every <span class="job-area">.
JOB_AREA_RE = re.compile(r'<span class="job-area">(.*?)</span>')


def describe_request_error(exc):
    """Return the one-line message reported for a failed request.

    ``HTTPError.code`` is an int and ``URLError.reason`` may be an exception
    object rather than a string, so the values are formatted instead of being
    concatenated with ``+`` (which raised ``TypeError``).
    """
    # HTTPError is a subclass of URLError, so it has to be tested first.
    if isinstance(exc, error.HTTPError):
        return "HTTPError {} {}".format(exc.code, exc.reason)
    if isinstance(exc, error.URLError):
        return "URLError {}".format(exc.reason)
    return "RequestError {!r}".format(exc)


def probe_search_page(url, headers):
    """Request *url* with *headers* and report any failure on stdout.

    The response body is not used (the listing is parsed from a saved file),
    so the response is only opened and closed again.  Failures are reported
    and never propagated, which keeps the request best-effort.
    """
    req = request.Request(url, headers=headers)
    try:
        with request.urlopen(req):
            pass
    except Exception as exc:  # best-effort boundary: report, do not crash
        print(describe_request_error(exc))


def build_report(page):
    """Return the report lines extracted from the HTML text *page*.

    The lines come out in this order: one line per company heading, the
    company index (as a dict repr), one line per company-facts match, the
    company index again, then pay, requirements and job-area lines, and
    finally one line per distinct company name.
    """
    lines = []
    company_total = {}

    for href, name in COMPANY_RE.findall(page):
        info_url = BASE_URL + href
        lines.append("companyinfo:" + info_url + " companyname:" + name)
        # A repeated company name keeps its first position, latest URL wins.
        company_total[name] = {"companyinfo": info_url}
    lines.append(str(company_total))

    for listed, headcount in COMPANY_FACTS_RE.findall(page):
        lines.append("是否上市:" + listed + " 公司人数:" + headcount)
    lines.append(str(company_total))

    lines.extend("xinzhi:" + pay for pay in PAY_RE.findall(page))

    for experience, education in REQUIREMENTS_RE.findall(page):
        lines.append("工作年限:" + experience + " 学历要求:" + education)

    lines.extend("工作地点:" + area for area in JOB_AREA_RE.findall(page))
    lines.extend("公司名:" + name for name in company_total)
    return lines


def main():
    """Probe the live search page, then print the report for the saved page."""
    probe_search_page(SEARCH_URL, REQUEST_HEADERS)
    # Read as bytes and decode explicitly so line endings reach the regexes
    # untouched; the context manager closes the file handle.
    with open(LISTING_FILE, 'rb') as fh:
        page = fh.read().decode('utf-8')
    for line in build_report(page):
        print(line)


if __name__ == '__main__':
    main()
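# urllib case study 2: simple retrieval of Boss job-listing information (using the re module)
# Source-page note: latest recommended article, published 2024-04-15 21:48:59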