import requests
from lxml import etree
# Fetch the HTML of a page
def getHtml(html, timeout=10):
    """Download a page and return its body decoded as GBK.

    Args:
        html: URL of the page to fetch. Despite the name this is the
            address, not markup.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely and the timeout
            handler below could never fire.

    Returns:
        The response text, or None when the request fails. A short message
        is printed for each failure category.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    try:
        response = requests.get(html, headers=headers, timeout=timeout)
    # The exception classes live in requests.exceptions; the bare names
    # ReadTimeout / RequestException are not defined in this module, and the
    # bare ConnectionError is Python's builtin, which requests' own
    # ConnectionError does not derive from.
    except requests.exceptions.ReadTimeout:
        print("time out")
    except requests.exceptions.ConnectionError:
        print("connection error")
    except requests.exceptions.RequestException:
        print("request error")
    else:
        # Force GBK decoding instead of the charset requests guessed
        # (presumably the encoding the site serves - confirm if output is garbled).
        response.encoding = 'GBK'
        return response.text
    return None
def changeString(str):
    """Return the text with every space, line feed and carriage return removed."""
    # Mapping a code point to None makes translate() drop that character.
    unwanted = dict.fromkeys(map(ord, " \n\r"))
    return str.translate(unwanted)
def getInformation(html):
    """Extract the job rows from a search-result page.

    Args:
        html: Page markup as text. None or an empty string (what a failed
            download yields) is accepted and produces no rows.

    Returns:
        A list of five-item lists, one per posting, in the column order that
        printInformation labels them. Spaces and line breaks are removed
        from every field via changeString.
    """
    if not html:
        return []
    tree = etree.HTML(html, etree.HTMLParser())
    if tree is None:
        # Defensive: nothing parseable came back from the parser.
        return []

    row = '//*[@id="resultList"]/div[*]'
    titles = tree.xpath(row + '/p/span/a/text()')
    companies = tree.xpath(row + '/span[1]/a/text()')
    locations = tree.xpath(row + '/span[2]/text()')
    salaries = tree.xpath(row + '/span[3]/text()')
    dates = tree.xpath(row + '/span[4]/text()')

    # The three plain-text span columns are consumed from index 1 onwards,
    # exactly as the previous i+1 indexing did - presumably the list's
    # header row contributes one leading entry to each of them (TODO confirm
    # against the page markup). zip() stops at the shortest column, so a row
    # with a missing field can no longer raise IndexError.
    columns = (titles, companies, locations[1:], salaries[1:], dates[1:])
    return [[changeString(field) for field in record] for record in zip(*columns)]
def printInformation(data):
    """Print the postings to stdout as a centred, tab-separated table.

    Args:
        data: Iterable of rows; items 0-4 of each row are printed under the
            headers job title, company, location, salary and publish date.
    """
    # Alignment widths count characters, but CJK glyphs usually render two
    # cells wide, so cells are padded with the fullwidth ideographic space
    # U+3000. The fill character was already being passed to format() as
    # argument 5 but the template never referenced it.
    fill = chr(12288)
    tplt = "{0:{5}^15}\t{1:{5}^20}\t{2:{5}^15}\t{3:{5}^15}\t{4:{5}^15}\t"
    print(tplt.format("职位名", "公司名", "工作地点", "薪资", "发布时间", fill))
    for row in data:
        print(tplt.format(row[0], row[1], row[2], row[3], row[4], fill))
# Save the data to a file
def store(a, path="./hahaha.txt"):
    """Write the postings to a UTF-8 text file as a centred, tab-separated table.

    Args:
        a: Iterable of rows; items 0-4 of each row are written under the
            headers job title, company, location, salary and publish date.
        path: Destination file; it is overwritten. Defaults to the
            historical location ``./hahaha.txt``.

    Returns:
        None.
    """
    # Pad with the fullwidth ideographic space U+3000 so CJK text, which
    # usually renders two cells wide, stays aligned. The fill character was
    # already passed to format() as argument 5 but never used by the template.
    fill = chr(12288)
    tplt = "{0:{5}^15}\t{1:{5}^15}\t{2:{5}^15}\t{3:{5}^15}\t{4:{5}^15}\t"
    lines = [tplt.format("职位名", "公司名", "工作地点", "薪资", "发布时间", fill)]
    lines.extend(
        tplt.format(row[0], row[1], row[2], row[3], row[4], fill) for row in a
    )
    # Format everything before opening the file so a malformed row cannot
    # leave a truncated file behind. Plain "w" suffices: nothing is read back.
    with open(path, "w", encoding="utf-8") as file:
        file.write("\n".join(lines) + "\n")
def main():
    """Fetch one 51job search-result page, then print and save its postings.

    The search criteria are fixed in the URL below. If the download fails
    nothing is parsed or written.
    """
    # The "degreefrom" parameter had been mangled to "°reefrom" (the "&deg"
    # prefix was decoded as an HTML entity), which corrupted the query string.
    url = (
        "https://search.51job.com/list/120000,000000,0000,32,9,99,"
        "Java%25E5%25BC%2580%25E5%258F%2591,2,1.html"
        "?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99"
        "&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0"
        "&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0"
        "&address=&line=&specialarea=00&from=&welfare="
    )
    html = getHtml(url)
    if html is None:
        # getHtml yields None after reporting a request failure; there is
        # nothing to parse in that case.
        return
    results = getInformation(html)
    printInformation(results)
    store(results)
    print("OK")
# Run the crawl only when executed as a script, so importing this module
# does not trigger a network request and a file write.
if __name__ == "__main__":
    main()
# Web crawler example: Java job postings on 51job (前程无忧)
# Source page footer: "latest recommended article published 2024-04-01 15:59:06"