前言:小编也是现学现卖,方便自己记忆,写的不好的地方还请包涵,也欢迎各位大佬多多批评指正。
话不多说直接上代码,
1.首先编写抓取单个岗位页面信息并写入excel表格的基本代码,把各步骤封装成函数,方便后面调用。
import requests
import bs4
import openpyxl
# Request headers carrying a desktop Firefox User-Agent string; the callers
# in this file pass this dict to get_html().
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) "
        "Gecko/20100101 Firefox/77.0"
    ),
}
def get_html(url, headers, timeout=10, encoding="gbk"):
    """Download *url* with a GET request and return the body as text.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.
        encoding: Codec used to decode the response body. Defaults to
            ``"gbk"``, the value this function previously hard-coded.

    Returns:
        The decoded page source as a string.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    # Force the decoding codec instead of relying on what requests inferred.
    res.encoding = encoding
    return res.text
def get_text(html, url):
    """Extract the text fields of one job-detail page into a flat row.

    The returned list is ordered to match the header row written by
    ``save_info``: job title, company name, salary, requirements, perks,
    job description, company description, and finally *url* itself.

    Args:
        html: Source of the job-detail page.
        url: Address the page was fetched from; appended as the last item.

    Returns:
        A list of eight values (seven extracted strings plus *url*). A field
        whose element is not present in *html* is returned as an empty
        string instead of raising ``AttributeError``.
    """
    soup = bs4.BeautifulSoup(html, "html.parser")

    def text_of(node):
        # soup.find() returns None when nothing matches; map that to "".
        return node.text if node is not None else ""

    job_name = text_of(soup.find("h1"))
    company = text_of(soup.find("a", class_="catn"))

    # The salary is the <strong> element inside the "cn" header div.
    header = soup.find("div", class_="cn")
    salary = text_of(header.strong if header is not None else None)

    # Drop the non-breaking spaces contained in the requirement line.
    requirements = text_of(soup.find("p", class_="msg ltype")).replace("\xa0", "")

    # Each perk tag is followed by one space. The original accumulated into
    # an uninitialised variable here, which raised UnboundLocalError.
    perks = "".join(
        tag.text + " " for tag in soup.find_all("span", class_="sp4")
    )

    # The job description is the text of every <p> in the message box,
    # each followed by one space.
    detail = soup.find("div", class_="bmsg job_msg inbox")
    paragraphs = detail.find_all("p") if detail is not None else []
    description = "".join(p.text + " " for p in paragraphs)

    company_info = text_of(soup.find("div", class_="tmsg inbox"))

    return [
        job_name,
        company,
        salary,
        requirements,
        perks,
        description,
        company_info,
        url,
    ]
def save_info(all_list, path="c:\\users\\spring\\desktop\\nz2001.xlsx"):
    """Write a header row and one job row to a new Excel workbook.

    Args:
        all_list: The row of job fields to store, in the same order as the
            header columns written below.
        path: Destination of the ``.xlsx`` file. Defaults to the location
            that was previously hard-coded, so existing callers are
            unaffected.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(["岗位名称","公司名称","薪资","岗位要求","职位诱惑","职位信息","公司信息","职位网站"])
    ws.append(all_list)
    wb.save(path)
if __name__ == "__main__":
    # Other postings that can be substituted for a trial run:
    #   https://jobs.51job.com/changsha-ylq/82873008.html?s=01&t=0
    #   https://jobs.51job.com/hunansheng/122942156.html?s=01&t=0
    url = "https://jobs.51job.com/changsha-ylq/117480590.html?s=01&t=0"
    # Download the page, extract its fields, then write the single row.
    save_info(get_text(get_html(url, headers), url))
2.调用上方编写好的函数。我这里把上面的代码保存为模块 job51page(文件名 job51page.py),下面通过 import job51page 来调用。
import job51page #调用之前编写好的模块
import re #调用正则
import openpyxl #调用编写excel的表格
def get_joblist(reruler, html):
    """Return every match of the pattern *reruler* found in *html*.

    Args:
        reruler: Regular expression describing a job-detail URL.
        html: Source of a search-result page.

    Returns:
        The list produced by ``findall`` for the pattern, in document order.
    """
    pattern = re.compile(reruler)
    return pattern.findall(html)
def save_info(joblist, path="c:\\users\\spring\\desktop\\nz2001.xlsx"):
    """Fetch every job page in *joblist* and save all rows to one workbook.

    Args:
        joblist: Iterable of job-detail URLs.
        path: Destination of the ``.xlsx`` file. Defaults to the location
            that was previously hard-coded, so existing callers are
            unaffected.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(["岗位名称","公司名称","薪资","岗位要求","职位诱惑","职位信息","公司信息","职位网站"])
    for joburl in joblist:
        # Download one job page with the shared request headers.
        html = job51page.get_html(joburl, job51page.headers)
        # Extract its text fields and add them as the next row.
        ws.append(job51page.get_text(html, joburl))
    # The workbook is written once, after every URL has been processed.
    wb.save(path)
if __name__ == "__main__":
    # Search-result page; the job keyword (渗透测试工程师) is embedded in the URL.
    url = (
        "https://search.51job.com/list/190000%252C090200,"
        "000000,0000,00,9,99,渗透测试工程师,2,1.html"
    )
    # Pattern for job-detail links. Changes from the original pattern:
    #   * raw string, so "\d" is not an invalid string escape sequence;
    #   * literal dots are escaped;
    #   * the path part is non-greedy and cannot cross quotes, whitespace or
    #     angle brackets, so two links on one line are not merged;
    #   * any number of id digits is accepted (8-digit ids also occur).
    reruler = r"https://jobs\.51job\.com/[^\s\"'<>]*?/\d+\.html"
    # Fetch the search page using the helper and headers from job51page.
    html = job51page.get_html(url, job51page.headers)
    # Collect the job-detail URLs that appear in the search page.
    joblist = get_joblist(reruler, html)
    save_info(joblist)
3.如想搜索别的岗位的信息,直接更换搜索名称即可
注:我这里是以湖南和四川为例,具体城市可更换url进行搜索。