网页展示
代码
import requests, re
import xlwt#excel写模块
#1- 创建一个excel文件对象
def excel_init():
    """Create the output workbook and write its header row.

    Builds an ``xlwt`` workbook with a single sheet named ``"51job"`` and
    writes the five column titles into row 0.

    Returns:
        tuple: ``(workBook, workSheet)`` -- the workbook (the caller invokes
        ``save`` on it) and the sheet that receives the data rows.
    """
    workBook = xlwt.Workbook(encoding='utf-8')
    workSheet = workBook.add_sheet("51job")
    colName = ['岗位', '公司名称', '地址', '薪资', '发布时间']
    # Sheet rows and columns are both 0-based: write(row, column, value).
    for col, title in enumerate(colName):
        workSheet.write(0, col, title)
    # Saving is left to the caller so that data rows can be added first.
    return workBook, workSheet
#workBook.save('D:\51job.xls')
workBook, workSheet = excel_init()  # workbook + sheet with the header row already written

# First page of the search results; the number right before ".html" is the
# page index. The separator in front of "degreefrom" must be a literal "&"
# (it had been corrupted into a degree sign, which broke the query string).
web_url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

# Request headers sent with every page fetch; the User-Agent makes the
# request look like it comes from a regular browser.
user_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
# https://search.51job.com/
# 51job
# ----封装一个 获取页数的函数
def get_pages(url):
    """Return the total number of result pages reported by a search page.

    The count is taken from the ``"total_page"`` field found in the page
    source, using the module-level ``user_header`` as request headers.

    Args:
        url: URL of a search-results page.

    Returns:
        int: the value of the ``"total_page"`` field.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        ValueError: if the page has no ``"total_page"`` field or its value
            is not an integer.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=user_header, timeout=10)
    resp.raise_for_status()
    resp.encoding = 'gbk'  # decode the body as GBK before reading resp.text

    found = re.search(r'"total_page":"(.*?)",', resp.text, re.S)
    if found is None:
        raise ValueError('"total_page" field not found in the response for ' + url)
    return int(found.group(1).strip())
# Anti-scraping measures noted for this site:
#   1. Browser detection -- symptom: the remote host forcibly closes the
#      connection; workaround: send a browser-like User-Agent header.
#   2. CAPTCHA challenges.
#   3. IP bans.
#   4. Account bans.

# One record runs from its "job_href" key up to the "is_special_job" key that
# opens the next record. The end-of-text alternative keeps the final record on
# a page, which has no following record to terminate it.
_RECORD_PATTERN = r'"job_href"(.*?)(?:"is_special_job":"[^"]*",|\Z)'

# Field patterns in sheet-column order: job title, company name, work area,
# salary, publication date. Sample of the embedded data they run against:
#   "job_name":"自动化测试工程师","job_title":"自动化测试工程师",
#   "company_name":"上海宽文是风软件有限公司","providesalary_text":"1-1.5万\/月",
#   "workarea":"021600","workarea_text":"上海-青浦区","updatedate":"01-16","isIntern":"",
_FIELD_PATTERNS = (
    r'"job_name":"(.*?)","job_title"',
    r'"company_name":"(.*?)","providesalary_text"',
    r'"workarea_text":"(.*?)","updatedate":',
    r'"providesalary_text":"(.*?)","workarea"',
    r'"updatedate":"(.*?)","isIntern"',
)


def _first_match(pattern, text):
    """Return the first captured group of *pattern* in *text*, or '' if absent.

    The escaped-slash sequence used inside the embedded JSON strings is
    turned back into a plain '/' so that values such as salaries read
    naturally in the sheet.
    """
    found = re.search(pattern, text, re.S)
    if found is None:
        return ''
    return found.group(1).replace('\\/', '/')


# Fetch the page count once and reuse it for both the log line and the loop.
total_pages = get_pages(web_url)
print('----------->:', total_pages)

row = 1  # next data row; row 0 holds the column titles
try:
    for page in range(1, total_pages + 1):
        # 1- Request the listing page for this page index.
        page_url = f'https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
        resp = requests.get(page_url, headers=user_header, timeout=10)
        # 2- Decode the response body as GBK.
        resp.encoding = 'gbk'
        # 3- Cut the page into per-job chunks and pull the fields out of each.
        for record in re.findall(_RECORD_PATTERN, resp.text, re.S):
            values = [_first_match(pattern, record) for pattern in _FIELD_PATTERNS]
            if not any(values):
                # Not a job record (none of the expected fields present).
                continue
            # Write plain strings rather than the lists re.findall returns.
            for col, value in enumerate(values):
                workSheet.write(row, col, value)
            row += 1
finally:
    # 4- Save whatever was collected, even if a later page failed.
    # Per the original note, the target file must be closed (not open in
    # another program) for the save to work.
    workBook.save('D:\\pythonCourse\\51job.xls')
'''
1- https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
2- https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
'''
excel 保存获取结果