关于51job网站的爬虫
一、原由
学校里有要求每日收集相应就业信息内容
二、查看51job网站内容
三、部分代码展示
def create_excel(excel_data):
    """Dump *excel_data* into a new workbook named "<month><day><college>.xlsx".

    Relies on the module-level ``college`` string for the file name and on
    xlwings (``xw``) driving a local Excel instance.
    """
    localtime = time.localtime(time.time())
    name = str(localtime.tm_mon) + str(localtime.tm_mday) + college + ".xlsx"
    app = xw.App(visible=True, add_book=False)
    try:
        # Create a fresh workbook.
        wb = app.books.add()
        # NOTE(review): assumes the default sheet is named 'sheet1' — confirm;
        # Excel normally creates 'Sheet1'.
        sht = wb.sheets['sheet1']
        # 'table' expansion writes the whole 2-D list starting at A1.
        sht.range('A1').options(expand='table').value = excel_data
        print(sht.range('A1').value)
        wb.save(name)
        # Close the workbook.
        wb.close()
    finally:
        # Always quit Excel, even when writing or saving fails, so no orphan
        # EXCEL.EXE process is left behind (the original leaked it on error).
        app.quit()
    return
def write_excel(excels):
    """Print every value of a two-dimensional table, one per line.

    (Despite the name, this only prints — it does not write a file.)
    """
    for cell in (value for row in excels for value in row):
        print(cell)
    return
# 上海 软件工程
def get_html(job_pos, search):
    """Fetch one 51job search-result page and collect job-detail URLs.

    ``job_pos`` is a key into the module-level ``pos`` table; ``search`` is the
    raw keyword (quoted here before being handed to ``get_url``).  Appends each
    cleaned URL to the module-level ``job_list`` and returns it.
    """
    target_url = get_url(job_pos, 3, urllib.parse.quote(search))
    print(target_url)
    response = request.urlopen(target_url)
    # 51job serves its pages in GBK, not UTF-8.
    html_doc = response.read().decode('gbk')
    soup = BeautifulSoup(html_doc, 'html.parser')
    scripts = soup.find_all("script")
    # The result set is embedded as JSON in an inline <script>:
    #   window.__SEARCH_RESULT__ = {...}
    # Dots are escaped now — the original pattern's bare '.' matched any char.
    ans = re.findall(r"window\.__SEARCH_RESULT__\s*=\s*({.*})", scripts.__str__())
    # Keep only links ending in t=0 and strip the JSON backslash escapes.
    ans = re.findall(r"job_href\":\"(https:[^\"]*t=0)", ans[0])
    for an in ans:
        job_list.append(str(an).replace("\\", ""))
    return job_list
# Scrape per-job details from each posting page and build the spreadsheet rows.
def get_information(aim_websites):
    """For up to 10 job-detail URLs, scrape company/job fields and append one
    row per job to the module-level ``job_information_list``.

    The first appended row is ``type_table`` (the header).  Relies on module
    globals ``job_information_list``, ``type_table``, ``aim_create`` and
    ``college``.  Returns ``job_information_list``.
    """
    i = 1
    # Header row first.
    job_information_list.append(type_table)
    for website in aim_websites:
        job_information = []
        # Only the first 10 postings are collected.
        if i == 11:
            break
        # Example: "https://jobs.51job.com/shanghai-ypq/125300004.html?s=01&t=0"
        response = request.urlopen(website)
        # 51job pages are GBK-encoded.
        html_doc = response.read().decode('gbk')
        soup = BeautifulSoup(html_doc, 'html.parser')
        # Company name
        company_name = soup.find(class_="com_msg").p['title']
        # Company type
        company_type = soup.find(class_="com_tag").p['title']
        # Job posting block
        cn = soup.find(class_="cn")
        job_name = cn.h1['title']
        # '|'-separated summary line (location | ... | degree | headcount | ...).
        stuff_info = soup.find(class_="cn").find(class_="msg ltype")['title']
        ans = stuff_info.split("|")
        # Build the cleaned field list.
        detail = []
        # NOTE(review): appending the header to ``aim_create`` on EVERY loop
        # iteration looks unintentional — confirm against the rest of the file.
        aim_create.append(type_table)
        for an in ans:
            # Strip non-breaking spaces from each field.
            detail.append(str(an).replace(u'\xa0', u''))
        print('--------------------' + str(i) + '----------------------')
        # Sequence number
        job_information.append(i)
        i += 1
        # Category (module-level ``college``)
        job_information.append(college)
        # Creation date, formatted YYYY/M/D
        localtime = time.localtime(time.time())
        name = str(localtime.tm_year) + '/' + str(localtime.tm_mon) + '/' + str(localtime.tm_mday)
        job_information.append(name)
        # Expiry date (left blank)
        job_information.append('')
        # Work location — presumably the first '|'-separated field; the author
        # notes field positions are not always correct.
        job_information.append(detail[0])
        # Company name
        job_information.append(company_name)
        # Link address
        job_information.append(website)
        # Job title
        job_information.append(job_name)
        # Recommended flag (left blank)
        job_information.append('')
        # Company nature
        job_information.append(company_type)
        # Job nature (hard-coded to "internship")
        job_information.append('实习')
        # Education background — presumably the third '|'-separated field.
        job_information.append(detail[2])
        # Good credit
        job_information.append('1')
        # Bad credit (left blank)
        job_information.append('')
        # Fortune-500 flag (left blank)
        job_information.append('')
        # Listed-company flag (left blank)
        job_information.append('')
        # More than 200 employees
        job_information.append('1')
        # Fewer than 200 employees (left blank)
        job_information.append('')
        # Number of positions
        job_information.append('1')
        # Headcount needed — presumably the fourth '|'-separated field.
        job_information.append(detail[3])
        job_information_list.append(job_information.copy())
    return job_information_list
def get_pos():
    """Placeholder — not implemented yet; always returns None."""
    return None
# Build the 51job search URL for a position/keyword pair.
def get_url(pos_name, num, search):
    """Return the 51job list URL for ``pos_name`` (a key of the module-level
    ``pos`` table) and the keyword ``search``.

    ``num`` indexes the module-level ``company_size`` table.  The original
    ignored the argument and hard-coded ``company_size[3]``; it is honored
    now — the one visible caller passes 3, so behavior there is unchanged.

    NOTE(review): the caller (``get_html``) already URL-quotes ``search`` and
    it is quoted again here — confirm whether double-encoding is intended.
    """
    postion = pos[pos_name]
    size = company_size[num]
    # The original literal contained '°reefrom' — the HTML entity '&deg;' had
    # been rendered into the source text.  Restored to '&degreefrom='.
    target = (url + "/list/" + postion + ",000000,0000,00,9,99,"
              + urllib.parse.quote(search)
              + ",2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99"
              + "&degreefrom=99&jobterm=99&companysize=" + size
              + "&ord_field=0&dibiaoid=0&line=&welfare=")
    return target
def turn_page(source):
    """Generate the page-1..9 variants of a 51job list URL.

    51job encodes the page number as the last comma-separated field before
    ``.html`` (e.g. ``...,2,1.html``); substitute 1 through 9 into that slot.
    Returns the nine URLs as a list.
    """
    ans_list = []
    for page in range(1, 10):
        # '.' is escaped now — the original pattern's bare '.' could match
        # any character before 'html'.
        ans_list.append(re.sub(r',(\d*)\.html', "," + str(page) + ".html", source))
    return ans_list
def position(chars):
    """Concatenate the characters of *chars* into a single string.

    Bug fix: the original loop body was ``string = char + ""``, which
    overwrote the accumulator every pass, so the function returned only the
    LAST character.  The loop shape clearly intended accumulation.
    """
    return "".join(chars)
def to_unicode(string):
    """Return *string* as a run of ``\\uXXXX`` escapes (uppercase hex).

    Bug fix: code points below U+1000 are now zero-padded to four hex digits —
    the original emitted e.g. ``\\u41`` for 'A', which is not a valid escape.
    Output for CJK characters (>= U+1000) is unchanged.
    """
    return ''.join('\\u{:04X}'.format(ord(ch)) for ch in string)
四、效果展示
部分内容有误:需求人数、教育背景、发布时间等字段可能不准确,原因是这些字段在原始数据中的位置不固定(有待改进)。