Zhilian Zhaopin spider (saving to an Excel file)

A small Python 2 crawler that searches Zhilian Zhaopin (sou.zhaopin.com) for a job title in one or more preferred cities and writes the matching positions to an .xls spreadsheet with xlwt.

# coding: utf-8
# __author__ = 'wang'

# Python standard-library modules
import re
import urllib2
from random import choice

# Third-party module
import xlwt


# Spider class for Zhilian Zhaopin
class ZLZP(object):
    """
    work_name:  the job title to search for
    like_citys: a list of preferred cities
    """

    def __init__(self, work_name, like_citys):
        self.work_name = work_name
        self.like_citys = like_citys
        self.total_page_num = 0
        # Join the cities with %2B, which stands for '+' in a URL;
        # no separator is needed after the last city.
        city_string = '%2B'.join(self.like_citys)
        self.base_url = ('http://sou.zhaopin.com/jobs/searchresult.ashx?'
                         'jl=' + city_string + '&kw=' + self.work_name)
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Opera/8.0 (Windows NT 5.1; U; en)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
            # These two adjacent literals form a single Chrome/Opera User-Agent
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        ]
        # Pick a random User-Agent for each run to look less like a bot
        self.headers = {'User-Agent': choice(user_agent_list)}

    # Fetch the HTML source of one result page
    def get_page_code(self, pageNum):
        # Build the absolute URL of the requested page from pageNum
        absolute_url = self.base_url + '&p=' + str(pageNum)
        request = urllib2.Request(absolute_url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
        except Exception, e:
            print 'Failed to fetch page {}: {}'.format(pageNum, e)
            return None
        else:
            print 'Fetched page {} successfully!'.format(pageNum)
            return response.read()

    # Extract the total number of positions and derive the page count
    def get_total_position(self, html):
        pattern = re.compile(r'<span class="search_yx_tj">.*?<em>(.*?) </em>', re.S)
        match = re.search(pattern, html)
        if match:
            number = int(match.group(1))
            # Each result page shows 60 records; any remainder needs one extra page
            if number % 60 == 0:
                self.total_page_num = number / 60
            else:
                self.total_page_num = number / 60 + 1
        else:
            print 'The regex for the total position count did not match'

    # Extract the job records from one page of HTML
    def get_all_data(self, html):
        pattern = re.compile(
            r'<table.*?class="newlist">.*?<td class="zwmc.*?<a.*?>(.*?)</a>'
            r'.*?<td.*?class="fk_lv.*?<span>(.*?) </span>'
            r'.*?<td class="gsmc.*?_blank">(.*?)</a>'
            r'.*?<td class="zwyx"> (.*?)</td>'
            r'.*?<td class="gzdd">(.*?)</td>', re.S)
        results = re.findall(pattern, html)
        data_list = []
        remove_element = re.compile(r'<.*?>', re.S)
        for result in results:
            # Strip nested tags from the job title and the company name
            zwmc = re.sub(remove_element, '', result[0])
            gsmc = re.sub(remove_element, '', result[2])
            data_list.append((zwmc, result[1], gsmc, result[3], result[4]))
        return data_list

    # Create the Excel workbook and write the header row
    def open_excel_file(self):
        workbook = xlwt.Workbook(encoding='utf-8')
        sheet = workbook.add_sheet('Python jobs')
        # write() takes the row index first, then the column index
        sheet.write(0, 0, 'Job title')
        sheet.write(0, 1, 'Feedback rate')
        sheet.write(0, 2, 'Company')
        sheet.write(0, 3, 'Monthly salary')
        sheet.write(0, 4, 'Location')
        return sheet, workbook

    # Run the spider
    def start_spider(self):
        # 1. Fetch the first page
        html = self.get_page_code(1)
        if html is None:
            print 'Failed to fetch the first page'
            return
        # 2. Extract the total position count and compute the page count
        self.get_total_position(html)
        # 3. Loop over every page and write the records to the sheet
        sheet, workbook = self.open_excel_file()
        # Current row index; row 0 holds the header
        record_row = 1
        for x in xrange(1, self.total_page_num + 1):
            html = self.get_page_code(x)
            if html is None:
                # Skip a page that failed to download
                continue
            results_list = self.get_all_data(html)
            for zwmc, fk_lv, gsmc, zwyx, gzdd in results_list:
                sheet.write(record_row, 0, zwmc)
                sheet.write(record_row, 1, fk_lv)
                sheet.write(record_row, 2, gsmc)
                sheet.write(record_row, 3, zwyx)
                sheet.write(record_row, 4, gzdd)
                record_row += 1
        # Save the workbook to disk
        workbook.save('python_jobs.xls')


if __name__ == '__main__':
    work_name = raw_input('Enter the job title to search for: ')
    citys = []
    while True:
        city_name = raw_input('Enter a preferred city (Q to finish): ')
        if city_name == 'Q':
            break
        citys.append(city_name)
    zlzp = ZLZP(work_name, citys)
    zlzp.start_spider()
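The constructor splices raw (possibly Chinese) city names straight into the query string, and urllib2 can fail on non-ASCII bytes in a URL. Below is a minimal sketch of percent-encoding the parameters with the standard library's urllib.quote; build_search_url is a hypothetical helper for illustration, not part of the original class.

# coding: utf-8
# Sketch: percent-encode the query parameters before building the search
# URL. Assumes Python 2 with UTF-8 terminal input; build_search_url is a
# hypothetical helper, not part of the original spider.
import urllib

def build_search_url(work_name, citys):
    # quote() percent-encodes non-ASCII bytes; '%2B' is a literal '+'
    city_string = '%2B'.join(urllib.quote(city) for city in citys)
    return ('http://sou.zhaopin.com/jobs/searchresult.ashx?'
            'jl=' + city_string + '&kw=' + urllib.quote(work_name))

print build_search_url('python', ['郑州', '杭州'])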
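get_page_code gives up after a single failed request. A common refinement is to retry a few times with a short pause between attempts, which also throttles the crawl. This is a sketch rather than part of the original code; fetch_with_retries, max_retries, and delay are illustrative names and values.

# coding: utf-8
# Sketch: retry a request a few times, pausing between attempts.
# fetch_with_retries is a hypothetical helper, not in the original class.
import time
import urllib2

def fetch_with_retries(url, headers, max_retries=3, delay=2):
    for attempt in xrange(1, max_retries + 1):
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=10).read()
        except Exception, e:
            print 'Attempt {} failed: {}'.format(attempt, e)
            time.sleep(delay)  # back off before the next attempt
    return None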
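To spot-check the spreadsheet the spider produced, the xlrd package (the usual reading counterpart to xlwt) can load the .xls file back. The sketch below assumes the python_jobs.xls file name used above.

# coding: utf-8
# Sketch: read the generated .xls back with xlrd to verify the rows.
# Assumes 'python_jobs.xls' was just written by the spider above.
import xlrd

book = xlrd.open_workbook('python_jobs.xls')
sheet = book.sheet_by_index(0)
print 'Rows written (including the header):', sheet.nrows
for row in xrange(min(sheet.nrows, 5)):
    # row_values() returns all cells of one row as a list
    print sheet.row_values(row)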