Regex Crawler Example

The example below crawls job listings from Zhilian Zhaopin (sou.zhaopin.com) with urllib and regular expressions, then writes the results to an Excel file with xlwt. quote() percent-encodes the non-ASCII (here, Chinese) parts of a URL such as 'http://www.zhilianzhaopin.com?kw=Python工程师&name=张三'; for example, quote('北京') returns '%E5%8C%97%E4%BA%AC'.

import random
import re

import xlwt
from urllib.parse import quote
from urllib.request import Request, urlopen


class ZLZPSpider(object):
    def __init__(self, kw, city_list):
        self.row = 1                 # next spreadsheet row to write
        self.kw = quote(kw)          # percent-encode the search keyword
        self.city_list = city_list
        self.user_agent = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        ]
        # Build the search URL from kw and city_list. Cities are joined
        # with %2B, the percent-encoding of '+'.
        relation_url = '%2B'.join(quote(city) for city in city_list)
        self.url = ('http://sou.zhaopin.com/jobs/searchresult.ashx?jl='
                    + relation_url + '&kw=' + self.kw + '&p=')

    def get_page_list(self, page_num):
        # Fetch the HTML of one search-result page; return None on failure.
        url = self.url + str(page_num)
        request = Request(url, headers={'User-Agent': random.choice(self.user_agent)})
        try:
            response = urlopen(request)
            content = response.read().decode()  # decode() assumes UTF-8
        except Exception as e:
            print('Failed to fetch page source. Reason: {}, URL: {}'.format(e, url))
            return None
        else:
            return content

    def parse_page_list(self, sheet, content):
        # Pull (detail URL, job title, company, salary, location) tuples
        # out of the listing page with one regex.
        if content is None:  # the download failed; nothing to parse
            return
        pattern = re.compile(
            r'<td class="zwmc".*?<a.*?href="(.*?)".*?>(.*?)</a>'
            r'.*?<td class="gsmc">.*?<a.*?target="_blank">(.*?)</a>'
            r'.*?<td class="zwyx">(.*?)</td>'
            r'.*?<td class="gzdd">(.*?)</td>', re.S)
        result_list = re.findall(pattern, content)
        self.write_data(sheet, result_list)  # clean the data and save it

    def open_file(self):
        # 1. Create the workbook object.
        book = xlwt.Workbook(encoding='utf-8')
        # 2. Create a sheet ("job overview").
        sheet = book.add_sheet('职位简介')
        # 3. Write the header row: job title, detail URL, company name,
        #    monthly salary, location, job requirements.
        #    The first argument is the row, the second is the column.
        sheet.write(0, 0, '职位名称')
        sheet.write(0, 1, '职位详情')
        sheet.write(0, 2, '公司名称')
        sheet.write(0, 3, '职位月薪')
        sheet.write(0, 4, '工作地点')
        sheet.write(0, 5, '职位要求')
        return book, sheet

    def write_data(self, sheet, data):
        # data is a list of tuples: [(href, zwmc, gsmc, zwyx, gzdd), ...]
        for href, zwmc, gsmc, zwyx, gzdd in data:
            zwmc = re.sub('<b>|</b>', '', zwmc)  # strip highlight tags from the title
            sheet.write(self.row, 0, zwmc)
            sheet.write(self.row, 1, href)
            sheet.write(self.row, 2, gsmc)
            sheet.write(self.row, 3, zwyx)
            sheet.write(self.row, 4, gzdd)
            content = self.get_page_detail(href)
            self.parse_page_detail(content, sheet, self.row)
            self.row += 1

    def get_page_detail(self, url):
        # Fetch the HTML of one job-detail page; return None on failure.
        request = Request(url, headers={'User-Agent': random.choice(self.user_agent)})
        try:
            response = urlopen(request)
            content = response.read().decode()
        except Exception as e:
            print('Detail page error. Reason: {}, URL: {}'.format(e, url))
            return None
        return content

    def parse_page_detail(self, html, sheet, row):
        # Extract the job-description text and write it to column 5.
        if html is None:  # the download failed; leave the cell empty
            return
        pattern = re.compile(r'<div class="tab-inner-cont">(.*?)</div>', re.S)
        result = re.search(pattern, html)
        if result is not None:
            res = result.group()
            pattern_tag = re.compile(r'<.*?>')  # matches any remaining HTML tag
            data = re.sub(pattern_tag, '', res).strip().replace('\n', '')
            sheet.write(row, 5, data)

    def close_file(self, book):
        book.save('Python工程师职位.xls')


if __name__ == "__main__":
    # Search keyword 'Python开发工程师' ("Python development engineer") in
    # Zhengzhou, Beijing, and Shanghai.
    obj = ZLZPSpider('Python开发工程师', ['郑州', '北京', '上海'])
    book, sheet = obj.open_file()
    html = obj.get_page_list(1)
    obj.parse_page_list(sheet, html)
    obj.close_file(book)

A side note kept from the original: when downloading a binary resource such as 'http://www.baidu.com/asdfasdfasdf.jpg' with the requests library, the response's content attribute holds the raw binary data (bytes), while its text attribute holds the decoded str; binary data should be written to a file opened in binary mode ('wb').
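To make that side note concrete, here is a minimal sketch using the third-party requests package (assumed to be installed; the URL is the placeholder from the original notes, not a real image):

import requests

# Placeholder URL carried over from the notes above; substitute a real image address.
url = 'http://www.baidu.com/asdfasdfasdf.jpg'

resp = requests.get(url)
print(type(resp.content))  # <class 'bytes'>: the raw binary payload
print(type(resp.text))     # <class 'str'>: the same payload decoded to text

# Binary data must be written to a file opened in binary ('wb') mode.
with open('1.jpg', 'wb') as f:
    f.write(resp.content)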
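Returning to the spider itself: the __main__ block above only fetches page 1 of the search results. Since get_page_list() takes a page number and the search URL ends in '&p=', looping over several pages is a natural extension. A minimal sketch, assuming the site paginates via that parameter; page_count is a hypothetical limit, not something the original code computes:

if __name__ == "__main__":
    obj = ZLZPSpider('Python开发工程师', ['郑州', '北京', '上海'])
    book, sheet = obj.open_file()
    page_count = 3  # hypothetical: crawl the first three result pages
    for page_num in range(1, page_count + 1):
        html = obj.get_page_list(page_num)
        obj.parse_page_list(sheet, html)  # skips pages that failed to download
    obj.close_file(book)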