# 正则爬取网站数据并存储到Excel表格中
import random, xlwt, re
from urllib.request import Request, urlopen
from urllib.parse import quote
# quote():是对url地址中的中文进行编码的一个函数
# 'http://www.zhilianzhaopin.com?kw=Python工程师&name=张三'
class ZLZPSpider(object):
    """Scrape job listings from sou.zhaopin.com search results into an Excel sheet.

    Parameters:
        kw        -- search keyword (e.g. 'Python工程师'); URL-quoted internally.
        city_list -- list of city names to search; quoted and joined with '%2B'
                     (the URL-encoded '+') to form the `jl` query parameter.
    """

    def __init__(self, kw, city_list):
        # Next Excel row to write into; row 0 is reserved for the header.
        self.row = 1
        self.kw = quote(kw)
        self.city_list = city_list
        # Plain User-Agent header *values*. The header name is supplied as the
        # dict key in Request(headers=...), so it must not be repeated here
        # (the original embedded "User-Agent, " inside the value, producing a
        # malformed header).
        self.user_agent = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        ]
        # Quote each city and join with the encoded '+'. Using join instead of
        # comparing against city_list[-1] is also correct when a city name
        # appears more than once in the list.
        relation_url = '%2B'.join(quote(city) for city in city_list)
        self.url = ('http://sou.zhaopin.com/jobs/searchresult.ashx?jl='
                    + relation_url + '&kw=' + self.kw + '&p=')

    def get_page_list(self, page_num):
        """Fetch one search-result page; return its HTML, or None on failure."""
        url = self.url + str(page_num)
        request = Request(url, headers={'User-Agent': random.choice(self.user_agent)})
        try:
            response = urlopen(request)
            # NOTE(review): decode() assumes UTF-8; pass the actual charset if
            # the site serves another encoding (e.g. gbk) — confirm.
            content = response.read().decode()
        except Exception as e:
            print('获取源代码失败, 原因:{}, 地址:{}'.format(e, url))
            return None
        else:
            return content

    def parse_page_list(self, sheet, content):
        """Extract (href, title, company, salary, location) tuples and write them.

        `content` may be None when get_page_list() failed; in that case there
        is nothing to parse and the method returns silently.
        """
        if content is None:
            return
        pattern = re.compile(r'<td class="zwmc".*?<a.*?href="(.*?)".*?>(.*?)</a>.*?<td class="gsmc">.*?<a.*?target="_blank">(.*?)</a>.*?<td class="zwyx">(.*?)</td>.*?<td class="gzdd">(.*?)</td>', re.S)
        result_list = re.findall(pattern, content)
        self.write_data(sheet, result_list)

    def open_file(self):
        """Create the workbook and a sheet with the header row; return (book, sheet)."""
        # 1. Create the workbook object.
        book = xlwt.Workbook(encoding='utf-8')
        # 2. Create the sheet ('职位简介' = job summary).
        sheet = book.add_sheet('职位简介')
        # 3. Write the header row: sheet.write(row, column, value).
        sheet.write(0, 0, '职位名称')   # job title
        sheet.write(0, 1, '职位详情')   # detail-page URL
        sheet.write(0, 2, '公司名称')   # company name
        sheet.write(0, 3, '职位月薪')   # monthly salary
        sheet.write(0, 4, '工作地点')   # work location
        sheet.write(0, 5, '职位要求')   # job requirements
        return book, sheet

    def write_data(self, sheet, data):
        """Write one listing per row and fetch/append each listing's detail text.

        `data` is a list of 5-tuples as produced by parse_page_list().
        """
        for href, zwmc, gsmc, zwyx, gzdd in data:
            # The title may carry <b>…</b> highlight tags around the keyword.
            zwmc = re.sub('<b>|</b>', '', zwmc)
            sheet.write(self.row, 0, zwmc)
            sheet.write(self.row, 1, href)
            sheet.write(self.row, 2, gsmc)
            sheet.write(self.row, 3, zwyx)
            sheet.write(self.row, 4, gzdd)
            content = self.get_page_detail(href)
            self.parse_page_detail(content, sheet, self.row)
            self.row += 1

    def get_page_detail(self, url):
        """Fetch a job-detail page; return its HTML, or None on failure."""
        request = Request(url, headers={'User-Agent': random.choice(self.user_agent)})
        try:
            response = urlopen(request)
            content = response.read().decode()
        except Exception as e:
            print('详情页异常,原因:{}, 地址:{}'.format(e, url))
            return None
        return content

    def parse_page_detail(self, html, sheet, row):
        """Extract the job-requirements text from a detail page into column 5.

        `html` may be None when get_page_detail() failed; re.search would raise
        TypeError on None, so bail out early.
        """
        if html is None:
            return
        pattern = re.compile(r'<div class="tab-inner-cont">(.*?)</div>', re.S)
        result = re.search(pattern, html)
        if result is not None:
            res = result.group()
            # Strip every remaining tag, then collapse whitespace/newlines.
            pattern_tag = re.compile(r'<.*?>')
            data = re.sub(pattern_tag, '', res).strip().replace('\n', '')
            sheet.write(row, 5, data)

    def close_file(self, book, filename='Python工程师职位.xls'):
        """Save the workbook to `filename` (default kept for compatibility)."""
        book.save(filename)
if __name__ == "__main__":
obj = ZLZPSpider('Python开发工程师', ['郑州','北京','上海'])
book, sheet = obj.open_file()
html = obj.get_page_list(1)
obj.parse_page_list(sheet, html)
obj.close_file(book)
# 'http://www.baidu.com/asdfasdfasdf.jpg'
#
# response = requests.get(url)
# response.content  # raw binary payload (bytes)
# response.text     # decoded str
#
# with open('C:/Desktop/1.jpg', 'wb') as f:
#     f.write(response.content)