# TODO: needs further improvement.
import re
import requests
import os
import xlwt
import codecs
class Item(object):
    """Plain record holding the four fields scraped for job listings.

    The attribute names are pinyin: ``zhiwei`` (job title), ``gongzi``
    (salary), ``gongzuodidian`` (work location) and ``gongsimingcheng``
    (company name).  Each one holds whatever the caller stores in it and
    defaults to ``None``.
    """

    # Class-level defaults are kept so that ``Item.zhiwei`` and friends
    # still resolve to None for code that reads them off the class.
    zhiwei = None
    gongzi = None
    gongzuodidian = None
    gongsimingcheng = None

    def __init__(self, zhiwei=None, gongzi=None, gongzuodidian=None,
                 gongsimingcheng=None):
        """Create a record; every field is optional and defaults to None."""
        self.zhiwei = zhiwei
        self.gongzi = gongzi
        self.gongzuodidian = gongzuodidian
        self.gongsimingcheng = gongsimingcheng

    def __repr__(self):
        """Return a debugging representation listing all four fields."""
        return (
            '{0}(zhiwei={1!r}, gongzi={2!r}, gongzuodidian={3!r}, '
            'gongsimingcheng={4!r})'
        ).format(
            type(self).__name__,
            self.zhiwei,
            self.gongzi,
            self.gongzuodidian,
            self.gongsimingcheng,
        )
class getPosition(object):
    """Scrape zhaopin.com job-search result pages and dump the listings.

    Constructing an instance runs the whole pipeline: build the page URLs,
    download and regex-parse every page into an ``Item``, then pass the
    collected items to ``save``.
    """

    # Search-result URL without the trailing page number.  The query is
    # percent-encoded UTF-8: jl decodes to "nationwide", kw to "data analyst".
    _URL_HEAD = (
        'http://sou.zhaopin.com/jobs/searchresult.ashx'
        '?jl=%E5%85%A8%E5%9B%BD'
        '&kw=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88'
        '&sm=0&isfilter=0&fl=489&isadv=0'
        '&sg=aae5284f62664af8b14611bda6d68315'
        '&p='
    )

    # Compiled once at class creation instead of once per downloaded page.
    _PAT_ZHIWEI = re.compile(r'<b>(.*?)</b>')
    _PAT_GONGZI = re.compile(r'<td class="zwyx">(.*?)</td>')
    _PAT_GONGZUODIDIAN = re.compile(r'<td class="gzdd">(.*?)</td>')
    _PAT_GONGSIMINGCHENG = re.compile(r'target="_blank">(.*?)<')

    # Tab-separated row layout used for both the console and the file.
    _ROW_TEMPLATE = "{0:^10}\t{1:<10}\t{2:^10}\t{3:^10}"

    def __init__(self, pages=2):
        """Run the full scrape immediately.

        Args:
            pages: Number of result pages to fetch, starting at page 1.
        """
        self.urlBase = self._URL_HEAD + '1'
        self.urls = []
        self.items = []
        self.getUrls(pages)
        self.spider(self.urls)
        self.save(self.items)

    def getHTML(self, url):
        """Download ``url`` and return the response body as text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond in time.
        """
        kv = {'user-agent': 'Mozilla/5.0'}
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text

    def getUrls(self, pages):
        """Append the URLs for result pages ``1..pages`` to ``self.urls``."""
        self.urls.extend(
            self._URL_HEAD + str(page) for page in range(1, pages + 1)
        )

    def spider(self, urls):
        """Fetch every URL and append one parsed ``Item`` per page.

        Each item's four fields hold the lists of regex matches found in
        that page's HTML.  The match counts for the first two fields are
        printed after each page.
        """
        for url in urls:
            html = self.getHTML(url)
            # A fresh Item per page: reusing one instance would leave every
            # entry of self.items pointing at the last page's data.
            item = Item()
            item.zhiwei = self._PAT_ZHIWEI.findall(html)
            item.gongzi = self._PAT_GONGZI.findall(html)
            item.gongzuodidian = self._PAT_GONGZUODIDIAN.findall(html)
            item.gongsimingcheng = self._PAT_GONGSIMINGCHENG.findall(html)
            self.items.append(item)
            print(len(item.zhiwei))
            print(len(item.gongzi))

    def save(self, items, fileName='1a.txt'):
        """Print every listing row and write it to ``fileName`` as UTF-8.

        Args:
            items: Iterable of objects exposing ``zhiwei``, ``gongzi``,
                ``gongzuodidian`` and ``gongsimingcheng`` sequences.
            fileName: Output path; the file is overwritten.

        The four sequences of an item are walked in lockstep and a row is
        emitted for each index present in all of them, so pages with fewer
        matches no longer raise IndexError and longer pages are not cut off
        at a fixed row count.
        """
        with codecs.open(fileName, 'w', 'utf-8') as f:
            for item in items:
                rows = zip(
                    item.zhiwei or (),
                    item.gongzi or (),
                    item.gongzuodidian or (),
                    item.gongsimingcheng or (),
                )
                for row in rows:
                    line = self._ROW_TEMPLATE.format(*row)
                    print(line)
                    f.write(line + '\n')
if __name__ == "__main__":
    # Script entry point: constructing the scraper runs the whole pipeline.
    p = getPosition()