import json
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup as bs
class Spider(object):
    """Scrape job listings page by page and append them to ``job.json``.

    The crawl runs ``load_page`` -> ``parse_page`` -> ``save_file``: every
    listing page is downloaded, its ``<li class="bg1">`` / ``<li class="bg3">``
    rows are turned into dictionaries, and each page's batch is appended to
    the output file as one JSON document per line.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0"}

    def __init__(self, begin_page=1, end_page=4,
                 base_url='https://job.e0575.com/list.php?cIx=5&page='):
        """Configure the crawl range.

        Args:
            begin_page: First page number to fetch (inclusive).
            end_page: Page number to stop at (exclusive, as in ``range``).
            base_url: Listing URL prefix; the page number is appended to it.
        """
        print('init')
        self.begin_page = begin_page
        self.end_page = end_page
        self.base_url = base_url

    def load_page(self):
        """Download every page in ``[begin_page, end_page)`` and parse it.

        Raises:
            urllib.error.URLError: If a request fails.
            UnicodeDecodeError: If a response body is not valid UTF-8.
        """
        print("load")
        for page in range(self.begin_page, self.end_page):
            print(page)
            url = self.base_url + str(page)
            request = urllib.request.Request(url, headers=self.HEADERS)
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode('utf-8')
            self.parse_page(html)

    def parse_page(self, html):
        """Extract the job rows from one listing page and save them.

        Args:
            html: The page markup as a string.

        Raises:
            AttributeError, IndexError, KeyError: If a row lacks one of the
                expected elements or attributes.
        """
        print("parse")
        soup = bs(html, 'lxml')
        # Rows whose class attribute is exactly "bg1" come first, then "bg3".
        rows = soup.select('li[class="bg1"]') + soup.select('li[class="bg3"]')
        items = []
        for site in rows:
            name = site.find('span').text
            detail_link = site.find('a').attrs['href']
            # NOTE(review): the fixed slice offsets below presumably trim
            # label/whitespace padding in the site's markup -- verify against
            # a live page before changing them.
            wage = site.select('.dd1')[0].text[7:-5]
            publish_time = site.select('.dd2')[0].text[7:-5]
            # Anchor looked up with class=None (presumably the one that has
            # no class attribute -- confirm against the bs4 docs).
            company_name = site.find('a', {'class': None}).text[7:-6]
            # Replace ideographic spaces (U+3000) with plain spaces.
            work_request = site.find('a').attrs['title'].replace('\u3000', ' ')
            items.append({
                '职位名称': name,
                '详情链接': detail_link,
                '工作薪酬': wage,
                '发布时间': publish_time,
                '发布公司': company_name,
                '工作要求': work_request,
            })
        self.save_file(items)

    def save_file(self, items):
        """Append ``items`` to ``job.json`` as a single JSON line.

        Each call adds exactly one line, so the file stays machine-readable
        after any number of appends: parse it line by line with
        ``json.loads``. Non-ASCII text is written as UTF-8, not escaped.

        Args:
            items: A JSON-serialisable list of job dictionaries.
        """
        print('save')
        with open('job.json', 'a', encoding='utf-8') as file:
            file.write(json.dumps(items, ensure_ascii=False))
            file.write('\n')
if __name__ == '__main__':
    # Script entry point: crawl the configured pages, then delete the output.
    import os

    Spider().load_page()
    # NOTE(review): this removes the file the crawl has just written, so no
    # results survive the run -- confirm this cleanup is intentional.
    os.remove('job.json')