from urllib import request
from urllib import parse
from bs4 import BeautifulSoup
import csv
# Module for handling JSON data
import json


class ZhiLianSpider(object):
    """Crawler for Zhilian (sou.zhaopin.com) job-search result pages.

    Fetches result pages ``startpage``..``endpage`` (inclusive) for the given
    location/job keywords, extracts job title, company, salary and work
    location from each listing row, and persists the collected records to a
    tab-separated text file, a CSV file and a JSON file.
    """

    def __init__(self, url, area, job, startpage, endpage):
        # Base search URL; must end with "?" so the query string can be appended.
        self.url = url
        # Work location keyword (the "jl" query parameter).
        self.area = area
        # Job-title keyword (the "kw" query parameter).
        self.job = job
        # First page to crawl.
        self.startpage = startpage
        # Last page to crawl (inclusive).
        self.endpage = endpage
        # Browser-like User-Agent so the site serves regular HTML.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML,like Gecko)Chrome/65.0.3325.181 Safari/537.36"
        }

    def handleurl(self, page):
        """Build a urllib Request object for one result page.

        :param page: page number as a string.
        :return: ``urllib.request.Request`` with the encoded query and headers.
        """
        params = {
            "jl": self.area,
            "kw": self.job,
            "p": page,
        }
        # Percent-encode the query string (the keywords may contain Chinese).
        query = parse.urlencode(params)
        return request.Request(url=self.url + query, headers=self.headers)

    def download(self, req):
        """Fetch one result page and extract the job listings.

        :param req: prepared ``Request`` from :meth:`handleurl`.
        :return: list of dicts with keys zwmc (title), gsmc (company),
                 zwyx (salary), gzdd (location).
        """
        # Bug fix: close the HTTP response deterministically (the original
        # never closed it, leaking the connection).
        with request.urlopen(req) as res:
            soup = BeautifulSoup(res, "lxml")
        # The first ".newlist" row is the table header — skip it.
        rows = soup.select("#newlist_list_content_table .newlist")[1:]
        items = []
        for row in rows:
            items.append({
                "zwmc": row.select(".zwmc div a")[0].get_text(),  # job title
                "gsmc": row.select(".gsmc a")[0].get_text(),      # company name
                "zwyx": row.select(".zwyx")[0].get_text(),        # salary
                "gzdd": row.select(".gzdd")[0].get_text(),        # work location
            })
        return items

    def start(self):
        """Crawl every configured page and write txt, csv and json outputs."""
        # Aggregate the job records from all pages.
        jobList = []
        for page in range(self.startpage, self.endpage + 1):
            jobList += self.download(self.handleurl(str(page)))

        # Plain-text dump: tab-separated fields, records separated by blank
        # lines. Bug fix: open the file once, not once per record.
        with open("幼师.txt", "a", encoding="utf-8") as fp:
            for item in jobList:
                # Each record keeps the original trailing "\t" before the
                # blank-line separator.
                fp.write("\t".join(item.values()) + "\t" + "\n\n\n")

        # CSV dump. Bug fixes: open once; newline="" as the csv module
        # requires (otherwise blank rows appear on Windows); explicit UTF-8
        # instead of the locale default with errors="ignore", which silently
        # dropped characters; write a header row when the file is new.
        fieldnames = ["zwmc", "gsmc", "zwyx", "gzdd"]
        with open("幼师.csv", "a", encoding="utf-8", newline="") as fp:
            writer = csv.DictWriter(fp, fieldnames=fieldnames)
            if fp.tell() == 0:
                writer.writeheader()
            writer.writerows(jobList)

        # JSON dump. Bug fix: ensure_ascii=False keeps Chinese text readable
        # instead of escaping every character to \uXXXX.
        with open("zhilian.json", "w", encoding="utf-8") as fp:
            json.dump(jobList, fp, ensure_ascii=False)


if __name__ == '__main__':
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx?"
    area = input("请输入工作地点:")
    job = input("请输入职位:")
    # Bug fix: prompt typo 其实页 -> 起始页 ("start page").
    startpage = int(input("请输入起始页:"))
    endpage = int(input("请输入结束页:"))
    spider = ZhiLianSpider(url, area, job, startpage, endpage)
    spider.start()
# 运行后的 CSV 文件的截图: (screenshot of the resulting CSV file — illustration only, not part of the code)