import json
import math
import urllib
from urllib import parse, request

# Request headers. The Cookie / X-Anit-Forge-* values were captured from a
# real browser session; Lagou's Ajax endpoint rejects requests that lack a
# matching Referer/Cookie pair. NOTE(review): these session values expire —
# refresh them from a live browser session before running.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Content-Length': '25',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'user_trace_token=20180702155921-d3d20412-7dcd-11e8-bccb-525400f775ce; LGUID=20180702155921-d3d2078c-7dcd-11e8-bccb-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; JSESSIONID=ABAAABAAAIAACBI79C85F71B2CEC5CEF072374DD0B0E6BF; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530518359,1530523033,1530578881; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530578881; LGSID=20180703084805-bf1518ef-7e5a-11e8-98e2-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_Java%3Fcity%3D%25E5%25B9%25BF%25E5%25B7%259E%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; LGRID=20180703084805-bf151b78-7e5a-11e8-98e2-5254005c3644; _ga=GA1.2.947372204.1530518359; _gid=GA1.2.1007997539.1530519627; SEARCH_ID=05d0d1e544af4a5e9c0dfe21533df3f9',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%B9%BF%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4482.400 QQBrowser/9.7.13001.400',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}


def getPageNum(kw):
    """Return the number of result pages for search keyword *kw*.

    POSTs one query (page 1) to Lagou's positionAjax endpoint with the city
    fixed to Guangzhou, reads totalCount and pageSize from the JSON
    response, and returns ceil(totalCount / pageSize).
    """
    # Ajax route with the city pre-filled (%E5%B9%BF%E5%B7%9E = Guangzhou).
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&city=%E5%B9%BF%E5%B7%9E&needAddtionalResult=false'
    # Form data for the first results page.
    data = {
        'first': 'true',
        'pn': '1',
        'kd': kw
    }
    # URL-encode the form body; supplying `data` makes this a POST request.
    data = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url, data=data, headers=headers)
    response = urllib.request.urlopen(req).read().decode('utf-8')
    data = json.loads(response)
    # Total number of matching positions.
    jobnum = data['content']['positionResult']['totalCount']
    print(jobnum)
    # Positions shown per page.
    pagesize = data['content']['pageSize']
    print(pagesize)
    # Pages needed to cover every position.
    totalpage = math.ceil(jobnum / pagesize)
    print(totalpage)
    return int(totalpage)


def getJobInfo(kw, pagenum):
    """Fetch pages 1..pagenum of results for *kw* and append each job's
    fields to pythonJob.txt (one tuple repr per line)."""
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&city=%E5%B9%BF%E5%B7%9E&needAddtionalResult=false'
    for i in range(1, pagenum + 1):
        data = {
            'first': 'true',
            'pn': i,
            'kd': kw
        }
        data = urllib.parse.urlencode(data).encode('utf-8')
        req = urllib.request.Request(url, data=data, headers=headers)  # POST
        response = urllib.request.urlopen(req).read().decode('utf-8')
        data = json.loads(response)
        joblist = data['content']['positionResult']['result']
        # Open once per page rather than once per job; append mode keeps
        # earlier pages' output intact.
        with open('pythonJob.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            for job in joblist:
                city = job['city']
                companyFullName = job['companyFullName']
                # BUG FIX: the original assigned the literal list
                # ['companyLabelList'] instead of reading the job field.
                companyLabelList = job['companyLabelList']
                companyShortName = job['companyShortName']
                companySize = job['companySize']
                district = job['district']
                education = job['education']
                firstType = job['firstType']
                hitags = job['hitags']
                positionAdvantage = job['positionAdvantage']
                # 'positionLables' is Lagou's own (misspelled) JSON key.
                positionLables = job['positionLables']
                print(city, companyFullName, companyLabelList, companySize,
                      district, education, firstType, hitags,
                      positionAdvantage, positionLables)
                f.write(str((city, companyFullName, companyLabelList,
                             companySize, district, education, firstType,
                             hitags, positionAdvantage, positionLables)) + '\n')
                # Flush so partial results survive an interrupted run.
                f.flush()


if __name__ == '__main__':
    totalpage = getPageNum('python')
    getJobInfo('python', totalpage)
爬虫(5):爬取拉勾网数据
最新推荐文章于 2021-11-22 22:01:03 发布