# -*- coding: utf-8 -*-
# @Time : 2017/8/29 15:14
# @Author : z
# @File : 拉勾网.py
# @Software: PyCharm
import json
import time
from urllib.parse import urlencode

import pandas
import requests
from bs4 import BeautifulSoup


class LaGou(object):
    """Collect job postings for one search keyword from lagou.com into a spreadsheet.

    The position-search endpoint is queried page by page with POST requests;
    the ``city`` query parameter in the URL is the URL-encoded form of "北京".
    Selected fields of every posting are accumulated in ``self.list`` (header
    row first) and written to ``gg.xls`` in the working directory.
    """

    # Page numbers requested per run, inclusive (same pages as range(1, 20)).
    FIRST_PAGE = 1
    LAST_PAGE = 19

    # A page is abandoned after this many failed attempts instead of being
    # retried forever.
    MAX_RETRIES = 5
    RETRY_DELAY = 1  # seconds slept between attempts
    REQUEST_TIMEOUT = 10  # seconds; a stalled connection must not hang the run

    # Column titles, stored as the first row of ``self.list``.
    HEADER_ROW = ['公司', "福利", '地址', '岗位', '薪资', '发布时间', '学历', '工作经验']

    def __init__(self, kd='python爬虫'):
        """Store the search keyword *kd* and the fixed request settings."""
        self.url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
            'Host': 'www.lagou.com',
        }
        self.kd = kd
        self.list = []

    def parse_html(self):
        """Fetch every page, collect one row per posting, then write the file.

        ``self.list`` is rebuilt from scratch on each call so that calling
        this method twice does not duplicate the header row or earlier rows.
        A page that still fails after ``MAX_RETRIES`` attempts contributes no
        rows; the remaining pages are still fetched.
        """
        self.list = [self.HEADER_ROW[:]]
        for page in range(self.FIRST_PAGE, self.LAST_PAGE + 1):
            jobs = self._fetch_page(page)
            print('------------------------', page, '-------------------')
            for job in jobs:
                self.list.append(self._job_to_row(job))
        self.to_file(self.list)

    def _fetch_page(self, page):
        """Return the list of posting dicts for *page*, or ``[]`` on failure."""
        self.data = {'kd': self.kd, 'pn': page, 'first': 'true'}
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                response = requests.post(
                    self.url,
                    headers=self.headers,
                    data=self.data,
                    timeout=self.REQUEST_TIMEOUT,
                )
                payload = json.loads(response.text)
                # A null result is treated like an empty page.
                return payload['content']['positionResult']['result'] or []
            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                # ValueError: body is not JSON. KeyError/TypeError: the JSON
                # lacks the expected nesting. Only these are retried, so
                # Ctrl-C still interrupts the run.
                print('page', page, 'attempt', attempt, 'failed:', repr(exc))
                if attempt < self.MAX_RETRIES:
                    time.sleep(self.RETRY_DELAY)
        return []

    @staticmethod
    def _job_to_row(job):
        """Convert one posting dict into a row ordered like ``HEADER_ROW``."""
        # ``str.join`` raises TypeError on None, so a missing or null label
        # list becomes an empty string instead of aborting the whole crawl.
        labels = job.get('companyLabelList') or []
        return [
            job.get('companyFullName'),
            ','.join(labels),
            job.get('district'),
            job.get('positionName'),
            job.get('salary'),
            job.get('createTime'),
            job.get('education'),
            job.get('workYear'),
        ]

    def to_file(self, list=None):  # noqa: A002 - parameter name kept for existing callers
        """Write *list* (rows of cell values) to ``gg.xls``.

        When *list* is omitted the rows collected in ``self.list`` are used.
        The header row is written as an ordinary data row and pandas adds its
        default index and column labels, which keeps the sheet layout the
        script has always produced.
        """
        rows = self.list if list is None else list
        frame = pandas.DataFrame(rows)
        # NOTE(review): pandas selects the Excel writer from the file
        # extension; recent pandas releases may have no writer for legacy
        # ``.xls`` - confirm against the installed version before relying on it.
        frame.to_excel('gg.xls')


if __name__ == '__main__':
    LaGou().parse_html()
# 爬取拉勾网
# 最新推荐文章于 2024-04-29 13:15:03 发布