# Main program (主程序): Lagou job-listing spider script
import requests
import json
from lxml import etree
from mysqlhelper import MysqlHelper
import time
import random
class LaGouSpider():
    """Crawl "数据分析" (data analysis) job postings from lagou.com into MySQL.

    For every city in ``area_list`` the spider reads the number of result
    pages from the HTML listing page, then requests each page from the
    ``positionAjax.json`` endpoint and inserts one row per posting through
    ``MysqlHelper``.
    """

    # Seconds to wait for any single HTTP response before giving up, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Open the database helper and set up URLs, headers and the city list."""
        self.sqlHelper = MysqlHelper()
        # One placeholder per argument of save_data (19 columns).
        self.insertSql = '''INSERT INTO lagou VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Referer": "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city=%E4%B8%8A%E6%B5%B7",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        # Popular cities to crawl.
        self.area_list = ['北京', '上海', '深圳', '广州', '杭州', '成都', '南京', '武汉', '西安', '厦门', '长沙', '苏州', '天津']
        # HTML listing page; "{}" is filled with the city name.
        self.index_url = "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city={}"
        # JSON endpoint that returns the postings for one result page.
        self.start_url = "https://www.lagou.com/jobs/positionAjax.json?"
        # Not read anywhere in this class; kept so existing attribute access
        # from outside keeps working.
        self.get_pageNum = ""
        # Cookies of the most recent listing-page visit; set by main() and
        # read by get_response().
        self.cookies = None

    def delay_time(self):
        """Sleep for a random whole number of seconds between 4 and 6."""
        time.sleep(random.randint(4, 6))

    def get_cookies(self, s, city):
        """Visit the listing page for ``city`` and return the cookies it sets.

        The request is issued through the session ``s`` so that the session
        used for the following POST has seen the listing page as well. When
        ``s`` is None the module-level ``requests.get`` is used instead.
        """
        client = s if s is not None else requests
        response = client.get(self.index_url.format(city), headers=self.headers,
                              timeout=self.REQUEST_TIMEOUT)
        return response.cookies

    def get_num(self, city):
        """Return the number of result pages for ``city`` as an int.

        Falls back to 1 when the page-count element is missing or its text
        is not a number.
        """
        response = requests.get(self.index_url.format(city), headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        html_ele = etree.HTML(response.text)
        # NOTE(review): guard in case the parser yields no tree for an
        # empty body; confirm against the lxml version in use.
        if html_ele is None:
            return 1
        pageNum = html_ele.xpath('//span[@class="span totalNum"]/text()')
        if not pageNum:
            return 1
        try:
            return int(pageNum[0])
        except ValueError:
            return 1

    def get_response(self, s, url, city, pagenum):
        """POST the search form for one result page and return the response.

        Args:
            s: session object used to send the request.
            url: endpoint to post to.
            city: city name sent as the ``city`` query parameter.
            pagenum: page number sent as the ``pn`` form field.
        """
        start_params = {
            "city": city,
            "needAddtionalResult": "false"
        }
        start_data = {
            "first": "false",
            "pn": pagenum,  # page number
            "kd": "数据分析"
        }
        return s.post(url, headers=self.headers, cookies=self.cookies,
                      params=start_params, data=start_data,
                      timeout=self.REQUEST_TIMEOUT)

    def save_data(self, city, positionName, district, companyShortName, salary, workYear, education, financeStage,
                  companySize,
                  companyLabel, positionLable, longitude, latitude, formatCreateTime, companyFullName, hitags,
                  firstType, thirdType, skillLables):
        """Insert one posting; arguments are passed to ``insertSql`` in this order."""
        data = (city, positionName, district, companyShortName, salary, workYear, education, financeStage,
                companySize, companyLabel, positionLable, longitude, latitude, formatCreateTime,
                companyFullName, hitags, firstType, thirdType, skillLables)
        self.sqlHelper.execute_modify_sql(self.insertSql, data)

    @staticmethod
    def _join_labels(value):
        """Join a list of labels with commas; return any non-list value unchanged."""
        if isinstance(value, list):
            return ",".join(value)
        return value

    @classmethod
    def _build_row(cls, city, info):
        """Turn one posting dict into the 19-tuple expected by save_data.

        The four label fields may arrive as a list or as a non-list value
        such as None; all four go through the same guard, so a missing
        company label list no longer raises TypeError.
        """
        join = cls._join_labels
        return (
            city,
            info['positionName'],
            info['district'],
            info['companyShortName'],
            info['salary'],
            info['workYear'],
            info['education'],
            info['financeStage'],  # financing stage
            info['companySize'],
            join(info['companyLabelList']),  # company labels
            join(info['positionLables']),  # position tags
            info['longitude'],
            info['latitude'],
            info['formatCreateTime'],  # publication time
            info['companyFullName'],
            join(info['hitags']),  # benefits
            info['firstType'],
            info['thirdType'],
            join(info['skillLables']),
        )

    @staticmethod
    def _extract_positions(text):
        """Return the posting list from a response body, or [] if absent.

        A body that is not JSON, or that lacks
        ``content.positionResult.result``, yields an empty list instead of
        raising, so one bad page does not abort the whole crawl.
        """
        try:
            result = json.loads(text)['content']['positionResult']['result']
        except (ValueError, KeyError, TypeError):
            return []
        return result if isinstance(result, list) else []

    def main(self):
        """Crawl every page of every configured city and store the postings."""
        for city in self.area_list:
            pageNum = int(self.get_num(city))
            self.delay_time()
            for num in range(1, pageNum + 1):
                # A fresh session per page, closed as soon as the page has
                # been fetched.
                with requests.Session() as s:
                    self.cookies = self.get_cookies(s, city)
                    self.delay_time()
                    response = self.get_response(s, self.start_url, city, num)
                    body = response.text
                print(body)
                zhaopin_list = self._extract_positions(body)
                if not zhaopin_list:
                    print("[WARN]: no positions in response", city, num)
                for info in zhaopin_list:
                    print("[INFO]:", city, num)
                    # Save the record.
                    self.save_data(*self._build_row(city, info))
                self.delay_time()
if __name__ == '__main__':
    # Script entry point: build the spider and run the full crawl.
    LaGouSpider().main()
# MysqlHelper class file (MysqlHelper类文件): contents of the mysqlhelper module
import pymysql
class MysqlHelper(object):
    """Hold one PyMySQL connection and cursor for running write statements."""

    def __init__(self, host='127.0.0.1', port=3306, user='root',
                 password='666666', database='test', charset='utf8mb4'):
        """Connect to MySQL and open a cursor.

        All arguments are optional; the defaults are the values that used
        to be hard-coded, so ``MysqlHelper()`` connects exactly as before.
        NOTE(review): the default password is embedded in source; prefer
        passing credentials in from configuration.
        """
        # Set first so close()/__del__ stay safe if connect() raises.
        self.conn = None
        self.cursor = None
        self.conn = pymysql.connect(host=host, port=port,
                                    user=user, password=password,
                                    database=database, charset=charset)
        self.cursor = self.conn.cursor()

    def execute_modify_sql(self, sql, data):
        """Execute a parameterized statement and commit it.

        Args:
            sql: statement text with ``%s`` placeholders.
            data: values bound to the placeholders.

        Raises:
            Whatever the driver raises; the transaction is rolled back
            before the exception propagates.
        """
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.conn.rollback()
            raise

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        cursor = getattr(self, 'cursor', None)
        conn = getattr(self, 'conn', None)
        # Clear the references first so a second call is a no-op.
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if conn is not None:
                conn.close()

    def __del__(self):
        """Release database resources when the helper is garbage-collected."""
        self.close()