一,先导入我们所需要的库:这里需要 requests、pymysql、sqlalchemy 三个库,以及 sqlalchemy.orm 中的 sessionmaker 方法。
import requests
import pymysql
import sqlalchemy as sq
from sqlalchemy.orm import sessionmaker
二,先定义我们的配置类:配置项暂时都写在这个类里面,后期可以用 configparser 库把它们放进单独的配置文件中。
class Config:
    """Static settings shared by the spider: search keyword, referer URL and HTTP headers."""

    # Search keyword sent to the job-search API.
    kd = 'python'
    # Listing page that is requested first and also sent as the Referer header.
    # NOTE(review): the path segment is the URL-encoded keyword "数据分析",
    # not `kd` -- confirm whether the two are meant to agree.
    referer = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput='
    headers = {
        # Response content types the client accepts.
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': referer,
        # Browser identification string (the original author tagged it "360").
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    }
三,在这里定义爬取数据的类
class Spider:
    """Fetch job postings from the Lagou position-search API and flatten them."""

    # Step 1: set up the HTTP session, taking the settings from Config.
    def __init__(self, kd=None, timeout=10):
        """Open a session and request the referer page.

        Args:
            kd: Search keyword; ``None`` (the default) means ``Config.kd``,
                looked up when the spider is created.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.kd = Config.kd if kd is None else kd
        self.url = Config.referer
        self.headers = Config.headers
        self.timeout = timeout
        self.api = 'https://www.lagou.com/jobs/positionAjax.json'
        # The referer page must be requested before the API is called
        # (presumably so the session picks up the site's cookies).
        self.sess = requests.session()
        self.sess.get(self.url, headers=self.headers, timeout=self.timeout)

    # Step 2: send the search request.
    def get_position(self, pn):
        """Return the result entries of search page ``pn``.

        Args:
            pn: Page number; it is sent to the API as a string.

        Returns:
            The value found at ``content.positionResult.result`` in the JSON
            response.

        Raises:
            KeyError: If the JSON response lacks one of those keys.
        """
        payload = {'first': 'true', 'pn': str(pn), 'kd': self.kd}
        # The search API is queried with a form-encoded POST.
        r = self.sess.post(self.api, headers=self.headers, data=payload,
                           timeout=self.timeout)
        return r.json()['content']['positionResult']['result']

    # Step 3: collect the fields of interest: salary, years of experience, job title.
    def engine(self, total_pn):
        """Fetch pages ``1..total_pn`` and return one dict per posting.

        Args:
            total_pn: Number of pages to fetch; values below 1 fetch nothing.

        Returns:
            A list of dicts with the keys ``salary``, ``work_year`` and
            ``username`` (``username`` holds the position name).
        """
        dataList = []
        for pn in range(1, total_pn + 1):
            dataList.extend(
                {
                    "salary": job['salary'],
                    "work_year": job['workYear'],
                    "username": job['positionName'],
                }
                for job in self.get_position(pn)
            )
        return dataList
四,这里创建我们的数据表并写入数据
1,连接数据库
class Data_Transport(object):
    """Store scraped job rows in the MySQL table ``lagoujob`` via a SQLAlchemy session."""

    def __init__(self, url='mysql+pymysql://用户名:密码@数据库ip:接口/数据库名?charset=utf8mb4'):
        """Create the engine and open a session.

        Args:
            url: SQLAlchemy database URL. The default is only a template
                (username:password@host:port/database) and must be replaced
                with real connection details.
        """
        # charset=utf8mb4 is requested on the connection; the original author
        # hit "'latin-1' codec can't encode characters" here (presumably when
        # writing Chinese text over a latin-1 connection -- TODO confirm).
        engine = sq.create_engine(url)
        self.session = sessionmaker(bind=engine)()

    # Step 2: create the table.
    def create_table(self):
        """Create the ``lagoujob`` table if it does not exist yet, then commit."""
        ddl = (
            "create table if not exists lagoujob(id int auto_increment primary key,"
            "username CHAR (255) not null,"
            "work_year CHAR (255) not null,"
            "salary CHAR (20) not null) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci"
        )
        # Raw SQL is wrapped in text() so that it is an executable statement
        # for the session.
        self.session.execute(sq.text(ddl))
        self.session.commit()

    # Step 3: insert one row.
    def data_save(self, name, work_year, salary):
        """Insert one posting and commit; print the outcome.

        Failures are reported on stdout and swallowed so that one bad row
        does not stop the caller's loop.

        Args:
            name: Position name, stored in the ``username`` column.
            work_year: Required experience text.
            salary: Salary text.
        """
        # Values are passed as bound parameters instead of being formatted
        # into the SQL text: scraped strings may contain quotes, which break
        # the statement (the original author noted MySQL error 1064 here) and
        # allow SQL injection.
        stmt = sq.text(
            "INSERT INTO lagoujob (username, work_year, salary) "
            "VALUES (:username, :work_year, :salary)"
        )
        row = {'username': name, 'work_year': work_year, 'salary': salary}
        try:
            self.session.execute(stmt, row)
            self.session.commit()
        except Exception as exc:
            # Roll back so the session stays usable for the next row.
            self.session.rollback()
            print('插入失败', exc)
        else:
            print('插入cg')

    # Step 4: close the database session.
    def tear_down(self):
        """Close the session."""
        self.session.close()
五,调用函数
if __name__ == '__main__':
    # Scrape the first two result pages, then store every posting in MySQL.
    lagou = Spider()
    dataList = lagou.engine(2)
    huoqu = Data_Transport()
    try:
        huoqu.create_table()
        # data_save() writes a single row, so the rows are inserted one by
        # one in this outer loop.
        for val in dataList:
            huoqu.data_save(name=val["username"], work_year=val["work_year"], salary=val["salary"])
    finally:
        # Close the session even if table creation or saving raised.
        huoqu.tear_down()
六,整体代码实现
import requests
import pymysql
import sqlalchemy as sq
from sqlalchemy.orm import sessionmaker
class Config:
    """Static settings shared by the spider: search keyword, referer URL and HTTP headers."""

    # Search keyword sent to the job-search API.
    kd = 'python'
    # Listing page that is requested first and also sent as the Referer header.
    # NOTE(review): the path segment is the URL-encoded keyword "数据分析",
    # not `kd` -- confirm whether the two are meant to agree.
    referer = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput='
    headers = {
        # Response content types the client accepts.
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': referer,
        # Browser identification string (the original author tagged it "360").
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    }
"""爬取数据"""
class Spider:
    """Fetch job postings from the Lagou position-search API and flatten them."""

    def __init__(self, kd=None, timeout=10):
        """Open a session and request the referer page.

        Args:
            kd: Search keyword; ``None`` (the default) means ``Config.kd``,
                looked up when the spider is created.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.kd = Config.kd if kd is None else kd
        self.url = Config.referer
        self.headers = Config.headers
        self.timeout = timeout
        self.api = 'https://www.lagou.com/jobs/positionAjax.json'
        # The referer page must be requested before the API is called
        # (presumably so the session picks up the site's cookies).
        self.sess = requests.session()
        self.sess.get(self.url, headers=self.headers, timeout=self.timeout)

    def get_position(self, pn):
        """Return the result entries of search page ``pn``.

        Args:
            pn: Page number; it is sent to the API as a string.

        Returns:
            The value found at ``content.positionResult.result`` in the JSON
            response.

        Raises:
            KeyError: If the JSON response lacks one of those keys.
        """
        payload = {'first': 'true', 'pn': str(pn), 'kd': self.kd}
        # The search API is queried with a form-encoded POST.
        r = self.sess.post(self.api, headers=self.headers, data=payload,
                           timeout=self.timeout)
        return r.json()['content']['positionResult']['result']

    def engine(self, total_pn):
        """Fetch pages ``1..total_pn`` and return one dict per posting.

        Args:
            total_pn: Number of pages to fetch; values below 1 fetch nothing.

        Returns:
            A list of dicts with the keys ``salary``, ``work_year`` and
            ``username`` (``username`` holds the position name).
        """
        dataList = []
        for pn in range(1, total_pn + 1):
            dataList.extend(
                {
                    "salary": job['salary'],
                    "work_year": job['workYear'],
                    "username": job['positionName'],
                }
                for job in self.get_position(pn)
            )
        return dataList
class Data_Transport(object):
    """Store scraped job rows in the MySQL table ``51job`` via a SQLAlchemy session."""

    def __init__(self, url='mysql+pymysql://用户名:密码@数据库ip:接口/数据库名?charset=utf8mb4'):
        """Create the engine and open a session.

        Args:
            url: SQLAlchemy database URL. The default is only a template
                (username:password@host:port/database) and must be replaced
                with real connection details.
        """
        # charset=utf8mb4 is requested on the connection; the original author
        # hit "'latin-1' codec can't encode characters" here (presumably when
        # writing Chinese text over a latin-1 connection -- TODO confirm).
        engine = sq.create_engine(url)
        self.session = sessionmaker(bind=engine)()

    def create_table(self):
        """Create the ``51job`` table if it does not exist yet, then commit."""
        # The table name starts with a digit, so it is backtick-quoted to
        # keep it unambiguous as an identifier.
        ddl = (
            "create table if not exists `51job`(id int auto_increment primary key,"
            "username CHAR (255) not null,"
            "work_year CHAR (255) not null,"
            "salary CHAR (20) not null) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci"
        )
        # Raw SQL is wrapped in text() so that it is an executable statement
        # for the session.
        self.session.execute(sq.text(ddl))
        self.session.commit()

    def data_save(self, name, work_year, salary):
        """Insert one posting and commit; print the outcome.

        Failures are reported on stdout and swallowed so that one bad row
        does not stop the caller's loop.

        Args:
            name: Position name, stored in the ``username`` column.
            work_year: Required experience text.
            salary: Salary text.
        """
        # Values are passed as bound parameters instead of being formatted
        # into the SQL text: scraped strings may contain quotes, which break
        # the statement (the original author noted MySQL error 1064 here) and
        # allow SQL injection.
        stmt = sq.text(
            "INSERT INTO `51job` (username, work_year, salary) "
            "VALUES (:username, :work_year, :salary)"
        )
        row = {'username': name, 'work_year': work_year, 'salary': salary}
        try:
            self.session.execute(stmt, row)
            self.session.commit()
        except Exception as exc:
            # Roll back so the session stays usable for the next row.
            self.session.rollback()
            print('插入失败', exc)
        else:
            print('插入cg')

    def tear_down(self):
        """Close the session."""
        self.session.close()
if __name__ == '__main__':
    # Scrape the first two result pages, then store every posting in MySQL.
    lagou = Spider()
    dataList = lagou.engine(2)
    huoqu = Data_Transport()
    try:
        huoqu.create_table()
        # data_save() writes a single row, so the rows are inserted one by one.
        for val in dataList:
            huoqu.data_save(name=val["username"], work_year=val["work_year"], salary=val["salary"])
    finally:
        # Close the session even if table creation or saving raised.
        huoqu.tear_down()