Data, step one: crawling Lagou job postings into a database

Part 1. First, import the libraries we need: requests, pymysql, sqlalchemy, plus the sessionmaker factory from sqlalchemy.orm.

import requests
import pymysql
import sqlalchemy as sq
from sqlalchemy.orm import sessionmaker

Part 2. Define the configuration class. For now all settings live in this class; later they could be moved into a config file with the configparser library (see the sketch after the class).

class Config:

    kd = 'python'  # search keyword
    referer = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput='
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',  # accept JSON responses
        'Referer': referer,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'}
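As mentioned above, these settings could later live in a config file. A minimal sketch using the standard configparser module follows; the file name lagou.ini, the [lagou] section, and its option names are assumptions, not part of the original code.

# A minimal sketch of moving the settings above into an INI file with configparser.
# The file name lagou.ini and the [lagou] section are assumptions.
import configparser

parser = configparser.ConfigParser()
parser.read('lagou.ini', encoding='utf-8')   # file would contain e.g.: [lagou]  kd = python  referer = https://...

class Config:
    kd = parser.get('lagou', 'kd')
    referer = parser.get('lagou', 'referer')
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': referer,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'}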

Part 3. Define the class that scrapes the data.

class Spider:

1. Create the constructor that requests the site, pulling its settings from the Config class.

def __init__(self, kd=Config.kd):
    self.kd = kd
    self.url = Config.referer
    self.api = 'https://www.lagou.com/jobs/positionAjax.json'

    # The referer URL must be requested first (so the session picks up the cookies)
    self.sess = requests.session()
    self.sess.get(self.url, headers=Config.headers)
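The extra GET matters because Lagou's positionAjax API rejects requests that do not carry the cookies set by the search page. To confirm the session has picked them up, a quick, purely illustrative check:

# Purely illustrative: confirm the search page set cookies on the session
spider = Spider()
print(spider.sess.cookies.get_dict())   # a non-empty dict means the cookies are in place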

2. Send the request.

def get_position(self, pn):
    data = {'first': 'true',
            'pn': str(pn),
            'kd': self.kd
            }
    # POST the query to the JSON API
    r = self.sess.post(self.api, headers=Config.headers, data=data)

    # Parse the JSON response directly with .json()
    return r.json()['content']['positionResult']['result']
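One caveat: when Lagou throttles a client, the JSON body may not contain content at all, and the return line above raises a KeyError. A more defensive sketch is shown below; the assumption that a missing 'content' key means throttling, and the 'msg' field, are guesses about the response shape, not guaranteed.

# A defensive sketch of get_position; assumes a missing 'content' key means the request was throttled.
def get_position_safe(self, pn):
    data = {'first': 'true', 'pn': str(pn), 'kd': self.kd}
    r = self.sess.post(self.api, headers=Config.headers, data=data)
    payload = r.json()
    content = payload.get('content')
    if not content:
        print('no results for page %s, server said: %s' % (pn, payload.get('msg')))
        return []
    return content['positionResult']['result']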

3. Process the results into a list of records; I keep three fields: salary, years of experience, and position title (see the sample element after the code).

def engine(self, total_pn):
    dataList = []
    for pn in range(1, total_pn + 1):
        results = self.get_position(pn)
        for item in results:
            dataList.append({"salary": item['salary'],
                             "work_year": item['workYear'],
                             "username": item['positionName']})
    return dataList
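For reference, one element of dataList looks roughly like this (values purely illustrative):

# Purely illustrative sample element of dataList:
# {'salary': '15k-25k', 'work_year': '3-5年', 'username': 'Python开发工程师'}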

Part 4. Create the database table and write the data into it.
1. Connect to the database.

class Data_Transport(object):
    def __init__(self):
        # If you hit "'latin-1' codec can't encode characters in position 61-72: ordinal not in range(256)",
        # add ?charset=utf8mb4 to the connection URL as done here
        engine = sq.create_engine('mysql+pymysql://user:password@host:port/dbname?charset=utf8mb4')
        self.session = sessionmaker(bind=engine)()
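Before creating tables it can help to confirm the connection string is right. A quick check, assuming a pre-2.0 SQLAlchemy that accepts raw SQL strings (as the rest of this post does):

# Sanity check for the connection; assumes pre-2.0 SQLAlchemy (raw SQL strings accepted)
dt = Data_Transport()
print(dt.session.execute('SELECT 1').scalar())   # prints 1 if the connection works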

2. Create the table (the "model").

    def create_table(self):
        sql = "create table if not exists lagoujob(id int auto_increment primary key," \
              "username CHAR (255) not null," \
              "work_year CHAR (255) not null," \
              "salary CHAR (20) not null) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci"
        self.session.execute(sql)
        self.session.commit()
        # create the table
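The table is created here with raw DDL. For comparison, a rough ORM sketch of the same table using SQLAlchemy's declarative base (not what this post's code actually runs) would be:

# Rough declarative-ORM equivalent of the raw DDL above - a sketch, not used by the rest of this post.
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base   # sqlalchemy.orm.declarative_base in 1.4+

Base = declarative_base()

class LagouJob(Base):
    __tablename__ = 'lagoujob'
    id = Column(Integer, primary_key=True, autoincrement=True)
    username = Column(String(255), nullable=False)
    work_year = Column(String(255), nullable=False)
    salary = Column(String(20), nullable=False)

# Base.metadata.create_all(engine) would then issue the CREATE TABLE for us.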

3. Insert a row with the given values.

    def data_save(self, name, work_year, salary):
        # Note: unescaped quotes in the values can raise
        # pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax ...")
        sql = """INSERT INTO lagoujob (username, work_year, salary) VALUES("%s", "%s", "%s")""" % (
            name, work_year, salary)  # plain string-formatted SQL INSERT
        try:
            self.session.execute(sql)
            self.session.commit()
            print('insert succeeded')
        except Exception as e:
            self.session.rollback()  # leave the session usable after a failed insert
            print('insert failed:', e)
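Building the INSERT with % formatting works for simple values but breaks on quotes and is open to SQL injection. A safer sketch using bound parameters via sqlalchemy.text (same effect as data_save above):

# A parameterised sketch of data_save using sqlalchemy.text with bound parameters.
from sqlalchemy import text

def data_save(self, name, work_year, salary):
    sql = text("INSERT INTO lagoujob (username, work_year, salary) "
               "VALUES (:name, :work_year, :salary)")
    try:
        self.session.execute(sql, {'name': name, 'work_year': work_year, 'salary': salary})
        self.session.commit()
        print('insert succeeded')
    except Exception as e:
        self.session.rollback()
        print('insert failed:', e)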

4. Close the database session.

    def tear_down(self):
        self.session.close()
        # close the session

Part 5. Call everything and run.

if __name__ == '__main__':
    lagou = Spider()
    dataList = lagou.engine(2)
    huoqu = Data_Transport()
    huoqu.create_table()
    for val in dataList:
        huoqu.data_save(name=val["username"], work_year=val["work_year"], salary=val["salary"])
        # loop over the scraped records and insert them one by one
    huoqu.tear_down()
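Once the run finishes, a quick way to confirm the rows landed (same connection settings and pre-2.0 SQLAlchemy assumed):

# Quick check after the run: count the rows just inserted
checker = Data_Transport()
print(checker.session.execute('SELECT COUNT(*) FROM lagoujob').scalar())
checker.tear_down()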

Part 6. Complete code.

import requests
import pymysql
import sqlalchemy as sq
from sqlalchemy.orm import sessionmaker

class Config:

    kd = 'python'  # search keyword
    referer = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput='
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',  # accept JSON responses
        'Referer': referer,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'}

"""爬取数据"""
class Spider:

    def __init__(self, kd=Config.kd):
        self.kd = kd
        self.url = Config.referer
        self.api = 'https://www.lagou.com/jobs/positionAjax.json'

        # The referer URL must be requested first (so the session picks up the cookies)
        self.sess = requests.session()
        self.sess.get(self.url, headers=Config.headers)

    def get_position(self, pn):
        data = {'first': 'true',
                'pn': str(pn),
                'kd': self.kd
                }
        # POST the query to the JSON API
        r = self.sess.post(self.api, headers=Config.headers, data=data)

        # Parse the JSON response directly with .json()
        return r.json()['content']['positionResult']['result']

    def engine(self, total_pn):
        dataList = []
        for pn in range(1, total_pn + 1):
            results = self.get_position(pn)
            for item in results:
                dataList.append({"salary": item['salary'],
                                 "work_year": item['workYear'],
                                 "username": item['positionName']})
        return dataList

class Data_Transport(object):
    def __init__(self):
        # If you hit "'latin-1' codec can't encode characters in position 61-72: ordinal not in range(256)",
        # add ?charset=utf8mb4 to the connection URL as done here
        engine = sq.create_engine('mysql+pymysql://user:password@host:port/dbname?charset=utf8mb4')
        self.session = sessionmaker(bind=engine)()

    def create_table(self):
        sql = "create table if not exists lagoujob(id int auto_increment primary key," \
              "username CHAR (255) not null," \
              "work_year CHAR (255) not null," \
              "salary CHAR (20) not null) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci"
        self.session.execute(sql)
        self.session.commit()
        # create the table

    def data_save(self, name, work_year, salary):
        # Note: unescaped quotes in the values can raise
        # pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax ...")
        sql = """INSERT INTO lagoujob (username, work_year, salary) VALUES("%s", "%s", "%s")""" % (
            name, work_year, salary)
        try:
            self.session.execute(sql)
            self.session.commit()
            print('insert succeeded')
        except Exception as e:
            self.session.rollback()  # leave the session usable after a failed insert
            print('insert failed:', e)

    def tear_down(self):
        self.session.close()
        # close the session



if __name__ == '__main__':
    lagou = Spider()
    dataList = lagou.engine(2)
    huoqu = Data_Transport()
    huoqu.create_table()
    for val in dataList:
        huoqu.data_save(name=val["username"], work_year=val["work_year"], salary=val["salary"])
    huoqu.tear_down()