Crawler Notes 13 — Writing Scraped Web Data into a MySQL Database, Using Alibaba Recruitment as an Example

Installing the pymysql library

Install the pymysql library so that the program can connect to a MySQL database:

pip install pymysql
# Or use a domestic mirror, which is usually faster
pip install pymysql -i https://pypi.douban.com/simple
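To confirm the install worked, import the package and print its version — a quick sanity check (recent pymysql releases expose __version__):

python -c "import pymysql; print(pymysql.__version__)"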

Installing MySQL and creating a database

For installing MySQL itself, see here.
Create a database:

create database your_db_name charset=utf8;
use your_db_name;
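Note that MySQL's utf8 charset only covers 3-byte characters; if the scraped text may contain emoji or other 4-byte characters, utf8mb4 is the safer choice. For example, to create the py_spider database used by the crawler later in this post:

create database py_spider charset=utf8mb4;
use py_spider;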

In a Python program, we can connect to the database like this:

import pymysql


def connectMySql():
    # Connect to the database; the context manager closes the connection automatically when the block exits
    with pymysql.connect(host='127.0.0.1', user='root', password='root', database='testdb') as db:
        # Create a cursor, which is what actually executes SQL statements
        with db.cursor() as cursor:
            cursor.execute('select * from users;')
            data = cursor.fetchall()
            print(data)

    print('Program finished!')


connectMySql()
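The SELECT above needs no commit, but writes do: changes are only persisted after commit(), and can be undone with rollback() on failure. Here is a minimal sketch, assuming the same testdb with a hypothetical users table that has name and age columns:

import pymysql


def insert_users():
    with pymysql.connect(host='127.0.0.1', user='root', password='root', database='testdb') as db:
        with db.cursor() as cursor:
            # Parameterized SQL: pymysql escapes the values, preventing SQL injection
            sql = 'insert into users (name, age) values (%s, %s)'
            try:
                # executemany runs the statement once per tuple
                cursor.executemany(sql, [('Tom', 20), ('Jerry', 18)])
                db.commit()  # persist the writes
            except Exception as e:
                db.rollback()  # undo the partial write on error
                print('Insert failed:', e)


insert_users()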

That covers the basic database connection operations. The example below is a crawler that scrapes Alibaba recruitment job listings and writes them to MySQL:

import pymysql
import requests
import time


class SpiderALi:
    # __init__ runs automatically when an instance of the class is created
    def __init__(self):
        self.url = 'https://careers.aliyun.com/position/search'
        # Only 'User-Agent' is really needed; the other headers were copied from cURL and can be dropped
        self.headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9",
            "bx-v": "2.5.11",
            "cache-control": "no-cache",
            "content-type": "application/json",
            "origin": "https://careers.aliyun.com",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "referer": "https://careers.aliyun.com/off-campus/position-list?lang=zh",
            "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
        }
        self.requestData = {
            'channel': 'group_official_site',
            'language': 'zh',
            'batchId': '',
            'categories': '',
            'deptCodes': [],
            'key': '',
            'pageIndex': 1,
            'pageSize': 10,
            'regions': '',
            'subCategories': '',
        }
        self.params = {
            '_csrf': '378212c5-83ef-42bb-a25c-3780f7f3ab09',
        }
        # Paste the cookies from the browser's request headers here as a dict
        # (requests expects a dict or CookieJar; a raw Cookie string belongs in self.headers instead)
        self.cookies = {}
        # Use the password you set during installation (often root)
        self.connectMySql = pymysql.connect(host='127.0.0.1', user='root', password='root', database='py_spider')
        self.cursor = self.connectMySql.cursor()
	
    # Lifecycle hook: this magic method runs automatically when the instance is destroyed
    def __del__(self):
        print('Crawl finished, closing the database connection!')
        self.cursor.close()
        self.connectMySql.close()
	
    # Fetch job listings page by page; yield makes this a generator
    def get_works(self):
        for i in range(1, 53):  # 52 pages of results at the time of writing
            self.requestData["pageIndex"] = i
            response = requests.post(
                url=self.url,
                headers=self.headers,
                json=self.requestData,
                params=self.params,
                cookies=self.cookies
            ).json()
            yield response["content"]["datas"]
	
    # Create the table if it does not already exist
    def create_table(self):
        sql = """
            create table if not exists ali_works (
                id int primary key auto_increment,
                work_name varchar(100) not null,
                categories varchar(50),
                workLocations varchar(50),
                requirement text,
                description text,
                nowtime date
            );
        """
        try:
            self.cursor.execute(sql)
            print('Table created successfully!')
        except Exception as e:
            print('Table creation failed:', e)

    # Insert one row of job data
    def insert_info(self, *args):
        """
        :param args: values matching the ali_works columns:
        id, work_name, categories, workLocations, requirement, description, nowtime
        :return: None
        """
        sql = """
            insert into ali_works values (%s, %s, %s, %s, %s, %s, %s)
        """
        try:
            self.cursor.execute(sql, args)
            print('Row inserted successfully')
            self.connectMySql.commit()
        except Exception as e:
            print('Insert failed:', e)
            self.connectMySql.rollback()
	
    # Main routine: create the table, then page through the results and insert each job
    def main(self):
        self.create_table()
        time.sleep(2)
        allWorkLists = self.get_works()
        # Get the current timestamp
        timestamp = time.time()
        # Convert it to local time
        local_time = time.localtime(timestamp)
        # Format it as a date string
        date = time.strftime('%Y-%m-%d', local_time)
        for index, workLists in enumerate(allWorkLists):
            print(f'Inserting data from page {index + 1}')
            time.sleep(1.5)
            for work in workLists:
                work_name = work["name"]
                categories = work["categories"][0] if work["categories"] else 'null'
                workLocations = work["workLocations"]
                requirement = work["requirement"]
                description = work["description"]
                # Passing None for the auto_increment id column lets MySQL assign it
                self.insert_info(None, work_name, categories, workLocations, requirement, description, date)


if __name__ == '__main__':
    spiderALi = SpiderALi()
    spiderALi.main()
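After a run finishes, a quick query confirms the rows landed — a sketch against the same py_spider database and ali_works table:

import pymysql

with pymysql.connect(host='127.0.0.1', user='root', password='root', database='py_spider') as db:
    with db.cursor() as cursor:
        cursor.execute('select count(*) from ali_works;')
        print('Rows in ali_works:', cursor.fetchone()[0])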