Install the pymysql library
You need to install the pymysql library so that your program can connect to a MySQL database:
pip install pymysql
# Or use a domestic mirror, which is usually faster:
pip install pymysql -i https://pypi.douban.com/simple
Install MySQL and create a database
For installing MySQL, see here.
Create a database:
create database db_name charset=utf8;
use db_name;
(Note: MySQL's utf8 is a legacy 3-byte encoding; utf8mb4 is usually the safer choice if you need full Unicode support.)
In a Python program, you can connect to the database like this:

import pymysql

def connectMySql():
    # Connect to the database; the context manager closes the connection automatically when done
    with pymysql.connect(host='127.0.0.1', user='root', password='root', database='testdb') as db:
        # Create a cursor, which is what actually executes SQL statements
        with db.cursor() as cursor:
            cursor.execute('select * from users;')
            data = cursor.fetchall()
            print(data)
    print('Program finished!')

connectMySql()
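The select above assumes a users table already exists in testdb. Here is a minimal sketch that creates such a table and inserts a row using parameterized queries; the table name and columns are only assumptions for illustration, not part of the original tutorial:

import pymysql

with pymysql.connect(host='127.0.0.1', user='root', password='root', database='testdb') as db:
    with db.cursor() as cursor:
        # Hypothetical schema, just enough to make the select example return something
        cursor.execute("""
            create table if not exists users (
                id int primary key auto_increment,
                name varchar(50) not null
            );
        """)
        # Pass values through %s placeholders; pymysql escapes them safely
        cursor.execute('insert into users (name) values (%s);', ('alice',))
    db.commit()  # inserts are not persisted until committed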
That covers the basic database operations. The example below is a crawler that scrapes Alibaba's job postings:
import pymysql
import requests
import time


class SpiderALi:
    # __init__ runs automatically when an instance of the class is created
    def __init__(self):
        self.url = 'https://careers.aliyun.com/position/search'
        # Of these headers, only 'User-Agent' is really necessary; the rest
        # were copied from the browser's cURL export and can be dropped
        self.headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9",
            "bx-v": "2.5.11",
            "cache-control": "no-cache",
            "content-type": "application/json",
            "origin": "https://careers.aliyun.com",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "referer": "https://careers.aliyun.com/off-campus/position-list?lang=zh",
            "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
        }
        self.requestData = {
            'channel': 'group_official_site',
            'language': 'zh',
            'batchId': '',
            'categories': '',
            'deptCodes': [],
            'key': '',
            'pageIndex': 1,
            'pageSize': 10,
            'regions': '',
            'subCategories': '',
        }
        self.params = {
            '_csrf': '378212c5-83ef-42bb-a25c-3780f7f3ab09',
        }
        # Copy the cookies from the request headers in your browser; note that
        # requests expects a dict here, not a raw cookie string
        self.cookies = {}
        # The py_spider database must already exist; use the MySQL password you set (often root)
        self.connectMySql = pymysql.connect(host='127.0.0.1', user='root', password='root', database='py_spider')
        self.cursor = self.connectMySql.cursor()

    # __del__ is the teardown magic method; it runs automatically when the
    # instance is garbage-collected, so we close the cursor and connection here
    def __del__(self):
        print('Scraping finished, closing the database connection!')
        self.cursor.close()
        self.connectMySql.close()
    # Fetch job listings; this is a generator that yields one page of results at a time
    def get_works(self):
        for i in range(1, 53):  # 52 pages of results, hard-coded for the site at the time
            self.requestData["pageIndex"] = i
            response = requests.post(
                url=self.url,
                headers=self.headers,
                json=self.requestData,
                params=self.params,
                cookies=self.cookies
            ).json()
            yield response["content"]["datas"]
    # Create the table if it does not already exist
    def create_table(self):
        sql = """
            create table if not exists ali_works (
                id int primary key auto_increment,
                work_name varchar(100) not null,
                categories varchar(50),
                workLocations varchar(50),
                requirement text,
                description text,
                nowtime date
            );
        """
        try:
            self.cursor.execute(sql)
            print('Table created successfully!')
        except Exception as e:
            print('Failed to create table:', e)
    # Insert one row of data
    def insert_info(self, *args):
        """
        :param args: values for the columns, in table order:
            id, work_name, categories, workLocations, requirement, description, nowtime
        :return: None
        """
        sql = """
            insert into ali_works values (%s, %s, %s, %s, %s, %s, %s)
        """
        try:
            self.cursor.execute(sql, args)
            print('Row inserted successfully')
            self.connectMySql.commit()
        except Exception as e:
            print('Failed to insert row:', e)
            self.connectMySql.rollback()
    # Main entry point
    def main(self):
        self.create_table()
        time.sleep(2)
        allWorkLists = self.get_works()
        # Passing 0 for the auto_increment id column lets MySQL assign the real id
        work_id = 0
        # Get the current timestamp
        timestamp = time.time()
        # Convert it to local time
        local_time = time.localtime(timestamp)
        # Format it as a date string
        date = time.strftime('%Y-%m-%d', local_time)
        for index, workLists in enumerate(allWorkLists):
            print(f'Inserting page {index + 1}')
            time.sleep(1.5)  # throttle requests so we don't hammer the server
            for work in workLists:
                work_name = work["name"]
                categories = work["categories"][0] if work["categories"] else 'null'
                workLocations = work["workLocations"]
                requirement = work["requirement"]
                description = work["description"]
                self.insert_info(work_id, work_name, categories, workLocations, requirement, description, date)
if __name__ == '__main__':
    spiderALi = SpiderALi()
    spiderALi.main()
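Inserting and committing row by row is simple but slow. As an alternative sketch (not part of the original program, and assuming the same ali_works table as above), pymysql's cursor.executemany can send a whole page of rows in one round trip and one commit:

# A minimal sketch of batch insertion, assuming the ali_works table above
def insert_page(connection, rows):
    # rows is a list of 7-tuples: (id, work_name, categories,
    # workLocations, requirement, description, nowtime)
    sql = 'insert into ali_works values (%s, %s, %s, %s, %s, %s, %s)'
    try:
        with connection.cursor() as cursor:
            cursor.executemany(sql, rows)  # one round trip for the whole page
        connection.commit()
    except Exception as e:
        print('Batch insert failed:', e)
        connection.rollback()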