import pymysql
import requests
import asyncio
import aiohttp
from fake_useragent import UserAgent
from pandas import read_excel, DataFrame
# 并发量
CONCURRENCY = 10
semaphore = asyncio.Semaphore(CONCURRENCY)
session = None
base_url = 'www.baidu.com'
# 获取ip
def get_proxy():
return requests.get("http://127.0.0.1:5010/get/").json()
# 删除ip
def delete_proxy(proxy):
requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
async def getDetail(name):
async with semaphore:
con = pymysql.connect(
host='localhost',
user='root',
passwd='xxxx',
db='xxxx',
charset='utf8'
)
c = con.cursor()
List = []
for id in List:
dic = {'': '', }
ua = UserAgent().random
headers = {
'user-agent': ua,
}
json_data = {
}
# 设置重试次数
retry_count = 3
proxy = get_proxy().get("proxy")
while True:
try:
async with session.post(base_url, proxy=f'http://{proxy}',
headers=headers, json=json_data, timeout=5) as response:
res = await response.json()
# 解析json数据
try:
dic[''] = res['']
except:
pass
print(dic)
data = DataFrame([dic])
tableName = ''
DBUtils.insert_data(DBUtils(con, c, data, tableName))
break
except Exception as e:
print(e)
retry_count -= 1
if retry_count == 0:
delete_proxy(proxy)
retry_count = 3
proxy = get_proxy().get("proxy")
c.close()
con.close()
async def main():
global session
session = aiohttp.ClientSession()
List = read_excel(r'文件路径')['列名'].values
await asyncio.gather(*[getDetail(name) for name in List])
class DBUtils:
"""
数据库工具类
"""
""":param
db: 数据库连接: db = pymysql.connect(host='192.168.1.1', user='root', password='1234', port=3306, db='database_name')
cursor: 数据库游标: cursor = db.cursor()
data: 需写入数据: Dataframe
table: 写入表名
"""
def __init__(self, db, cursor, data, table):
self.db = db
self.cursor = cursor
self.data = data
self.table = table
# 按主键去重追加更新
def insert_data(self):
self.cursor.execute(
"CREATE TABLE IF NOT EXISTS `{}`(`酒店id` int(0) NOT NULL,`酒店名称` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`品牌` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`档次` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`省份` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`城市` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`商圈` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`地址` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`经度` decimal(10, 7) NULL DEFAULT NULL,`纬度` decimal(10, 7) NULL DEFAULT NULL,`房间数` int(0) NULL DEFAULT NULL,`开业时间` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`装修时间` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`简介` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL,`评分` varchar(5) NULL DEFAULT NULL,`房型数` int(0) NULL DEFAULT NULL,`电话` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,`房型详情` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL,PRIMARY KEY (`酒店id`) USING BTREE)ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;".format(
self.table)
)
keys = ', '.join('`' + self.data.keys() + '`')
values = ', '.join(['%s'] * len(self.data.columns))
# 根据表的唯一主键去重追加更新
sql = 'INSERT INTO `{table}` ({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE'.format(table=self.table,
keys=keys,
values=values)
update = ','.join(["`{key}` = %s".format(key=key) for key in self.data])
sql += update
for i in range(len(self.data)):
try:
self.cursor.execute(sql, tuple(self.data.loc[i]) * 2)
# print('正在写入第%d条数据' % (i + 1))
self.db.commit()
except Exception as e:
print("数据写入失败,原因为:%s", e)
self.db.rollback()
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
功能:将接口爬取的数据转换成DataFrame格式后直接存入mysql数据库中
需要搭配ip代理池使用,当然如果不考虑反爬也可以不挂ip