title: 背景
通过用户表的用户id获取接口数据,接口只提供了根据id查询的功能
没有使用 async 的原因:不便控制并发量,而且速度也不够快;速度的瓶颈在于访问接口,同时对接口的访问频率也不能太高,因此改用固定线程数的线程池(ThreadPoolExecutor)。
1、导入的包
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import pandas as pd
from sqlalchemy import create_engine
from retry import retry
from tqdm import tqdm
2、网络请求部分
# Endpoint that do_query posts to (the host part is a placeholder).
url = "http://******/api/query"

# Query-string arguments attached to every request.
params = dict(client="***")

# HTTP headers attached to every request (authorization value is a placeholder).
headers = dict(authorization="******")
@retry(tries=2, delay=15)
def do_query(user_id, timeout=30):
    """Fetch today's API data for one user and return it as a flat DataFrame.

    Args:
        user_id: Identifier sent to the API as the ``UID`` field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely and the retry decorator would never get a chance
            to run.

    Returns:
        A DataFrame built by ``pd.json_normalize`` from the ``DATA`` entry
        of the JSON response.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the response has no ``DATA`` entry.

    Any exception triggers the ``@retry`` decorator (tries=2, delay=15)
    before it propagates to the caller.
    """
    payload = {
        "UID": user_id,
        "DATE": time.strftime('%Y-%m-%d'),
    }
    response = requests.post(
        url,
        json=payload,
        headers=headers,
        params=params,
        timeout=timeout,
    )
    # Surface HTTP errors explicitly instead of trying to parse an error page.
    response.raise_for_status()
    item_data = json.loads(response.text)["DATA"]
    return pd.json_normalize(item_data)
3、创建数据库连接
@retry(tries=2, delay=15)
def get_engine():
    """Create the SQLAlchemy engine for the database.

    The connection settings below are empty placeholders and must be filled
    in before the script is run.

    Returns:
        The engine object returned by ``create_engine``.
    """
    # NOTE(review): create_engine normally defers connecting until first use,
    # so this retry probably only covers URL/dialect errors - confirm.
    username = ''
    password = ''
    host = ''
    port = ''
    connection_url = f'***://{username}:{password}@{host}:{port}'
    return create_engine(connection_url)
4、数据入库
def to_db(user_id):
    """Query the API for one user and append the result to the database.

    Args:
        user_id: Identifier passed to ``do_query`` and stored in the ``UID``
            column of every written row.

    Returns:
        Whatever ``DataFrame.to_sql`` returns for the insert, or ``0`` when
        the API returned no rows and nothing was written.

    Relies on the module-level ``engine`` created in the ``__main__`` section.
    """
    item_df = do_query(user_id)
    if item_df.shape[0] == 0:
        # No rows for this user: skip the database round-trip. Writing a
        # zero-row frame could also create the target table with only the
        # UID/DATE columns if the table does not exist yet.
        return 0
    item_df['UID'] = user_id
    item_df['DATE'] = time.strftime('%Y-%m-%d')
    return item_df.to_sql(name='***', schema='***', if_exists='append', index=False, con=engine)
5、主程序
if __name__ == '__main__':
    # Script entry point: read the user ids, process each one on a pool of
    # 20 worker threads, and show progress with tqdm.
    engine = get_engine()  # module-level name; to_db reads it from the workers
    sql = '''
查询查询用户id
'''
    df = pd.read_sql(sql=sql, con=engine)

    # The first column of the result holds the user ids. tolist() converts
    # NumPy scalars to native Python values so they can be JSON-encoded in
    # the request payload.
    user_ids = df.iloc[:, 0].tolist()

    failures = []
    with (
        tqdm(total=len(user_ids), unit='row', desc='Processing') as pbar,
        ThreadPoolExecutor(max_workers=20) as executor
    ):
        future_to_uid = {executor.submit(to_db, uid): uid for uid in user_ids}
        for future in as_completed(future_to_uid):
            # Inspect every future: an exception raised inside a worker is
            # otherwise stored on the future and never seen.
            error = future.exception()
            if error is not None:
                failures.append((future_to_uid[future], error))
            pbar.update(1)

    # Report failures after the progress bar is closed so the output stays readable.
    for uid, error in failures:
        print(f'Failed UID {uid}: {error!r}')
    print('Done!')
6、输出日志
Processing: 40%|███▉ | 4000/10000 [2:30:00<2:35:20, 1.28row/s]