Async & Concurrency
Background: given a large DataFrame where every row needs an API lookup, the straightforward apply-based version below is very slow.
# Original version: apply with a lambda
import requests

def req_p(p1):
    req_url = "https://xxxxx" + p1 + "xxxx"
    res = requests.get(req_url)
    return res

temp['a'] = temp.p.apply(lambda x: req_p(x))
Improvement: every requests.get call blocks, so rows are processed strictly one at a time. Switching to async I/O with concurrent requests made this roughly 100x faster in practice.
import asyncio
import aiohttp
import nest_asyncio
from tqdm.asyncio import tqdm as tqdm_asyncio

nest_asyncio.apply()  # allow asyncio.run() inside an already-running loop (e.g. Jupyter)

async def req_p(session, p):
    req_url = "https://xxxxx" + p + "xxxx"
    async with session.get(req_url) as res:
        return await res.text()  # read the body before the response is closed

async def parallel_process(df, func):
    async with aiohttp.ClientSession() as session:
        tasks = [func(session, row['p']) for _, row in df.iterrows()]
        # gather with a live progress bar; results keep the task order
        results = await tqdm_asyncio.gather(*tasks)
    return results

async def main():
    temp['a'] = await parallel_process(temp, req_p)

asyncio.run(main())
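If the DataFrame has tens of thousands of rows, firing every request at once can exhaust file descriptors or trigger server rate limits. Below is a minimal sketch of a bounded variant, assuming the req_p coroutine above; the limit of 50 is an arbitrary illustration. An alternative is to cap connections at the session level with aiohttp.TCPConnector(limit=...).

import asyncio
import aiohttp

async def parallel_process_bounded(df, func, limit=50):
    sem = asyncio.Semaphore(limit)  # create the semaphore inside the running loop

    async def bounded(session, p):
        async with sem:             # wait for a free slot before issuing the request
            return await func(session, p)

    async with aiohttp.ClientSession() as session:
        tasks = [bounded(session, row['p']) for _, row in df.iterrows()]
        return await asyncio.gather(*tasks)

# usage: temp['a'] = await parallel_process_bounded(temp, req_p)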
Converting between a PySpark DataFrame and a pandas DataFrame
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Create a Spark DataFrame from a Pandas DataFrame using Arrow
df = spark.createDataFrame(pdf)
# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = df.toPandas()
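For context, a minimal end-to-end sketch of the round trip (assumes a working PySpark install; the DataFrame contents are illustrative):

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow-demo").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({"bike_no": ["b001", "b002"], "infos": ["ok", "repair"]})
df = spark.createDataFrame(pdf)   # pandas -> Spark, transferred via Arrow
result_pdf = df.toPandas()        # Spark -> pandas, transferred via Arrow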
# For a DataFrame with these two columns, the three methods below each build a dict keyed by the bike_no values with infos as the values.
# Method 1: loop over the DataFrame with iterrows and fill the dict
# sch_in_dict2 = {}
# for index, row in sch_in_df.iterrows():
#     sch_in_dict2[row['bike_no']] = row['infos']

# Method 2: set bike_no as the index, transpose, then to_dict()
# (note: to_dict('list') wraps each value in a one-element list)
# sch_in_dict3 = sch_in_df.set_index("bike_no").T.to_dict('list')
# Method 3: turn each column into a list, then zip them into a dict
bike_no_list = list(sch_in_df.bike_no)
infos_list = list(sch_in_df.infos)
sch_in_dict = dict(zip(bike_no_list, infos_list))
In a head-to-head comparison, Method 3 was the fastest, taking roughly 1/20 the time of the other two.
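A rough way to reproduce the comparison (a sketch with synthetic data; the DataFrame size and the exact ratio will vary with your data):

import time
import pandas as pd

sch_in_df = pd.DataFrame({"bike_no": range(100_000), "infos": range(100_000)})

def timeit(label, fn):
    t0 = time.perf_counter()
    fn()
    print(f"{label}: {time.perf_counter() - t0:.3f}s")

def method1():
    d = {}
    for _, row in sch_in_df.iterrows():
        d[row['bike_no']] = row['infos']

def method2():
    sch_in_df.set_index("bike_no").T.to_dict('list')

def method3():
    dict(zip(list(sch_in_df.bike_no), list(sch_in_df.infos)))

timeit("method1 (iterrows)", method1)
timeit("method2 (T.to_dict)", method2)
timeit("method3 (zip)", method3)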