Python multiprocessing and memory growth: how to fit a large amount of data into limited memory

# import the necessary packages
import pandas as pd
import us
import numpy as np
from multiprocessing import Pool, cpu_count, Queue, Manager

# the data in one particular column was a number in that horrible Excel form
# where '12000' is '12,000', with that beautiful, useless comma in there.
# did I mention that Excel bothers me?
# instead of converting the numbers right away, we only convert them when we need to
def median_maker(column):
    return np.median([int(x.replace(',', '')) for x in column])
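As a quick sanity check, here is what median_maker does on a small comma-formatted column (the sample values below are made up for illustration):

# made-up values: commas are stripped, then the median is taken
print(median_maker(['12,000', '15,500', '9,800']))   # -> 12000.0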

# dictionary_of_dataframes contains a dataframe with information for each title; e.g. the title 'Data Scientist'
# related_title_score_df is the dataframe of information for that title; columns = ['title', 'score'],
#   where title is a similar_title and score is how closely the two are related, e.g. 'Data Analyst', 0.871
# code_title_df contains columns ['code', 'title']
# oes_data_df is a HUGE dataframe with all of the Bureau of Labor Statistics (BLS) data for a given time period (YAY FREE DATA, BOO BAD CENSUS DATA!)
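To make those shapes concrete, here is a hypothetical miniature version of the globals the function below relies on. The values and codes are invented for illustration only; in the real pipeline these frames are loaded from the title-similarity model and the BLS download, and only the column names mirror the comments above:

# illustrative stand-ins for the global dataframes described above (made-up data)
dictionary_of_dataframes = {
    'Data Scientist': pd.DataFrame({'title': ['Data Analyst', 'Statistician'],
                                    'score': [87.1, 64.0]}),
}
code_title_df = pd.DataFrame({'code': ['15-2051', '15-2041'],
                              'title': ['Data Analyst', 'Statistician']})
oes_data_df = pd.DataFrame({'code': ['15-2051'], 'area_title': ['California'],
                            'tot_emp': ['12,000'], 'a_mean': ['95,000'],
                            'a_pct10': ['60,000'], 'a_pct25': ['75,000'],
                            'a_median': ['92,000'], 'a_pct75': ['110,000'],
                            'a_pct90': ['130,000']})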

def job_title_location_matcher(title, location):
    try:
        related_title_score_df = dictionary_of_dataframes[title]
        # we limit the dataframe to only those related titles that are above
        # a previously established threshold
        related_title_score_df = related_title_score_df[related_title_score_df['score'] > 80]

        # we merge the related titles with another table and its codes
        codes_relTitles_scores = pd.merge(code_title_df, related_title_score_df)
        codes_relTitles_scores = codes_relTitles_scores.drop_duplicates()

        # merge the two dataframes by the codes
        merged_df = pd.merge(codes_relTitles_scores, oes_data_df)

        # limit the BLS data to the state we want
        all_merged = merged_df[merged_df['area_title'] == str(us.states.lookup(location).name)]

        # calculate some summary statistics for the time we want
        group_med_emp, group_mean, group_pct10, group_pct25, group_median, group_pct75, group_pct90 = \
            all_merged[['tot_emp', 'a_mean', 'a_pct10', 'a_pct25', 'a_median', 'a_pct75', 'a_pct90']].apply(median_maker)

        row = [title, location, group_med_emp, group_mean, group_pct10, group_pct25, group_median, group_pct75, group_pct90]

        # convert it all to strings so we can combine them when writing to file
        row_string = [str(x) for x in row]
        return row_string
    except:
        # if it doesn't work for a particular title/state just throw it out;
        # there are enough titles that losing a few is insignificant
        pass
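The imports above pull in Pool and cpu_count, so the matcher is meant to be fanned out across worker processes. The article does not show the driver here, so the following is only a minimal sketch of one way to do it: `titles`, `states`, and the output filename are hypothetical placeholders, and Pool.imap is used because it yields results lazily, which keeps memory bounded by streaming each row to disk as it arrives.

# minimal driver sketch (assumptions: `titles`/`states` lists and the output
# path are placeholders, not from the original article)
from itertools import product
from multiprocessing import Pool, cpu_count

def _match_one(args):
    # unpack a (title, location) pair for job_title_location_matcher
    title, location = args
    return job_title_location_matcher(title, location)

if __name__ == '__main__':
    titles = ['Data Scientist', 'Data Analyst']   # hypothetical inputs
    states = ['California', 'New York']           # hypothetical inputs
    with Pool(cpu_count()) as pool, open('bls_summaries.csv', 'w') as out:
        # imap yields rows lazily, so only a small batch is in memory at a time
        for row_string in pool.imap(_match_one, product(titles, states), chunksize=10):
            if row_string is not None:            # failed title/state pairs return None
                out.write(','.join(row_string) + '\n')

On Linux the worker processes inherit the big global dataframes via fork, and writing each row as soon as it is produced keeps the parent process's memory roughly flat instead of accumulating every result in a list.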
