Coursera – Data Analysis and Interpretation, Week 1: ANOVA

This analysis determines whether ethnicity and the quantity of cigarettes smoked are related to each other.

import numpy

import pandas

import statsmodels.formula.api as smf

import statsmodels.stats.multicomp as multi

# Load the survey data. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
data = pandas.read_csv('nesarc.csv', low_memory=False)

# Setting the variables you will be working with to numeric.
# Series.convert_objects() no longer exists in pandas; pandas.to_numeric with
# errors='coerce' is the replacement (entries that cannot be parsed become NaN).
data['S3AQ3B1'] = pandas.to_numeric(data['S3AQ3B1'], errors='coerce')
data['S3AQ3C1'] = pandas.to_numeric(data['S3AQ3C1'], errors='coerce')
data['CHECK321'] = pandas.to_numeric(data['CHECK321'], errors='coerce')

# Subset data to young adults age 18 to 25 who have smoked in the past 12
# months. The explicit .copy() makes sub1 an independent frame, so the column
# assignments below act on sub1 itself and do not raise SettingWithCopyWarning.
sub1 = data[
    (data['AGE'] >= 18) & (data['AGE'] <= 25) & (data['CHECK321'] == 1)
].copy()

# Setting missing data: the codes 9 and 99 are replaced by NaN.
sub1['S3AQ3B1'] = sub1['S3AQ3B1'].replace(9, numpy.nan)
sub1['S3AQ3C1'] = sub1['S3AQ3C1'].replace(99, numpy.nan)

# Recoding number of days smoked in the past month: each S3AQ3B1 category is
# mapped to a days-per-month figure; values not in the mapping become NaN.
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub1['USFREQMO'] = sub1['S3AQ3B1'].map(recode1)

# Converting new variable USFREQMO to numeric.
sub1['USFREQMO'] = pandas.to_numeric(sub1['USFREQMO'], errors='coerce')

# Creating a secondary variable multiplying the days smoked/month and the
# number of cigarettes per day.
sub1['NUMCIGMO_EST'] = sub1['USFREQMO'] * sub1['S3AQ3C1']
sub1['NUMCIGMO_EST'] = pandas.to_numeric(sub1['NUMCIGMO_EST'], errors='coerce')

# Frequency table of the estimated monthly cigarette counts (NaN rows are
# excluded by groupby).
ct1 = sub1.groupby('NUMCIGMO_EST').size()
print(ct1)

# Using the ols function for calculating the F-statistic and associated
# p value. C(...) tells the formula interface to treat MAJORDEPLIFE as a
# categorical explanatory variable.
model1 = smf.ols(formula='NUMCIGMO_EST ~ C(MAJORDEPLIFE)', data=sub1)
results1 = model1.fit()
print(results1.summary())

# Keep only the two analysis columns and drop rows missing either one before
# computing the per-group descriptive statistics.
sub2 = sub1[['NUMCIGMO_EST', 'MAJORDEPLIFE']].dropna()
_by_depression = sub2.groupby('MAJORDEPLIFE')

print('means for numcigmo_est by major depression status')
m1 = _by_depression.mean()
print(m1)

print('standard deviations for numcigmo_est by major depression status')
sd1 = _by_depression.std()
print(sd1)

# Second analysis: explanatory variable ETHRACE2A (more than two levels).
# I will call the subset sub3: only the two analysis columns, with rows
# missing either value dropped.
sub3 = sub1[['NUMCIGMO_EST', 'ETHRACE2A']].dropna()

# One-way ANOVA through OLS; C(...) marks ETHRACE2A as categorical.
# Note that model2 holds the fitted results, since .fit() is chained here.
model2 = smf.ols(formula='NUMCIGMO_EST ~ C(ETHRACE2A)', data=sub3).fit()
print(model2.summary())

# The labels below previously said "by major depression status" although the
# tables are grouped by ETHRACE2A; they now name the actual grouping variable.
print('means for numcigmo_est by ethnicity')
m2 = sub3.groupby('ETHRACE2A').mean()
print(m2)

print('standard deviations for numcigmo_est by ethnicity')
sd2 = sub3.groupby('ETHRACE2A').std()
print(sd2)

# Post hoc pairwise comparisons between the ETHRACE2A groups (Tukey HSD).
mc1 = multi.MultiComparison(sub3['NUMCIGMO_EST'], sub3['ETHRACE2A'])
res1 = mc1.tukeyhsd()
print(res1.summary())

The analysis gives an F value of 25.08 with p = 0.000018, so it is safe to reject the null hypothesis.



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值