共享单车数据爬取_共享单车数据分析

最新推荐文章于 2024-04-24 09:56:53 发布

weixin_39945531

最新推荐文章于 2024-04-24 09:56:53 发布

阅读量717

点赞数 1

文章标签：共享单车数据爬取

import pandas as pd
import numpy as np

# Maps each supported city name to the CSV file read for that city.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}

# Every city's CSV is read once, at import time.
# NOTE(review): nothing in the visible part of this file reads these three
# frames again -- confirm whether other code depends on them.
CHICAGO = pd.read_csv(CITY_DATA['chicago'])
NEW_YORK_CITY = pd.read_csv(CITY_DATA['new york city'])
WASHINGTON = pd.read_csv(CITY_DATA['washington'])

# TO DO 选择月份
def select_month(month, df):
    """Return the rows of ``df`` whose trip started in the given month.

    Args:
        month: ``'all'`` to skip filtering, otherwise a lowercase month name
            from ``'january'`` through ``'june'``.
        df: DataFrame with a ``'Start Time'`` column that ``pd.to_datetime``
            can parse.

    Returns:
        ``df`` itself when ``month`` is ``'all'``; otherwise a new DataFrame
        holding only the matching rows, with ``'Start Time'`` converted to
        datetime.

    Raises:
        ValueError: If ``month`` is neither ``'all'`` nor one of the six
            supported month names.
    """
    if month == 'all':
        return df

    months = ['january', 'february', 'march', 'april', 'may', 'june']
    month_number = months.index(month) + 1

    # Build the converted column on a new frame instead of assigning into
    # ``df``, so the caller's DataFrame is left untouched.
    converted = df.assign(**{'Start Time': pd.to_datetime(df['Start Time'])})
    return converted[converted['Start Time'].dt.month == month_number]


# TO DO  选择某一天的
def select_day(day, df):
    """Return the rows of ``df`` whose trip started on the given weekday.

    Args:
        day: ``'all'`` (any letter case) to skip filtering, otherwise an
            English weekday name such as ``'Monday'``; matching is
            case-insensitive.
        df: DataFrame with a ``'Start Time'`` column that ``pd.to_datetime``
            can parse.

    Returns:
        ``df`` itself when ``day`` means "all"; otherwise a new DataFrame
        holding only the matching rows, with ``'Start Time'`` converted to
        datetime.
    """
    # Callers in this file pass the title-cased 'All', so the sentinel is
    # compared without regard to case.
    if day.lower() == 'all':
        return df

    # Build the converted column on a new frame instead of assigning into
    # ``df``, so the caller's DataFrame is left untouched.
    converted = df.assign(**{'Start Time': pd.to_datetime(df['Start Time'])})

    # ``Series.dt.weekday_name`` was removed in pandas 1.0; ``day_name()`` is
    # its replacement and returns names such as 'Monday'.
    weekday = converted['Start Time'].dt.day_name()
    return converted[weekday == day.title()]


# TO DO 按照用户输入的条件筛选数据
def select_data(city, month, day):
    """Load one city's CSV and narrow it to the requested month and weekday.

    Args:
        city: A key of ``CITY_DATA``.
        month: ``'all'`` or a month name accepted by ``select_month``.
        day: ``'all'`` (any letter case) or a weekday name accepted by
            ``select_day``.

    Returns:
        The DataFrame read from the city's CSV file, filtered by
        ``select_month`` and ``select_day`` when those filters are requested.

    Raises:
        ValueError: If ``city`` is not a key of ``CITY_DATA``.
    """
    print("city: {}, month: {}, day:{} ".format(city, month ,day))

    # Reading a module-level name needs no ``global`` statement; that is only
    # required when rebinding it.
    csv_path = CITY_DATA.get(city)
    if csv_path is None:
        # Fail with a clear message instead of handing ``None`` to read_csv.
        raise ValueError("unknown city: {!r}".format(city))
    df_city = pd.read_csv(csv_path)

    if month != 'all':
        df_city = select_month(month, df_city)
    # Compare the sentinel case-insensitively so 'all' and 'All' both skip
    # the weekday filter.
    if day.lower() != 'all':
        df_city = select_day(day, df_city)

    return df_city
 
 
 
# TO DO Start Time 列中哪个月份最常见           
def startt(df):
    """Print the month number that occurs most often in ``'Start Time'``.

    Args:
        df: DataFrame with a ``'Start Time'`` column that ``pd.to_datetime``
            can parse.
    """
    trip_months = pd.to_datetime(df['Start Time']).dt.month
    month_counts = trip_months.value_counts()
    # value_counts() orders by descending frequency, so the first label is
    # the most frequent month.
    top_month = month_counts.index[0]
    print("在开始时间中最常见的月份是：{} 月份".format(top_month))
 
 

# TO DO Start Time 列中，一周中的哪一天最常见
def weekdayc(df):
    """Print the weekday name that occurs most often in ``'Start Time'``.

    Args:
        df: DataFrame with a ``'Start Time'`` column that ``pd.to_datetime``
            can parse.
    """
    # ``Series.dt.weekday_name`` was removed in pandas 1.0; ``day_name()`` is
    # its replacement and returns names such as 'Monday'.
    weekday = pd.to_datetime(df['Start Time']).dt.day_name()
    # value_counts() orders by descending frequency, so the first label is
    # the most frequent weekday.
    print("在开始时间中最常见的一周中的一天是：{}".format(weekday.value_counts().index[0]))
 
 

# TO DO •	起始时间中，一天当中哪个小时最常见？
def hourc(df):
    """Print the hour of day that occurs most often in ``'Start Time'``.

    Args:
        df: DataFrame with a ``'Start Time'`` column that ``pd.to_datetime``
            can parse.
    """
    start_times = pd.to_datetime(df['Start Time'])
    hour_counts = start_times.dt.hour.value_counts()
    # value_counts() orders by descending frequency, so the first label is
    # the most frequent hour.
    busiest_hour = hour_counts.index[0]
    print("在开始时间中最常见的一天中最常见的小时是：{}".format(busiest_hour))
 

 
# TO DO 计算总骑行时长以及平均骑行时长
def caculate_trip(df):
    """Print the sum and the mean of the ``'Trip Duration'`` column.

    Args:
        df: DataFrame with a ``'Trip Duration'`` column.
    """
    durations = df['Trip Duration']
    total_duration = durations.sum()
    mean_duration = durations.mean()

    print("总骑行时长是：{}".format(total_duration))
    print("平均骑行时长是：{}".format(mean_duration))
 

# TO DO 最经常出现的 开始车站 Start Station
def start_station(df):
    """Print the value that occurs most often in the ``'Start Station'`` column.

    Args:
        df: DataFrame with a ``'Start Station'`` column.
    """
    station_counts = df['Start Station'].value_counts()
    # value_counts() orders by descending frequency, so the first label is
    # the most frequent station.
    busiest = station_counts.index[0]
    print("最经常出现的开始车站是：{}".format(busiest))
 
 
 
# TO DO 最经常出现的 结束车站 End Station
def end_station(df):
    """Print the value that occurs most often in the ``'End Station'`` column.

    Args:
        df: DataFrame with an ``'End Station'`` column.
    """
    station_counts = df['End Station'].value_counts()
    # value_counts() orders by descending frequency, so the first label is
    # the most frequent station.
    busiest = station_counts.index[0]
    print("最热的结束车站是：{}".format(busiest))
 
 
 
# TO DO 计算最热门路线 ，从开始车站到结束车站
def station_line(df):
    """Print the most frequent (start station, end station) pair.

    Args:
        df: DataFrame with ``'Start Station'`` and ``'End Station'`` columns.
    """
    route_counts = df.groupby(['Start Station', 'End Station']).size()
    # Grouping by two columns makes idxmax() return a (start, end) tuple.
    origin, destination = route_counts.idxmax()
    print("最热门的路线是从 {} 到 {}".format(origin, destination))
 


# TO DO •	每种用户类型有多少人
def user_kinds_counts(df):
    """Print how many rows fall under each value of ``'User Type'``.

    Args:
        df: DataFrame with a ``'User Type'`` column.
    """
    # The header previously ended in a literal "n"; it reads as a newline
    # escape that lost its backslash, so the escape is restored here.
    print("每种用户人数:\n")
    print(df['User Type'].value_counts())
 
 

# TO DO 每种性别人数
def sex_kinds_counts(df):
    """Print how many rows fall under each value of ``'Gender'``.

    Prints a "no data" notice instead when ``df`` has no ``'Gender'`` column.

    Args:
        df: DataFrame that may or may not contain a ``'Gender'`` column.
    """
    if 'Gender' not in df:
        print(" sorry,this city has NO DATA")
        return

    gender_counts = df['Gender'].value_counts()
    print("每种性别人数:")
    print(gender_counts)


 
# TO DO 出生年份，最早，最晚，最常见统计
def birthyear_statistics(df):
    """Print the smallest, largest and most frequent ``'Birth Year'`` values.

    Prints a "no data" notice instead when ``df`` has no ``'Birth Year'``
    column.

    Args:
        df: DataFrame that may or may not contain a ``'Birth Year'`` column.
    """
    if 'Birth Year' not in df:
        print(" sorry,this city has NO DATA ")
        return

    birth_years = df['Birth Year']
    earliest = birth_years.min()
    latest = birth_years.max()
    # value_counts() orders by descending frequency, so the first label is
    # the most frequent year.
    most_common = birth_years.value_counts().index[0]

    print("出生最早的年份是：{}".format(earliest))
    print("出生最晚的年份是：{}".format(latest))
    print("出生常见的年份是：{}".format(most_common))
 
 

def _ask_choice(prompt, choices, normalize):
    """Print ``prompt`` and read lines until the normalized answer is in ``choices``."""
    while True:
        print(prompt)
        answer = normalize(input())
        if answer in choices:
            return answer


def main():
    """Run the interactive loop: ask for filters, load the data, print statistics.

    Each round first asks whether to continue; entering ``'0'`` ends the
    loop and any other input proceeds. The user is then asked for a city, a
    month and a weekday (each re-asked until valid), the matching data is
    loaded through ``select_data``, and every statistics function is run on
    the result.
    """
    cities = ('chicago', 'new york city', 'washington')
    months = ('january', 'february', 'march', 'april', 'may', 'june', 'all')
    days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday', 'Sunday', 'All')

    while True:
        print("input '0' exits ,enter '1' go on")
        if input() == '0':
            break

        # Local names deliberately differ from the module-level
        # select_month / select_day functions so they are not shadowed.
        city = _ask_choice(
            "select a city,please input one of them: 'chicago' , 'new york city' or 'washington' : ",
            cities,
            str.lower,
        )
        month = _ask_choice(
            "select a month,please input one of them: january ,february,march,april,may,june or all :",
            months,
            str.lower,
        )
        # Weekday answers are title-cased, so "all" is accepted as 'All'.
        day = _ask_choice(
            "select one day,please input one of them:Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday or all:",
            days,
            str.title,
        )

        # Filter the data according to the user's answers.
        result = select_data(city, month, day)

        # Print every statistic for the filtered data.
        startt(result)
        weekdayc(result)
        hourc(result)
        caculate_trip(result)
        start_station(result)
        end_station(result)
        station_line(result)
        user_kinds_counts(result)
        sex_kinds_counts(result)
        birthyear_statistics(result)


 
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

weixin_39945531

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
共享单车数据爬取_共享单车数据分析

import pandas as pd import numpy as np CITY_DATA = { 'chicago': 'chicago.csv', 'new york city': 'new_york_city.csv', 'washington': 'washington.csv' } CHICAGO = pd.read_c...
复制链接

扫一扫