数据分析工具Pandas
Pandas的数据结构分析
Series
import pandas as pd

# A Series built from a plain list gets the default integer index 0..3.
ser_obj = pd.Series([1, 2, 3, 4])
print(ser_obj)
0 1
1 2
2 3
3 4
dtype: int64
# Rebuild the Series, this time supplying string labels instead of the
# default 0..3 integer index.
ser_obj = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
print(ser_obj)
a 1
b 2
c 3
d 4
dtype: int64
# A dict becomes a Series: keys turn into the index (insertion order),
# values into the data.
year_data = {2002: 11, 2023: 1, 2005: 17}
year_obj = pd.Series(year_data)
year_obj
2002 11
2023 1
2005 17
dtype: int64
year_obj. index
Int64Index([2002, 2023, 2005], dtype='int64')
year_obj. values
array([11, 1, 17], dtype=int64)
year_obj[ 2002 ]
11
DataFrame
import numpy as np  # numpy is used here but was never imported earlier in this session

# Mixing ints and strings in one ndarray upcasts everything to a string
# dtype, so every cell of the resulting DataFrame holds a string.
demo_arr = np.array([[1, 2, 3, 4], ['a', 'b', 'c', 'd']])
demo_arr_obj = pd.DataFrame(demo_arr)
demo_arr_obj
demo_arr_obj[ 2 ]
0 3
1 c
Name: 2, dtype: object
demo_arr_obj. 2
File "C:\Users\lenovo\AppData\Local\Temp\ipykernel_8256\527398533.py", line 1
demo_arr_obj.2
^
SyntaxError: invalid syntax
# Assigning a list to a new label appends a column on the right.
demo_arr_obj['No4'] = [2, 'k']
demo_arr_obj
# A value pulled from an object-dtype column keeps its original Python type.
element = demo_arr_obj['No4'][0]
print(type(element))
<class 'int'>
# The del statement drops column 0 in place.
del demo_arr_obj[0]
demo_arr_obj
Pandas索引操作及高级索引
索引对象
重置索引
import pandas as pd

# Series whose labels are deliberately out of alphabetical order;
# the reindex() examples below reorder them.
ser_obj = pd.Series([1, 2, 3, 4, 5], index=['c', 'd', 'a', 'b', 'e'])
ser_obj
c 1
d 2
a 3
b 4
e 5
dtype: int64
# reindex() reorders to the given labels; the unseen label 'f' becomes
# NaN, which silently promotes the dtype to float64.
ser_obj2 = ser_obj.reindex(['a', 'b', 'c', 'd', 'e', 'f'])
ser_obj2
a 3.0
b 4.0
c 1.0
d 2.0
e 5.0
f NaN
dtype: float64
# fill_value=0 plugs the missing label with 0 instead of NaN,
# so the int64 dtype is preserved.
ser_obj2 = ser_obj.reindex(['a', 'b', 'c', 'd', 'e', 'f'], fill_value=0)
ser_obj2
a 3
b 4
c 1
d 2
e 5
f 0
dtype: int64
# method='ffill' forward-fills new labels from the previous value,
# so 'g' copies the value that 'f' holds.
ser_obj3 = ser_obj2.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g'], method='ffill')
ser_obj3
a 3
b 4
c 1
d 2
e 5
f 0
g 0
dtype: int64
ser_obj2
a 3
b 4
c 1
d 2
e 5
f 0
dtype: int64
索引操作
Series 的索引操作
import numpy as np  # numpy is needed for arange() but was not imported here
import pandas as pd

# Series built from a NumPy array (values 1..5) with a string index.
arr1 = np.arange(1, 6, 1)
ser_obj = pd.Series(arr1, index=['a', 'b', 'c', 'd', 'e'])
ser_obj
a 1
b 2
c 3
d 4
e 5
dtype: int32
ser_obj[ 2 ]
3
ser_obj[ 'e' ]
5
ser_obj[ 2 : 4 ]
c 3
d 4
dtype: int32
ser_obj[ [ 0 , 2 , 4 ] ]
a 1
c 3
e 5
dtype: int32
# Elementwise comparison produces a boolean Series usable as a mask.
ser_bool = ser_obj > 2
ser_bool
a False
b False
c True
d True
e True
dtype: bool
ser_obj[ ser_bool]
c 3
d 4
e 5
dtype: int32
DataFrame的索引操作
import numpy as np  # numpy is used here but was not imported in this section

# 3x4 DataFrame of 0..11 with lettered column labels.
arr = np.arange(12).reshape((3, 4))
df_obj = pd.DataFrame(arr, columns=['a', 'b', 'c', 'd'])
df_obj
df_obj[ 'c' ]
0 2
1 6
2 10
Name: c, dtype: int32
df_obj[ [ 'c' , 'a' ] ]   # select several columns at once, in the listed order
df_obj[ 0 : 1 ]   # a plain slice selects rows by position: rows [0, 1)
df_obj[ : 3 ] [ [ 'a' , 'c' ] ]   # chained selection: first rows, then columns
使用Pandas提供的方法操作索引
df_obj. loc[ 1 : 2 , [ 'b' , 'c' ] ]   # label-based: loc slices include the end label 2
df_obj. iloc[ 1 : 2 , 1 : 2 ]   # position-based: iloc slices exclude the end position
df_obj. iloc[ 1 : 2 , [ 1 , 2 ] ]   # positions given as an explicit list
算术运算与数据对齐
# Short Series (labels 0..2) used to demonstrate index alignment.
obj1 = pd.Series(range(1, 4), index=range(3))
obj1
0 1
1 2
2 3
dtype: int64
# Longer Series (labels 0..5); only labels shared with obj1 align in `+`.
obj2 = pd.Series(range(10, 16), index=range(6))
obj2
0 10
1 11
2 12
3 13
4 14
5 15
dtype: int64
obj1 + obj2
0 11.0
1 13.0
2 15.0
3 NaN
4 NaN
5 NaN
dtype: float64
obj1. add( obj2, fill_value= 0 )
0 11.0
1 13.0
2 15.0
3 13.0
4 14.0
5 15.0
dtype: float64
数据排序
按索引排序
# Series 7, 10, ..., 19 used for the sorting examples below.
ser_obj2 = pd.Series(range(7, 20, 3), index=range(5))
ser_obj2
0 7
1 10
2 13
3 16
4 19
dtype: int64
ser_obj2. sort_index( ascending= False )
4 19
3 16
2 13
1 10
0 7
dtype: int64
ser_obj2[ 5 ] = 7
ser_obj2
0 7
1 10
2 13
3 16
4 19
5 7
dtype: int64
按值排序
ser_obj2. sort_values( )
0 7
5 7
1 10
2 13
3 16
4 19
dtype: int64
ser_obj2
0 7
1 10
2 13
3 16
4 19
5 7
dtype: int64
ser_obj2. sort_values( inplace= True )
ser_obj2
0 7
5 7
1 10
2 13
3 16
4 19
dtype: int64
import numpy as np  # numpy is used here but was not imported in this section

# 3x4 DataFrame of 1..12 with default integer column labels.
data = np.arange(1, 13, 1).reshape(3, 4)
df_obj2 = pd.DataFrame(data)
df_obj2
# Sort the rows by the values in column 2, largest first.
df_obj2.sort_values(by=2, ascending=False)
统计计算与描述
常用的统计计算
df_obj2. sum ( )
0 15
1 18
2 21
3 24
dtype: int64
df_obj2. sum ( axis= 1 )
0 10
1 26
2 42
dtype: int64
df_obj2. head( 1 )
统计描述
df_obj2. describe( ) . loc[ [ 'std' ] , 0 : 3 ]
层次化索引
认识层次化索引
①在构造方法的index参数中上传一个嵌套列表
# Passing a nested list as the index builds a two-level MultiIndex:
# outer level a/b, inner level aa..bd.
multiindex_ser_obj = pd.Series(
    range(145875, 746431, 80456),
    index=[['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
           ['aa', 'ab', 'ac', 'ad', 'ba', 'bb', 'bc', 'bd']],
)
multiindex_ser_obj
a aa 145875
ab 226331
ac 306787
ad 387243
b ba 467699
bb 548155
bc 628611
bd 709067
dtype: int64
# The same nested-list trick works for a DataFrame's row index.
multiindex_df_obj = pd.DataFrame(
    range(145875, 746431, 80456),
    index=[['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
           ['aa', 'ab', 'ac', 'ad', 'ba', 'bb', 'bc', 'bd']],
    columns=['占地面积'],
)
multiindex_df_obj
占地面积 a aa 145875 ab 226331 ac 306787 ad 387243 b ba 467699 bb 548155 bc 628611 bd 709067
②通过MultiIndex类的方法构建一个层次化索引
import numpy as np  # numpy is needed for arange() but was not imported here
from pandas import MultiIndex

# Build a two-level index from (outer, inner) tuples.
list_tuples = [('A', 'A1'), ('A', 'A2'), ('A', 'A3'), ('A', 'A4'),
               ('A', 'A5'), ('B', 'B1'), ('B', 'B2'), ('B', 'B3')]
multiIndex = MultiIndex.from_tuples(tuples=list_tuples, names=['外层索引', '内层索引'])
data = np.arange(1, 41, 1).reshape(8, 5)
df_obj3 = pd.DataFrame(data, index=multiIndex)
df_obj3
0 1 2 3 4 外层索引 内层索引 A A1 1 2 3 4 5 A2 6 7 8 9 10 A3 11 12 13 14 15 A4 16 17 18 19 20 A5 21 22 23 24 25 B B1 26 27 28 29 30 B2 31 32 33 34 35 B3 36 37 38 39 40
# from_arrays: two parallel label arrays, one per level; rows keep the
# order given, so outer labels may repeat non-contiguously.
MIDX = np.array([['A', 'B', 'A', 'B', 'A', 'A', 'B', 'B'],
                 ['A1', 'B2', 'A2', 'B1', 'A4', 'A3', 'B3', 'B3']])
MIDX_obj = MultiIndex.from_arrays(arrays=MIDX, names=['外层索引', '内层索引'])
df_obj4 = pd.DataFrame(data, index=MIDX_obj)
df_obj4
0 1 2 3 4 外层索引 内层索引 A A1 1 2 3 4 5 B B2 6 7 8 9 10 A A2 11 12 13 14 15 B B1 16 17 18 19 20 A A4 21 22 23 24 25 A3 26 27 28 29 30 B B3 31 32 33 34 35 B3 36 37 38 39 40
nums = [0, 1, 2, 3]
colors = ['r', 'g']
# from_product is a classmethod -- call it on the MultiIndex class itself,
# not on the `multiIndex` instance created earlier (that works but is
# misleading: the instance's own labels play no part in the result).
MPROD = MultiIndex.from_product(iterables=[nums, colors], names=['nums', 'colors'])
df_obj5 = pd.DataFrame(data, index=MPROD)
df_obj5
0 1 2 3 4 nums colors 0 r 1 2 3 4 5 g 6 7 8 9 10 1 r 11 12 13 14 15 g 16 17 18 19 20 2 r 21 22 23 24 25 g 26 27 28 29 30 3 r 31 32 33 34 35 g 36 37 38 39 40
层次化索引的操作
①选取子集操作
# Category/title label pairs -> a two-level index for book sales figures.
maidx = np.array([['小说', '小说', '小说', '随笔散文', '随笔散文', '随笔散文', '传记', '传记', '传记'],
                  ['高山上的小邮局', '失踪的总统', '绿毛水怪', '皮囊', '浮生六记', '自在独行', '梅西', '老舍自传', '库里传']])
maidx_obj = MultiIndex.from_arrays(maidx, names=['书籍类别', '书籍名称'])
nums = np.array([50, 60, 40, 94, 63, 101, 200, 56, 45])
book_obj = pd.Series(nums, index=maidx_obj)
book_obj
书籍类别 书籍名称
小说 高山上的小邮局 50
失踪的总统 60
绿毛水怪 40
随笔散文 皮囊 94
浮生六记 63
自在独行 101
传记 梅西 200
老舍自传 56
库里传 45
dtype: int32
book_obj. loc[ [ '传记' ] , : ]
书籍类别 书籍名称
传记 梅西 200
老舍自传 56
库里传 45
dtype: int32
book_obj[ : , '自在独行' ]
书籍类别
随笔散文 101
dtype: int32
② 交换分层索引
book_obj. swaplevel( )
书籍名称 书籍类别
高山上的小邮局 小说 50
失踪的总统 小说 60
绿毛水怪 小说 40
皮囊 随笔散文 94
浮生六记 随笔散文 63
自在独行 随笔散文 101
梅西 传记 200
老舍自传 传记 56
库里传 传记 45
dtype: int32
③排序分层
读写数据操作
读写文本文件
① 通过to_csv()方法将数据写入到CSV文件
# Write a small frame to disk; to_csv returns None, the '写入完毕'
# string below is just a notebook echo.
df = pd.DataFrame({'one': [1, 2, 3], 'two': [4, 5, 6]})
df.to_csv(r'D:\python数据分析资料\csv文件读写.csv')
'写入完毕'
'写入完毕'
② 通过pd.read_csv()函数读取CSV文件的数据
# The original open() handle was never closed; a with-block closes it
# deterministically. (read_csv also accepts the path string directly.)
with open(r'D:\python数据分析资料\csv文件读写.csv') as file:
    file_data = pd.read_csv(file)
file_data
Unnamed: 0 one two 0 0 1 4 1 1 2 5 2 2 3 6
③ read_table()函数
读写Excel文件
① 使用to_excel()方法写入Excel文件
import numpy as np
import pandas as pd

# Write a 2x2 frame of characters to the sheet named '数据分析Pandas'.
df1 = pd.DataFrame({'col1': ['数', '据'], 'col2': ['分', '析']})
df1.to_excel(r'D:\python数据分析资料\数据分析.xlsx', '数据分析Pandas')
② 使用read_excel()函数读取Excel文件
# xlrd >= 2.0 can only read legacy .xls files, so forcing engine='xlrd'
# on an .xlsx raises the XLRDError shown below. Omit the engine argument
# and pandas selects openpyxl for .xlsx automatically.
data = pd.read_excel(r'D:\python数据分析资料\数据分析.xlsx')
data
---------------------------------------------------------------------------
XLRDError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_14516\723589026.py in <module>
----> 1 data = pd.read_excel(r'D:\python数据分析资料\数据分析.xlsx',engine='xlrd')
2 data
D:\anaconda\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
D:\anaconda\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, decimal, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
455 if not isinstance(io, ExcelFile):
456 should_close = True
--> 457 io = ExcelFile(io, storage_options=storage_options, engine=engine)
458 elif engine and engine != io.engine:
459 raise ValueError(
D:\anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path_or_buffer, engine, storage_options)
1417 self.storage_options = storage_options
1418
-> 1419 self._reader = self._engines[engine](self._io, storage_options=storage_options)
1420
1421 def __fspath__(self):
D:\anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in __init__(self, filepath_or_buffer, storage_options)
23 err_msg = "Install xlrd >= 1.0.0 for Excel support"
24 import_optional_dependency("xlrd", extra=err_msg)
---> 25 super().__init__(filepath_or_buffer, storage_options=storage_options)
26
27 @property
D:\anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, filepath_or_buffer, storage_options)
516 self.handles.handle.seek(0)
517 try:
--> 518 self.book = self.load_workbook(self.handles.handle)
519 except Exception:
520 self.close()
D:\anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in load_workbook(self, filepath_or_buffer)
36 if hasattr(filepath_or_buffer, "read"):
37 data = filepath_or_buffer.read()
---> 38 return open_workbook(file_contents=data)
39 else:
40 return open_workbook(filepath_or_buffer)
D:\anaconda\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows, ignore_workbook_corruption)
170 raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
171
--> 172 bk = open_workbook_xls(
173 filename=filename,
174 logfile=logfile,
D:\anaconda\lib\site-packages\xlrd\book.py in open_workbook_xls(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows, ignore_workbook_corruption)
77 t1 = perf_counter()
78 bk.load_time_stage_1 = t1 - t0
---> 79 biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
80 if not biff_version:
81 raise XLRDError("Can't determine file's BIFF version")
D:\anaconda\lib\site-packages\xlrd\book.py in getbof(self, rqd_stream)
1282 bof_error('Expected BOF record; met end of file')
1283 if opcode not in bofcodes:
-> 1284 bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
1285 length = self.get2bytes()
1286 if length == MY_EOF:
D:\anaconda\lib\site-packages\xlrd\book.py in bof_error(msg)
1276
1277 def bof_error(msg):
-> 1278 raise XLRDError('Unsupported format, or corrupt file: ' + msg)
1279 savpos = self._position
1280 opcode = self.get2bytes()
XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'c\x14#e\x08\x00\xf8\x01'
读取HTML表格数据
import pandas as pd
import requests

# Fetch the page ourselves, then let read_html parse every <table>
# element in the response body into its own DataFrame.
html_data = requests.get('https://www.runoob.com/html/html-tables.html')
html_table_data = pd.read_html(html_data.content, encoding='utf-8')
html_table_data[1]
标签 描述 0 <table> 定义表格 1 <th> 定义表格的表头 2 <tr> 定义表格的行 3 <td> 定义表格单元 4 <caption> 定义表格标题 5 <colgroup> 定义表格列的组 6 <col> 定义用于表格列的属性 7 <thead> 定义表格的页眉 8 <tbody> 定义表格的主体 9 <tfoot> 定义表格的页脚
读写数据库
注:连接Mysql数据库时,使用的是mysqlconnector驱动,如果当前环境中没有该模块,需使用pip install mysql-connector 命令安装该模块
① 使用read_sql()函数读取数据
from mysql import connector
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_14516\3642983536.py in <module>
----> 1 from mysql import connector
ModuleNotFoundError: No module named 'mysql'
pip install mysql-connector
^C
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
from sqlalchemy import create_engine

# `from mysql import connector` was imported twice here and is never used
# directly -- SQLAlchemy loads the mysqlconnector driver itself; the
# package only needs to be installed.
engine = create_engine('mysql+mysqlconnector://root:aa123456'
                       '@127.0.0.1:3306/sakila')
sql = 'SELECT * FROM actor WHERE actor_id<10;'
pd.read_sql(sql, engine)
actor_id first_name last_name last_update 0 1 PENELOPE GUINESS 2006-02-15 04:34:33 1 2 NICK WAHLBERG 2006-02-15 04:34:33 2 3 ED CHASE 2006-02-15 04:34:33 3 4 JENNIFER DAVIS 2006-02-15 04:34:33 4 5 JOHNNY LOLLOBRIGIDA 2006-02-15 04:34:33 5 6 BETTE NICHOLSON 2006-02-15 04:34:33 6 7 GRACE MOSTEL 2006-02-15 04:34:33 7 8 MATTHEW JOHANSSON 2006-02-15 04:34:33 8 9 JOE SWANK 2006-02-15 04:34:33
② 使用to_sql()方法将数据写入到数据库中
create database students_info charset= utf- 8
File "C:\Users\lenovo\AppData\Local\Temp\ipykernel_11032\3890836627.py", line 1
create database students_info charset=utf-8
^
SyntaxError: invalid syntax
import pandas as pd
from pandas import DataFrame
from sqlalchemy import create_engine

# Per-grade student counts to store in MySQL.
df = DataFrame({'班级': ['一年级', '二年级', '三年级', '四年级'],
                '男生人数': [25, 23, 27, 30],
                '女生人数': [19, 17, 20, 20]})
engine = create_engine('mysql+mysqlconnector://root:aa123456'
                       '@127.0.0.1:3306/data_analysis_test')
# if_exists='replace' prevents the "Table 'students' already exists"
# ValueError on rerun (the default is if_exists='fail', which raised below).
df.to_sql('students', engine, if_exists='replace')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_20440\1417353508.py in <module>
7 engine = create_engine('mysql+mysqlconnector://root:aa123456'
8 '@127.0.0.1:3306/data_analysis_test')
----> 9 df.to_sql('students',engine)
D:\anaconda\lib\site-packages\pandas\core\generic.py in to_sql(self, name, con, schema, if_exists, index, index_label, chunksize, dtype, method)
2949 from pandas.io import sql
2950
-> 2951 return sql.to_sql(
2952 self,
2953 name,
D:\anaconda\lib\site-packages\pandas\io\sql.py in to_sql(frame, name, con, schema, if_exists, index, index_label, chunksize, dtype, method, engine, **engine_kwargs)
696 )
697
--> 698 return pandas_sql.to_sql(
699 frame,
700 name,
D:\anaconda\lib\site-packages\pandas\io\sql.py in to_sql(self, frame, name, if_exists, index, index_label, schema, chunksize, dtype, method, engine, **engine_kwargs)
1730 sql_engine = get_engine(engine)
1731
-> 1732 table = self.prep_table(
1733 frame=frame,
1734 name=name,
D:\anaconda\lib\site-packages\pandas\io\sql.py in prep_table(self, frame, name, if_exists, index, index_label, schema, dtype)
1629 dtype=dtype,
1630 )
-> 1631 table.create()
1632 return table
1633
D:\anaconda\lib\site-packages\pandas\io\sql.py in create(self)
832 if self.exists():
833 if self.if_exists == "fail":
--> 834 raise ValueError(f"Table '{self.name}' already exists.")
835 elif self.if_exists == "replace":
836 self.pd_sql.drop_table(self.name, self.schema)
ValueError: Table 'students' already exists.
案例
import pandas as pd
import requests

# Scrape the admission-score page and keep columns 0-4 of the first table.
html_data = requests.get('https://gaokao.xdf.cn/201805/10784243.html')
html_table_data = pd.read_html(html_data.content, encoding='utf-8')
score = html_table_data[0].loc[:, 0:4]
score
0 1 2 3 4 0 年份 一本分数线 一本分数线 二本分数线 二本分数线 1 年份 文科 理科 文科 理科 2 2020 542 537 469 449 3 2019 542 507 481 432 4 2018 546 516 476 432 5 2017 518 481 452 400 6 2016 518 519 460 438 7 2015 513 515 462 442 8 2014 526 534 478 462 9 2013 507 493 459 440 10 2012 539 530 492 476 11 2011 543 570 496 520 12 2010 533 536 492 491 13 2009 548 547 507 505 14 2008 545 546 510 511 15 2007 580 572 553 439 16 2006 561 552 528 515
# Use a raw string for the Windows path so backslashes can never be
# interpreted as escape sequences (the value itself is unchanged).
score.to_csv(r'D:\python数据分析资料\score.csv')
'写入完成'
'写入完成'
# Raw string for the Windows path; header=[0, 1, 2] rebuilds the
# three header rows as a column MultiIndex.
df_obj = pd.read_csv(r'D:\python数据分析资料\score.csv', header=[0, 1, 2])
df_obj
Unnamed: 0_level_0 0 1 2 3 4 0 年份 一本分数线 一本分数线 二本分数线 二本分数线 1 年份 文科 理科 文科 理科 0 2 2020 542 537 469 449 1 3 2019 542 507 481 432 2 4 2018 546 516 476 432 3 5 2017 518 481 452 400 4 6 2016 518 519 460 438 5 7 2015 513 515 462 442 6 8 2014 526 534 478 462 7 9 2013 507 493 459 440 8 10 2012 539 530 492 476 9 11 2011 543 570 496 520 10 12 2010 533 536 492 491 11 13 2009 548 547 507 505 12 14 2008 545 546 510 511 13 15 2007 580 572 553 439 14 16 2006 561 552 528 515
import pandas as pd
from sqlalchemy import create_engine

# (The wildcard `from sqlalchemy.types import *` and the unused
# `from mysql import connector` were dropped; SQLAlchemy loads the
# mysqlconnector driver on its own.)
engine = create_engine('mysql+mysqlconnector://root:aa123456'
                       '@127.0.0.1:3306/data_analysis_test')
# if_exists='replace' makes the write idempotent across reruns instead
# of failing once the table exists.
df_obj.to_sql('score', engine, if_exists='replace')
'写入完成'
'写入完成'
# Read the whole `score` table back; max() reports per-column maxima.
sorted_obj = pd.read_sql('score', engine)
sorted_obj.max()
index None
('Unnamed: 0_level_0', '0', '1') 16
('0', '年份', '年份') 2020
('1', '一本分数线', '文科') 580
('2', '一本分数线', '理科') 572
('3', '二本分数线', '文科') 553
('4', '二本分数线', '理科') 520
dtype: object
sorted_obj. describe( )
('Unnamed: 0_level_0', '0', '1') ('0', '年份', '年份') ('1', '一本分数线', '文科') ('2', '一本分数线', '理科') ('3', '二本分数线', '文科') ('4', '二本分数线', '理科') count 15.000000 15.000000 15.000000 15.000000 15.000000 15.000000 mean 9.000000 2013.000000 537.400000 530.333333 487.666667 463.466667 std 4.472136 4.472136 19.111702 25.819889 28.050125 37.061468 min 2.000000 2006.000000 507.000000 481.000000 452.000000 400.000000 25% 5.500000 2009.500000 522.000000 515.500000 465.500000 438.500000 50% 9.000000 2013.000000 542.000000 534.000000 481.000000 449.000000 75% 12.500000 2016.500000 545.500000 546.500000 501.500000 498.000000 max 16.000000 2020.000000 580.000000 572.000000 553.000000 520.000000