import numpy as np
import pandas as pd
from pandas import Series, DataFrame
加载文本数据
pd.read_csv() pd.read_table() # deprecated在当前版本过时 pd.read_json()
pd. read_csv( 'data/adults.txt' )
age workclass final_weight education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country salary 0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K 1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K 2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K 3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K 4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 32556 27 Private 257302 Assoc-acdm 12 Married-civ-spouse Tech-support Wife White Female 0 0 38 United-States <=50K 32557 40 Private 154374 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 United-States >50K 32558 58 Private 151910 HS-grad 9 Widowed Adm-clerical Unmarried White Female 0 0 40 United-States <=50K 32559 22 Private 201490 HS-grad 9 Never-married Adm-clerical Own-child White Male 0 0 20 United-States <=50K 32560 52 Self-emp-inc 287927 HS-grad 9 Married-civ-spouse Exec-managerial Wife White Female 15024 0 40 United-States >50K
32561 rows × 15 columns
pd. read_csv( 'data/seeds.tsv' )
15.26\t14.84\t0.871\t5.763\t3.312\t2.221\t5.22\tKama 0 14.88\t14.57\t0.8811\t5.554\t3.333\t1.018\t4.9... 1 14.29\t14.09\t0.905\t5.291\t3.337\t2.699\t4.82... 2 13.84\t13.94\t0.8955\t5.324\t3.379\t2.259\t4.8... 3 16.14\t14.99\t0.9034\t5.658\t3.562\t1.355\t5.1... 4 14.38\t14.21\t0.8951\t5.386\t3.312\t2.462\t4.9... ... ... 204 12.19\t13.2\t0.8783\t5.137\t2.981\t3.631\t4.87... 205 11.23\t12.88\t0.8511\t5.14\t2.795\t4.325\t5.00... 206 13.2\t13.66\t0.8883\t5.236\t3.232\t8.315\t5.05... 207 11.84\t13.21\t0.8521\t5.175\t2.836\t3.598\t5.0... 208 12.3\t13.34\t0.8684\t5.243\t2.974\t5.637\t5.06...
209 rows × 1 columns
pd. read_csv( 'data/seeds.tsv' , sep= '\t' , header= None )
0 1 2 3 4 5 6 7 0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 Kama 1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 Kama 2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 Kama 3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 Kama 4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 Kama ... ... ... ... ... ... ... ... ... 205 12.19 13.20 0.8783 5.137 2.981 3.631 4.870 Canadian 206 11.23 12.88 0.8511 5.140 2.795 4.325 5.003 Canadian 207 13.20 13.66 0.8883 5.236 3.232 8.315 5.056 Canadian 208 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 Canadian 209 12.30 13.34 0.8684 5.243 2.974 5.637 5.063 Canadian
210 rows × 8 columns
pd. read_table( 'data/seeds.tsv' , header= None )
0 1 2 3 4 5 6 7 0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 Kama 1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 Kama 2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 Kama 3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 Kama 4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 Kama ... ... ... ... ... ... ... ... ... 205 12.19 13.20 0.8783 5.137 2.981 3.631 4.870 Canadian 206 11.23 12.88 0.8511 5.140 2.795 4.325 5.003 Canadian 207 13.20 13.66 0.8883 5.236 3.232 8.315 5.056 Canadian 208 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 Canadian 209 12.30 13.34 0.8684 5.243 2.974 5.637 5.063 Canadian
210 rows × 8 columns
pd. read_csv( 'data/usa_election.txt' , low_memory= False )
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-13-ce248bd0bf3d> in <module>
----> 1 pd.read_csv('data/usa_election.txt',low_memory=False)
NameError: name 'pd' is not defined
pd. read_csv( 'data/seeds.tsv' , sep= '\t' , header= None ) . to_json( 'data/seeds.json' )
pd. read_json( 'data/seeds.json' )
0 1 2 3 4 5 6 7 0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 Kama 1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 Kama 2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 Kama 3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 Kama 4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 Kama ... ... ... ... ... ... ... ... ... 205 12.19 13.20 0.8783 5.137 2.981 3.631 4.870 Canadian 206 11.23 12.88 0.8511 5.140 2.795 4.325 5.003 Canadian 207 13.20 13.66 0.8883 5.236 3.232 8.315 5.056 Canadian 208 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 Canadian 209 12.30 13.34 0.8684 5.243 2.974 5.637 5.063 Canadian
210 rows × 8 columns
pd. read_json( 'data/seeds.json' ) . reset_index( )
index 0 1 2 3 4 5 6 7 0 0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 Kama 1 1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 Kama 2 2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 Kama 3 3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 Kama 4 4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 Kama ... ... ... ... ... ... ... ... ... ... 205 205 12.19 13.20 0.8783 5.137 2.981 3.631 4.870 Canadian 206 206 11.23 12.88 0.8511 5.140 2.795 4.325 5.003 Canadian 207 207 13.20 13.66 0.8883 5.236 3.232 8.315 5.056 Canadian 208 208 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 Canadian 209 209 12.30 13.34 0.8684 5.243 2.974 5.637 5.063 Canadian
210 rows × 9 columns
pd. read_json( 'data/seeds.json' ) . reset_index( ) . sort_values( 'index' ) . set_index( 'index' )
0 1 2 3 4 5 6 7 index 0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 Kama 1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 Kama 2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 Kama 3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 Kama 4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 Kama ... ... ... ... ... ... ... ... ... 205 12.19 13.20 0.8783 5.137 2.981 3.631 4.870 Canadian 206 11.23 12.88 0.8511 5.140 2.795 4.325 5.003 Canadian 207 13.20 13.66 0.8883 5.236 3.232 8.315 5.056 Canadian 208 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 Canadian 209 12.30 13.34 0.8684 5.243 2.974 5.637 5.063 Canadian
210 rows × 8 columns
加载excel的数据
pd. read_excel( 'data/students.xlsx' )
id name age sex height(cm) weight(kg) 0 1 吴果 20 男 165 50 1 2 李平 25 女 158 45 2 3 王和 21 男 179 75 3 4 刘言 27 男 180 80 4 5 王平 20 女 164 82 5 6 宁国柱 19 男 178 85 6 7 刘佳敏 25 女 183 90 7 8 张先 30 女 165 55 8 9 刘四七 27 女 170 61 9 10 关小弟 29 男 190 94
pd. read_excel( 'data/students.xlsx' , index_col= 'id' )
name age sex height(cm) weight(kg) id 1 吴果 20 男 165 50 2 李平 25 女 158 45 3 王和 21 男 179 75 4 刘言 27 男 180 80 5 王平 20 女 164 82 6 宁国柱 19 男 178 85 7 刘佳敏 25 女 183 90 8 张先 30 女 165 55 9 刘四七 27 女 170 61 10 关小弟 29 男 190 94
加载网络数据
pd.read_csv()/table()/json()都可以加载网络资源,主要看数据的结构
在data文件夹下,新建data_IO文件夹,添加数据 按windows+R,输入cmd,回车,执行下面代码: 启动后,打开页面,可以看到,数据可以加载:
然后在jupyter notebook里面继续操作:
df = pd. read_csv( 'http://127.0.0.1:8000/testSet.txt' )
df
-0.017612\t14.053064\t0 0 -1.395634\t4.662541\t1 1 -0.752157\t6.538620\t0 2 -1.322371\t7.152853\t0 3 0.423363\t11.054677\t0 4 0.406704\t7.067335\t1 ... ... 94 0.677983\t2.556666\t1 95 0.761349\t10.693862\t0 96 -2.168791\t0.143632\t1 97 1.388610\t9.341997\t0 98 0.317029\t14.739025\t0
99 rows × 1 columns
数据库加载
加载sqlite3数据库
import sqlite3
conn = sqlite3. connect( 'data/person.db' )
cursor = conn. cursor( )
cursor. execute( 'create table tb_person(id integer primary key,name,age,sex,phone)' )
<sqlite3.Cursor at 0x5cb0020>
help ( cursor. execute)
Help on built-in function execute:
execute(...) method of sqlite3.Cursor instance
Executes a SQL statement.
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Jack',18,'男','12123423434')" )
<sqlite3.Cursor at 0x5cb0020>
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Blus',18,'女','12123423544')" )
<sqlite3.Cursor at 0x5cb0020>
cursor. execute( 'select * from tb_person' )
list ( cursor. fetchall( ) )
[(1, 'Jack', 18, '男', '12123423434'), (2, 'Blus', 18, '女', '12123423544')]
conn. close( )
conn = sqlite3. connect( 'data/person.db' )
pd. read_sql( 'select * from tb_person' , conn)
cursor = conn. cursor( )
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Jack',18,'男','12123423434')" )
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Blus',18,'女','12123423544')" )
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Blua',18,'女','12123423544')" )
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Blux',18,'女','12123423544')" )
conn. commit( )
pd. read_sql( 'select * from tb_person' , conn)
id name age sex phone 0 1 Jack 18 男 12123423434 1 2 Blus 18 女 12123423544 2 3 Blua 18 女 12123423544 3 4 Blux 18 女 12123423544
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Blan',22,'女','2123423784')" )
cursor. execute( "insert into tb_person(name,age,sex,phone) values ('Blro',21,'女','2125453784')" )
conn. commit( )
pd. read_sql( 'select * from tb_person' , conn)
id name age sex phone 0 1 Jack 18 男 12123423434 1 2 Blus 18 女 12123423544 2 3 Blua 18 女 12123423544 3 4 Blux 18 女 12123423544 4 5 Blan 22 女 2123423784 5 6 Blro 21 女 2125453784
pd. read_sql( "select * from tb_person where phone like '12%'" , conn)
id name age sex phone 0 1 Jack 18 男 12123423434 1 2 Blus 18 女 12123423544 2 3 Blua 18 女 12123423544 3 4 Blux 18 女 12123423544
df
-0.017612\t14.053064\t0 0 -1.395634\t4.662541\t1 1 -0.752157\t6.538620\t0 2 -1.322371\t7.152853\t0 3 0.423363\t11.054677\t0 4 0.406704\t7.067335\t1 ... ... 94 0.677983\t2.556666\t1 95 0.761349\t10.693862\t0 96 -2.168791\t0.143632\t1 97 1.388610\t9.341997\t0 98 0.317029\t14.739025\t0
99 rows × 1 columns
df. to_sql( 'test_set' , conn)
pd. read_sql( 'select * from test_set' , conn)
index -0.017612\t14.053064\t0 0 0 -1.395634\t4.662541\t1 1 1 -0.752157\t6.538620\t0 2 2 -1.322371\t7.152853\t0 3 3 0.423363\t11.054677\t0 4 4 0.406704\t7.067335\t1 ... ... ... 94 94 0.677983\t2.556666\t1 95 95 0.761349\t10.693862\t0 96 96 -2.168791\t0.143632\t1 97 97 1.388610\t9.341997\t0 98 98 0.317029\t14.739025\t0
99 rows × 2 columns
pd. read_sql( 'select * from test_set' , conn, index_col= 'index' )
-0.017612\t14.053064\t0 index 0 -1.395634\t4.662541\t1 1 -0.752157\t6.538620\t0 2 -1.322371\t7.152853\t0 3 0.423363\t11.054677\t0 4 0.406704\t7.067335\t1 ... ... 94 0.677983\t2.556666\t1 95 0.761349\t10.693862\t0 96 -2.168791\t0.143632\t1 97 1.388610\t9.341997\t0 98 0.317029\t14.739025\t0
99 rows × 1 columns