示例:仅有数据列表即可产生最简单的Series
>>> import pandas as pd
>>> import numpy as np
>>> s1 = pd.Series([1,'a',5.2,"x"])
>>> s1
# 左侧为索引,右侧是数据
0 1
1 a
2 5.2
3 x
dtype: object
>>> dir(s1)
#注:很多属性
#注:获取索引
>>> s1.index
RangeIndex(start=0, stop=4, step=1)
#注:获取数据
>>> s1.values
array([1, 'a', 5.2, 'x'], dtype=object)
示例:创建一个具有标签索引的Series
>>> s2 = pd.Series([1, 'a', 5.2, 'x'], index=["d","b","c","aaa"])
>>> s2
d 1
b a
c 5.2
aaa x
dtype: object
#注:Series里面有不同输出格式,对象就是object
>>> s2.index
Index(['d', 'b', 'c', 'aaa'], dtype='object')
示例:使用Python字典创建Series
>>> sdata = {
"a":2000,"b":1500,"c":3000,"d":1200}
>>> s3 = pd.Series(sdata)
>>> s3
a 2000
b 1500
c 3000
d 1200
dtype: int64
>>> s3["a"]
2000
示例:根据标签索引查询数据
#示例:获取值
>>> s2
d 1
b a
c 5.2
aaa x
dtype: object
>>> type(s2["c"])
<class 'float'>
>>> type(s2["d"])
<class 'int'>
>>> type(s2["b"])
<class 'str'>
>>> s2[["c","b"]] #注:获取多个值
c 5.2
b a
dtype: object
示例:根据多个字典序列创建dataframe
>>> data = {
... 'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
... 'year':[2000,2001,2002,2001,2002],
... 'pop':[1.5,1.7,3.6,2.4,2.9]
... }
>>> df = pd.DataFrame(data)
>>> df
state year pop #注:state year pop列索引; 0 1 2 3 4行索引
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
>>> df.dtypes
state object
year int64
pop float64
dtype: object #注:有str,都是object
>>> df.columns #注:列索引
Index(['state', 'year', 'pop'], dtype='object')
>>> df.index #注:行索引
RangeIndex(start=0, stop=5, step=1)
>>> df["year"] #注:提取某列
0 2000
1 2001
2 2002
3 2001
4 2002
Name: year, dtype: int64
>>> type(df["year"]) #注:列类型 series
<class 'pandas.core.series.Series'>
>>> df[["year","state"]] #注:提取多列
year state
0 2000 Ohio
1 2001 Ohio
2 2002 Ohio
3 2001 Nevada
4 2002 Nevada
示例:有str,是object;整型、浮点型,全部强制转成float
>>> s4 = pd.Series([1,2,3,4,5.5])
>>> s4
0 1.0
1 2.0
2 3.0
3 4.0
4 5.5
dtype: float64
>>> s4 = pd.Series([1,2,3,4,5.5,"aa"])
>>> s4
0 1
1 2
2 3
3 4
4 5.5
5 aa
dtype: object
#注:csv文件:轻量级表格形式,默认逗号分割。第1列数据 标题头
示例:根据从文件、mysql中读取创建dataframe
>>> import pandas as pd
>>> df = pd.read_csv("./read_test.csv") #注:读取csv文件
>>> df
date prov isp pv uv
0 2020-04-26 hunan cmnet 2000 1000
1 2020-04-26 hunan cmnet 3000 1500
2 2020-04-26 hunan cmcc 4000 1000
3 2020-04-26 hubei ctc 2500 1000
4 2020-04-26 hubei cmcc 2000 1000
5 2020-04-26 hubei ctc 2100 1600
6 2020-04-27 hunan cmnet 4000 1700
7 2020-04-27 hunan cmnet 3200 1500
8 2020-04-27 hunan cmcc 2800 1600
9 2020-04-27 hubei ctc 2600 1400
10 2020-04-27 hubei cmcc 3800 1900
11 2020-04-27 hubei ctc 2400 1900
>>> df2 = pd.read_excel("./互联网公司股票.xlsx") #注:读取Excel文件
>>> df2
日期 公司 收盘 开盘 高 低 交易量 涨跌幅
0 2019-10-03 BIDU 104.32 102.35 104.73 101.15 2.24 0.02
1 2019-10-02 BIDU 102.62 100.85 103.24 99.50 2.69 0.01
2 2019-10-01 BIDU 102.00 102.80 103.26 101.00 1.78 -0.01
3 2019-10-03 BABA 169.48 166.65 170.18 165.00 10.39 0.02
4 2019-10-02 BABA 165.77 162.82 166.88 161.90 11.60 0.00
5 2019-10-01 BABA 165.15 168.01 168.23 163.64 14.19 -0.01
6 2019-10-03 IQ 16.06 15.71 16.38 15.32 10.08 0.02
7 2019-10-02 IQ 15.72 15.85 15.87 15.12 8.10 -0.01
8 2019-10-01 IQ 15.92 16.14 16.22 15.50 11.65 -0.01
9 2019-10-03 JD 28.80 28.11 28.97 27.82 8.77 0.03
10 2019-10-02 JD 28.06 28.00 28.22 27.53 9.53 0.00
11 2019-10-01 JD 28.19 28.22 28.57 27.97 10.64 0.00
示例:从网页中拉取table数据
>>> import pandas as pd
>>> text_html = pd.read_html('http://ranking.promisingedu.com/qs')[0]
>>> text_html
Ranking University English Name Country/Region ... Citations per Faculty Overall Score Free
0 1 Massachusetts Institute of Technology (MIT) United States ... 99.1 100.0 免费评估
1 2 Stanford University United States ... 98.1 98.4 免费评估
2 3 Harvard University United States ... 99.1 97.9 免费评估
3 4 California Institute of Technology (Caltech) United States ... 99.9 97.0 免费评估
4 5 牛津大学University of Oxford United Kingdom ... 81.3 96.7 免费评估
.. ... ... ... ... ... ... ...
496 493 UNESP Brazil ... 16.2 24.2 免费评估
497 498 Universidad de La Habana Cuba ... 2.4 24.1 免费评估
498 499 Oregon State University United States ... 35.0 23.9 免费评估
499 499 Universidad de Alcalá Spain ... 10.6 23.9 免费评估
500 499 University of Eastern Finland Finland ... 33.5 23.9 免费评估
[501 rows x 11 columns]
示例:从数据库拉取
[root@cPen_A ~]# pip3 install pymysql
#注:mysql环境下
root@(none) 11:14 mysql>create database pd;
root@(none) 11:19 mysql>show databases;
root@(none) 11:20 mysql>use pd;
root@pd 11:20 mysql>create table Student(
-> id int not null,
-> name varchar(64) not null,
-> sex varchar(64),
-> age int);
root@pd 11:22 mysql>show tables;
+--------------+
| Tables_in_pd |
+--------------+
| Student |
+--------------+
1 row in set (0.00 sec)
root@pd 11:23 mysql>desc Student;
+-------+-------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-------+-------------+------+-----+---------+-------+
| id | int(11) | NO | | NULL | |
| name | varchar(64) | NO | | NULL | |
| sex | varchar(64) | YES | | NULL | |
| age | int(11) | YES | | NULL | |
+-------+-------------+------+-----+---------+-------+
4 rows in set (0.18 sec)
root@pd 11:23 mysql>insert into Student values(1,'cp','m','18');
root@pd 11:24 mysql>insert into Student values(2,'cp2','m','18');
root@pd 11:25 mysql>select * from Student;
+----+------+------+------+
| id | name | sex | age |
+----+------+------+------+
| 1 | cp | m | 18 |
| 2 | cp2 | m | 18 |
+----+------+------+------+
2 rows in set (0.00 sec)
#注:Python3环境下
>>> import pandas as pd
>>> import pymysql
>>> conn = pymysql.connect(
... host = "127.0.0.1",
... user = "root",
... passwd = "Sanchuang123#",
... database = "pd",
... charset = "utf8")
>>> mysql_df = pd.read_sql("select * from Student", con=conn)
>>> mysql_df
id name sex age
0 1 cp m 18
1 2 cp2 m 18
>>> mysql_df.values
array([[1, 'cp', 'm', 18], #注:返回二维数组形式
[2, 'cp2', 'm', 18]], dtype=object)
示例:二维数组形式返回
>>> import pandas as pd
>>> df = pd.read_csv("read_test.csv")
>>> df.values #注:二维数组形式
array([['2020-04-26', 'hunan', 'cmnet', 2000, 1000],
['2020-04-26', 'hunan', 'cmnet', 3000, 1500],
['2020-04-26', 'hunan', 'cmcc', 4000, 1000],
['2020-04-26', 'hubei', 'ctc', 2500, 1000],
['2020-04-26', 'hubei', 'cmcc', 2000, 1000],
['2020-04-26', 'hubei', 'ctc', 2100, 1600],
['2020-04-27', 'hunan', 'cmnet', 4000, 1700],
['2020-04-27', 'hunan', 'cmnet', 3200, 1500],
['2020-04-27', 'hunan', 'cmcc', 2800, 1600],
['2020-04-27', 'hubei', 'ctc', 2600, 1400],
['2020-04-27', 'hubei', 'cmcc', 3800, 1900],
['2020-04-27', 'hubei', 'ctc', 2400, 1900]], dtype=object)
示例:返回行列数
>>> df.shape #注:返回行列数
(12, 5)
示例:返回行索引
>>> df.index #注:返回行索引
RangeIndex(start=0, stop=12, step=1)
示例:指定索引列
>>> df.set_index('date') #注:指定索引列
prov isp pv uv
date
2020-04-26 hunan cmnet 2000 1000
2020-04-26 hunan cmnet 3000 1500
2020-04-26 hunan cmcc 4000 1000
2020-04-26 hubei ctc 2500 1000
2020-04-26 hubei cmcc 2000 1000
2020-04-26 hubei ctc 2100 1600
2020-04-27 hunan cmnet 4000 1700
2020-04-27 hunan cmnet 3200 1500
2020-04-27 hunan cmcc 2800 1600
2020-04-27 hubei ctc 2600 1400
2020-04-27 hubei cmcc 3800 1900
2020-04-27 hubei ctc 2400 1900
示例:重置索引
>>> df2.reset_index() #注:重置索引
date prov isp pv uv
0 2020-04-26 hunan cmnet 2000 1000
1 2020-04-26 hunan cmnet 3000 1500
2 2020-04-26 hunan cmcc 4000 1000
3 2020-04-26 hubei ctc 2500 1000
4 2020-04-26 hubei cmcc 2000 1000
5 2020-04-26 hubei ctc 2100 1600
6 2020-04-27 hunan cmnet 4000 1700
7 2020-04-27 hunan cmnet 3200 1500
8 2020-04-27 hunan cmcc 2800 1600
9 2020-04-27 hubei ctc 2600 1400
10 2020-04-27 hubei cmcc 3800 1900
11 2020-04-27 hubei ctc 2400 1900
示例:drop 丢弃老索引,默认false
>>> df2.reset_index(drop=True)